git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q15.c

/* ----------------------------------------------------------------------
* Copyright (C) 2010 ARM Limited. All rights reserved.
*
* $Date:        15. July 2011
* $Revision:    V1.0.10
*
* Project:          CMSIS DSP Library
* Title:                arm_correlate_q15.c
*
* Description:  Correlation of Q15 sequences.
*
* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
*
* Version 1.0.10 2011/7/15
*    Big Endian support added and Merged M0 and M3/M4 Source code.
*
* Version 1.0.3 2010/11/29
*    Re-organized the CMSIS folders and updated documentation.
*
* Version 1.0.2 2010/11/11
*    Documentation updated.
*
* Version 1.0.1 2010/10/05
*    Production release and review comments incorporated.
*
* Version 1.0.0 2010/09/20
*    Production release and review comments incorporated
*
* Version 0.0.7  2010/06/10
*    Misra-C changes done
*
* -------------------------------------------------------------------- */

#include "arm_math.h"

/**
 * @ingroup groupFilters
 */

/**
 * @addtogroup Corr
 * @{
 */

  45 /**
  46  * @brief Correlation of Q15 sequences.
  47  * @param[in] *pSrcA points to the first input sequence.
  48  * @param[in] srcALen length of the first input sequence.
  49  * @param[in] *pSrcB points to the second input sequence.
  50  * @param[in] srcBLen length of the second input sequence.
  51  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
  52  * @return none.
  53  *
  54  * @details
  55  * <b>Scaling and Overflow Behavior:</b>
  56  *
  57  * \par
  58  * The function is implemented using a 64-bit internal accumulator.
  59  * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
  60  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
  61  * This approach provides 33 guard bits and there is no risk of overflow.
  62  * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
  63  *
  64  * \par
  65  * Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
  66  */
  67
  68 void arm_correlate_q15(
  69   q15_t * pSrcA,
  70   uint32_t srcALen,
  71   q15_t * pSrcB,
  72   uint32_t srcBLen,
  73   q15_t * pDst)
  74 {
  75
  76 #ifndef ARM_MATH_CM0
  77
  78   /* Run the below code for Cortex-M4 and Cortex-M3 */
  79
  80   q15_t *pIn1;                                   /* inputA pointer               */
  81   q15_t *pIn2;                                   /* inputB pointer               */
  82   q15_t *pOut = pDst;                            /* output pointer               */
  83   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
  84   q15_t *px;                                     /* Intermediate inputA pointer  */
  85   q15_t *py;                                     /* Intermediate inputB pointer  */
  86   q15_t *pSrc1;                                  /* Intermediate pointers        */
  87   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
  88   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
  89   int32_t inc = 1;                               /* Destination address modifier */
  90   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */
  91
  92
  93   /* The algorithm implementation is based on the lengths of the inputs. */
  94   /* srcB is always made to slide across srcA. */
  95   /* So srcBLen is always considered as shorter or equal to srcALen */
  96   /* But CORR(x, y) is reverse of CORR(y, x) */
  97   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
  98   /* and the destination pointer modifier, inc is set to -1 */
  99   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
 100   /* But to improve the performance,
 101    * we include zeroes in the output instead of zero padding either of the the inputs*/
 102   /* If srcALen > srcBLen,
 103    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
 104   /* If srcALen < srcBLen,
 105    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
 106   if(srcALen >= srcBLen)
 107   {
 108     /* Initialization of inputA pointer */
 109     pIn1 = (pSrcA);
 110
 111     /* Initialization of inputB pointer */
 112     pIn2 = (pSrcB);
 113
 114     /* Number of output samples is calculated */
 115     outBlockSize = (2u * srcALen) - 1u;
 116
 117     /* When srcALen > srcBLen, zero padding is done to srcB
 118      * to make their lengths equal.
 119      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
 120      * number of output samples are made zero */
 121     j = outBlockSize - (srcALen + (srcBLen - 1u));
 122
 123     /* Updating the pointer position to non zero value */
 124     pOut += j;
 125
 126   }
 127   else
 128   {
 129     /* Initialization of inputA pointer */
 130     pIn1 = (pSrcB);
 131
 132     /* Initialization of inputB pointer */
 133     pIn2 = (pSrcA);
 134
 135     /* srcBLen is always considered as shorter or equal to srcALen */
 136     j = srcBLen;
 137     srcBLen = srcALen;
 138     srcALen = j;
 139
 140     /* CORR(x, y) = Reverse order(CORR(y, x)) */
 141     /* Hence set the destination pointer to point to the last output sample */
 142     pOut = pDst + ((srcALen + srcBLen) - 2u);
 143
 144     /* Destination address modifier is set to -1 */
 145     inc = -1;
 146
 147   }
 148
 149   /* The function is internally
 150    * divided into three parts according to the number of multiplications that has to be
 151    * taken place between inputA samples and inputB samples. In the first part of the
 152    * algorithm, the multiplications increase by one for every iteration.
 153    * In the second part of the algorithm, srcBLen number of multiplications are done.
 154    * In the third part of the algorithm, the multiplications decrease by one
 155    * for every iteration.*/
 156   /* The algorithm is implemented in three stages.
 157    * The loop counters of each stage is initiated here. */
 158   blockSize1 = srcBLen - 1u;
 159   blockSize2 = srcALen - (srcBLen - 1u);
 160   blockSize3 = blockSize1;
 161
 162   /* --------------------------
 163    * Initializations of stage1
 164    * -------------------------*/
 165
 166   /* sum = x[0] * y[srcBlen - 1]
 167    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
 168    * ....
 169    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
 170    */
 171
 172   /* In this stage the MAC operations are increased by 1 for every iteration.
 173      The count variable holds the number of MAC operations performed */
 174   count = 1u;
 175
 176   /* Working pointer of inputA */
 177   px = pIn1;
 178
 179   /* Working pointer of inputB */
 180   pSrc1 = pIn2 + (srcBLen - 1u);
 181   py = pSrc1;
 182
 183   /* ------------------------
 184    * Stage1 process
 185    * ----------------------*/
 186
 187   /* The first loop starts here */
 188   while(blockSize1 > 0u)
 189   {
 190     /* Accumulator is made zero for every iteration */
 191     sum = 0;
 192
 193     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 194     k = count >> 2;
 195
 196     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 197      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 198     while(k > 0u)
 199     {
 200       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
 201       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 202       /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
 203       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 204
 205       /* Decrement the loop counter */
 206       k--;
 207     }
 208
 209     /* If the count is not a multiple of 4, compute any remaining MACs here.
 210      ** No loop unrolling is used. */
 211     k = count % 0x4u;
 212
 213     while(k > 0u)
 214     {
 215       /* Perform the multiply-accumulates */
 216       /* x[0] * y[srcBLen - 1] */
 217       sum = __SMLALD(*px++, *py++, sum);
 218
 219       /* Decrement the loop counter */
 220       k--;
 221     }
 222
 223     /* Store the result in the accumulator in the destination buffer. */
 224     *pOut = (q15_t) (__SSAT((sum >> 15), 16));
 225     /* Destination pointer is updated according to the address modifier, inc */
 226     pOut += inc;
 227
 228     /* Update the inputA and inputB pointers for next MAC calculation */
 229     py = pSrc1 - count;
 230     px = pIn1;
 231
 232     /* Increment the MAC count */
 233     count++;
 234
 235     /* Decrement the loop counter */
 236     blockSize1--;
 237   }
 238
 239   /* --------------------------
 240    * Initializations of stage2
 241    * ------------------------*/
 242
 243   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
 244    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
 245    * ....
 246    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 247    */
 248
 249   /* Working pointer of inputA */
 250   px = pIn1;
 251
 252   /* Working pointer of inputB */
 253   py = pIn2;
 254
 255   /* Initialize inputB pointer of type q31 */
 256   pb = (q31_t *) (py);
 257
 258   /* count is index by which the pointer pIn1 to be incremented */
 259   count = 0u;
 260
 261   /* -------------------
 262    * Stage2 process
 263    * ------------------*/
 264
 265   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 266    * So, to loop unroll over blockSize2,
 267    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
 268   if(srcBLen >= 4u)
 269   {
 270     /* Loop unroll over blockSize2, by 4 */
 271     blkCnt = blockSize2 >> 2u;
 272
 273     while(blkCnt > 0u)
 274     {
 275       /* Set all accumulators to zero */
 276       acc0 = 0;
 277       acc1 = 0;
 278       acc2 = 0;
 279       acc3 = 0;
 280
 281       /* read x[0], x[1] samples */
 282       x0 = *(q31_t *) (px++);
 283       /* read x[1], x[2] samples */
 284       x1 = *(q31_t *) (px++);
 285
 286       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 287       k = srcBLen >> 2u;
 288
 289       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 290        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 291       do
 292       {
 293         /* Read the first two inputB samples using SIMD:
 294          * y[0] and y[1] */
 295         c0 = *(pb++);
 296
 297         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */
 298         acc0 = __SMLALD(x0, c0, acc0);
 299
 300         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */
 301         acc1 = __SMLALD(x1, c0, acc1);
 302
 303         /* Read x[2], x[3] */
 304         x2 = *(q31_t *) (px++);
 305
 306         /* Read x[3], x[4] */
 307         x3 = *(q31_t *) (px++);
 308
 309         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */
 310         acc2 = __SMLALD(x2, c0, acc2);
 311
 312         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */
 313         acc3 = __SMLALD(x3, c0, acc3);
 314
 315         /* Read y[2] and y[3] */
 316         c0 = *(pb++);
 317
 318         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */
 319         acc0 = __SMLALD(x2, c0, acc0);
 320
 321         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */
 322         acc1 = __SMLALD(x3, c0, acc1);
 323
 324         /* Read x[4], x[5] */
 325         x0 = *(q31_t *) (px++);
 326
 327         /* Read x[5], x[6] */
 328         x1 = *(q31_t *) (px++);
 329
 330         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */
 331         acc2 = __SMLALD(x0, c0, acc2);
 332
 333         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */
 334         acc3 = __SMLALD(x1, c0, acc3);
 335
 336       } while(--k);
 337
 338       /* For the next MAC operations, SIMD is not used
 339        * So, the 16 bit pointer if inputB, py is updated */
 340       py = (q15_t *) (pb);
 341
 342       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 343        ** No loop unrolling is used. */
 344       k = srcBLen % 0x4u;
 345
 346       if(k == 1u)
 347       {
 348         /* Read y[4] */
 349         c0 = *py;
 350 #ifdef  ARM_MATH_BIG_ENDIAN
 351
 352         c0 = c0 << 16u;
 353
 354 #else
 355
 356         c0 = c0 & 0x0000FFFF;
 357
 358 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 359         /* Read x[7] */
 360         x3 = *(q31_t *) px++;
 361
 362         /* Perform the multiply-accumulates */
 363         acc0 = __SMLALD(x0, c0, acc0);
 364         acc1 = __SMLALD(x1, c0, acc1);
 365         acc2 = __SMLALDX(x1, c0, acc2);
 366         acc3 = __SMLALDX(x3, c0, acc3);
 367       }
 368
 369       if(k == 2u)
 370       {
 371         /* Read y[4], y[5] */
 372         c0 = *(pb);
 373
 374         /* Read x[7], x[8] */
 375         x3 = *(q31_t *) px++;
 376
 377         /* Read x[9] */
 378         x2 = *(q31_t *) px++;
 379
 380         /* Perform the multiply-accumulates */
 381         acc0 = __SMLALD(x0, c0, acc0);
 382         acc1 = __SMLALD(x1, c0, acc1);
 383         acc2 = __SMLALD(x3, c0, acc2);
 384         acc3 = __SMLALD(x2, c0, acc3);
 385       }
 386
 387       if(k == 3u)
 388       {
 389         /* Read y[4], y[5] */
 390         c0 = *pb++;
 391
 392         /* Read x[7], x[8] */
 393         x3 = *(q31_t *) px++;
 394
 395         /* Read x[9] */
 396         x2 = *(q31_t *) px++;
 397
 398         /* Perform the multiply-accumulates */
 399         acc0 = __SMLALD(x0, c0, acc0);
 400         acc1 = __SMLALD(x1, c0, acc1);
 401         acc2 = __SMLALD(x3, c0, acc2);
 402         acc3 = __SMLALD(x2, c0, acc3);
 403
 404         /* Read y[6] */
 405 #ifdef  ARM_MATH_BIG_ENDIAN
 406
 407         c0 = (*pb);
 408         c0 = c0 & 0xFFFF0000;
 409
 410 #else
 411
 412         c0 = (q15_t) (*pb);
 413         c0 = c0 & 0x0000FFFF;
 414
 415 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 416         /* Read x[10] */
 417         x3 = *(q31_t *) px++;
 418
 419         /* Perform the multiply-accumulates */
 420         acc0 = __SMLALDX(x1, c0, acc0);
 421         acc1 = __SMLALD(x2, c0, acc1);
 422         acc2 = __SMLALDX(x2, c0, acc2);
 423         acc3 = __SMLALDX(x3, c0, acc3);
 424       }
 425
 426       /* Store the result in the accumulator in the destination buffer. */
 427       *pOut = (q15_t) (__SSAT(acc0 >> 15, 16));
 428       /* Destination pointer is updated according to the address modifier, inc */
 429       pOut += inc;
 430
 431       *pOut = (q15_t) (__SSAT(acc1 >> 15, 16));
 432       pOut += inc;
 433
 434       *pOut = (q15_t) (__SSAT(acc2 >> 15, 16));
 435       pOut += inc;
 436
 437       *pOut = (q15_t) (__SSAT(acc3 >> 15, 16));
 438       pOut += inc;
 439
 440       /* Increment the count by 4 as 4 output values are computed */
 441       count += 4u;
 442
 443       /* Update the inputA and inputB pointers for next MAC calculation */
 444       px = pIn1 + count;
 445       py = pIn2;
 446       pb = (q31_t *) (py);
 447
 448
 449       /* Decrement the loop counter */
 450       blkCnt--;
 451     }
 452
 453     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 454      ** No loop unrolling is used. */
 455     blkCnt = blockSize2 % 0x4u;
 456
 457     while(blkCnt > 0u)
 458     {
 459       /* Accumulator is made zero for every iteration */
 460       sum = 0;
 461
 462       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 463       k = srcBLen >> 2u;
 464
 465       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 466        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 467       while(k > 0u)
 468       {
 469         /* Perform the multiply-accumulates */
 470         sum += ((q63_t) * px++ * *py++);
 471         sum += ((q63_t) * px++ * *py++);
 472         sum += ((q63_t) * px++ * *py++);
 473         sum += ((q63_t) * px++ * *py++);
 474
 475         /* Decrement the loop counter */
 476         k--;
 477       }
 478
 479       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 480        ** No loop unrolling is used. */
 481       k = srcBLen % 0x4u;
 482
 483       while(k > 0u)
 484       {
 485         /* Perform the multiply-accumulates */
 486         sum += ((q63_t) * px++ * *py++);
 487
 488         /* Decrement the loop counter */
 489         k--;
 490       }
 491
 492       /* Store the result in the accumulator in the destination buffer. */
 493       *pOut = (q15_t) (__SSAT(sum >> 15, 16));
 494       /* Destination pointer is updated according to the address modifier, inc */
 495       pOut += inc;
 496
 497       /* Increment count by 1, as one output value is computed */
 498       count++;
 499
 500       /* Update the inputA and inputB pointers for next MAC calculation */
 501       px = pIn1 + count;
 502       py = pIn2;
 503
 504       /* Decrement the loop counter */
 505       blkCnt--;
 506     }
 507   }
 508   else
 509   {
 510     /* If the srcBLen is not a multiple of 4,
 511      * the blockSize2 loop cannot be unrolled by 4 */
 512     blkCnt = blockSize2;
 513
 514     while(blkCnt > 0u)
 515     {
 516       /* Accumulator is made zero for every iteration */
 517       sum = 0;
 518
 519       /* Loop over srcBLen */
 520       k = srcBLen;
 521
 522       while(k > 0u)
 523       {
 524         /* Perform the multiply-accumulate */
 525         sum += ((q63_t) * px++ * *py++);
 526
 527         /* Decrement the loop counter */
 528         k--;
 529       }
 530
 531       /* Store the result in the accumulator in the destination buffer. */
 532       *pOut = (q15_t) (__SSAT(sum >> 15, 16));
 533       /* Destination pointer is updated according to the address modifier, inc */
 534       pOut += inc;
 535
 536       /* Increment the MAC count */
 537       count++;
 538
 539       /* Update the inputA and inputB pointers for next MAC calculation */
 540       px = pIn1 + count;
 541       py = pIn2;
 542
 543       /* Decrement the loop counter */
 544       blkCnt--;
 545     }
 546   }
 547
 548   /* --------------------------
 549    * Initializations of stage3
 550    * -------------------------*/
 551
 552   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 553    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 554    * ....
 555    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
 556    * sum +=  x[srcALen-1] * y[0]
 557    */
 558
 559   /* In this stage the MAC operations are decreased by 1 for every iteration.
 560      The count variable holds the number of MAC operations performed */
 561   count = srcBLen - 1u;
 562
 563   /* Working pointer of inputA */
 564   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 565   px = pSrc1;
 566
 567   /* Working pointer of inputB */
 568   py = pIn2;
 569
 570   /* -------------------
 571    * Stage3 process
 572    * ------------------*/
 573
 574   while(blockSize3 > 0u)
 575   {
 576     /* Accumulator is made zero for every iteration */
 577     sum = 0;
 578
 579     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 580     k = count >> 2u;
 581
 582     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 583      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 584     while(k > 0u)
 585     {
 586       /* Perform the multiply-accumulates */
 587       /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
 588       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 589       /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
 590       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 591
 592       /* Decrement the loop counter */
 593       k--;
 594     }
 595
 596     /* If the count is not a multiple of 4, compute any remaining MACs here.
 597      ** No loop unrolling is used. */
 598     k = count % 0x4u;
 599
 600     while(k > 0u)
 601     {
 602       /* Perform the multiply-accumulates */
 603       sum = __SMLALD(*px++, *py++, sum);
 604
 605       /* Decrement the loop counter */
 606       k--;
 607     }
 608
 609     /* Store the result in the accumulator in the destination buffer. */
 610     *pOut = (q15_t) (__SSAT((sum >> 15), 16));
 611     /* Destination pointer is updated according to the address modifier, inc */
 612     pOut += inc;
 613
 614     /* Update the inputA and inputB pointers for next MAC calculation */
 615     px = ++pSrc1;
 616     py = pIn2;
 617
 618     /* Decrement the MAC count */
 619     count--;
 620
 621     /* Decrement the loop counter */
 622     blockSize3--;
 623   }
 624
 625 #else
 626
 627 /* Run the below code for Cortex-M0 */
 628
 629   q15_t *pIn1 = pSrcA;                           /* inputA pointer               */
 630   q15_t *pIn2 = pSrcB + (srcBLen - 1u);          /* inputB pointer               */
 631   q63_t sum;                                     /* Accumulators                  */
 632   uint32_t i = 0u, j;                            /* loop counters */
 633   uint32_t inv = 0u;                             /* Reverse order flag */
 634   uint32_t tot = 0u;                             /* Length */
 635
 636   /* The algorithm implementation is based on the lengths of the inputs. */
 637   /* srcB is always made to slide across srcA. */
 638   /* So srcBLen is always considered as shorter or equal to srcALen */
 639   /* But CORR(x, y) is reverse of CORR(y, x) */
 640   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 641   /* and a varaible, inv is set to 1 */
 642   /* If lengths are not equal then zero pad has to be done to  make the two
 643    * inputs of same length. But to improve the performance, we include zeroes
 644    * in the output instead of zero padding either of the the inputs*/
 645   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
 646    * starting of the output buffer */
 647   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
 648    * ending of the output buffer */
 649   /* Once the zero padding is done the remaining of the output is calcualted
 650    * using convolution but with the shorter signal time shifted. */
 651
 652   /* Calculate the length of the remaining sequence */
 653   tot = ((srcALen + srcBLen) - 2u);
 654
 655   if(srcALen > srcBLen)
 656   {
 657     /* Calculating the number of zeros to be padded to the output */
 658     j = srcALen - srcBLen;
 659
 660     /* Initialise the pointer after zero padding */
 661     pDst += j;
 662   }
 663
 664   else if(srcALen < srcBLen)
 665   {
 666     /* Initialization to inputB pointer */
 667     pIn1 = pSrcB;
 668
 669     /* Initialization to the end of inputA pointer */
 670     pIn2 = pSrcA + (srcALen - 1u);
 671
 672     /* Initialisation of the pointer after zero padding */
 673     pDst = pDst + tot;
 674
 675     /* Swapping the lengths */
 676     j = srcALen;
 677     srcALen = srcBLen;
 678     srcBLen = j;
 679
 680     /* Setting the reverse flag */
 681     inv = 1;
 682
 683   }
 684
 685   /* Loop to calculate convolution for output length number of times */
 686   for (i = 0u; i <= tot; i++)
 687   {
 688     /* Initialize sum with zero to carry on MAC operations */
 689     sum = 0;
 690
 691     /* Loop to perform MAC operations according to convolution equation */
 692     for (j = 0u; j <= i; j++)
 693     {
 694       /* Check the array limitations */
 695       if((((i - j) < srcBLen) && (j < srcALen)))
 696       {
 697         /* z[i] += x[i-j] * y[j] */
 698         sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
 699       }
 700     }
 701     /* Store the output in the destination buffer */
 702     if(inv == 1)
 703       *pDst-- = (q15_t) __SSAT((sum >> 15u), 16u);
 704     else
 705       *pDst++ = (q15_t) __SSAT((sum >> 15u), 16u);
 706   }
 707
 708 #endif /*   #ifndef ARM_MATH_CM0 */
 709
 710 }
 711
/**
 * @} end of Corr group
 */