git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_correlate_q31.c
   9 *
  10 * Description:  Correlation of Q31 sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated
  28 *
  29 * Version 0.0.7  2010/06/10
  30 *    Misra-C changes done
  31 *
  32 * -------------------------------------------------------------------- */
  33
  34 #include "arm_math.h"
  35
  36 /**
  37  * @ingroup groupFilters
  38  */
  39
  40 /**
  41  * @addtogroup Corr
  42  * @{
  43  */
  44
  45 /**
  46  * @brief Correlation of Q31 sequences.
  47  * @param[in] *pSrcA points to the first input sequence.
  48  * @param[in] srcALen length of the first input sequence.
  49  * @param[in] *pSrcB points to the second input sequence.
  50  * @param[in] srcBLen length of the second input sequence.
  51  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
  52  * @return none.
  53  *
  54  * @details
  55  * <b>Scaling and Overflow Behavior:</b>
  56  *
  57  * \par
  58  * The function is implemented using an internal 64-bit accumulator.
  59  * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
  60  * There is no saturation on intermediate additions.
  61  * Thus, if the accumulator overflows it wraps around and distorts the result.
  62  * The input signals should be scaled down to avoid intermediate overflows.
  63  * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a
  64  * maximum of min(srcALen, srcBLen) number of additions is carried internally.
  65  * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
  66  *
  67  * \par
  68  * See <code>arm_correlate_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
  69  */
  70
  71 void arm_correlate_q31(
  72   q31_t * pSrcA,
  73   uint32_t srcALen,
  74   q31_t * pSrcB,
  75   uint32_t srcBLen,
  76   q31_t * pDst)
  77 {
  78
  79 #ifndef ARM_MATH_CM0
  80
  81   /* Run the below code for Cortex-M4 and Cortex-M3 */
  82
  83   q31_t *pIn1;                                   /* inputA pointer               */
  84   q31_t *pIn2;                                   /* inputB pointer               */
  85   q31_t *pOut = pDst;                            /* output pointer               */
  86   q31_t *px;                                     /* Intermediate inputA pointer  */
  87   q31_t *py;                                     /* Intermediate inputB pointer  */
  88   q31_t *pSrc1;                                  /* Intermediate pointers        */
  89   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
  90   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
  91   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
  92   int32_t inc = 1;                               /* Destination address modifier */
  93
  94
  95   /* The algorithm implementation is based on the lengths of the inputs. */
  96   /* srcB is always made to slide across srcA. */
  97   /* So srcBLen is always considered as shorter or equal to srcALen */
  98   /* But CORR(x, y) is reverse of CORR(y, x) */
  99   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 100   /* and the destination pointer modifier, inc is set to -1 */
 101   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
 102   /* But to improve the performance,
 103    * we include zeroes in the output instead of zero padding either of the the inputs*/
 104   /* If srcALen > srcBLen,
 105    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
 106   /* If srcALen < srcBLen,
 107    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
 108   if(srcALen >= srcBLen)
 109   {
 110     /* Initialization of inputA pointer */
 111     pIn1 = (pSrcA);
 112
 113     /* Initialization of inputB pointer */
 114     pIn2 = (pSrcB);
 115
 116     /* Number of output samples is calculated */
 117     outBlockSize = (2u * srcALen) - 1u;
 118
 119     /* When srcALen > srcBLen, zero padding is done to srcB
 120      * to make their lengths equal.
 121      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
 122      * number of output samples are made zero */
 123     j = outBlockSize - (srcALen + (srcBLen - 1u));
 124
 125     /* Updating the pointer position to non zero value */
 126     pOut += j;
 127
 128   }
 129   else
 130   {
 131     /* Initialization of inputA pointer */
 132     pIn1 = (pSrcB);
 133
 134     /* Initialization of inputB pointer */
 135     pIn2 = (pSrcA);
 136
 137     /* srcBLen is always considered as shorter or equal to srcALen */
 138     j = srcBLen;
 139     srcBLen = srcALen;
 140     srcALen = j;
 141
 142     /* CORR(x, y) = Reverse order(CORR(y, x)) */
 143     /* Hence set the destination pointer to point to the last output sample */
 144     pOut = pDst + ((srcALen + srcBLen) - 2u);
 145
 146     /* Destination address modifier is set to -1 */
 147     inc = -1;
 148
 149   }
 150
 151   /* The function is internally
 152    * divided into three parts according to the number of multiplications that has to be
 153    * taken place between inputA samples and inputB samples. In the first part of the
 154    * algorithm, the multiplications increase by one for every iteration.
 155    * In the second part of the algorithm, srcBLen number of multiplications are done.
 156    * In the third part of the algorithm, the multiplications decrease by one
 157    * for every iteration.*/
 158   /* The algorithm is implemented in three stages.
 159    * The loop counters of each stage is initiated here. */
 160   blockSize1 = srcBLen - 1u;
 161   blockSize2 = srcALen - (srcBLen - 1u);
 162   blockSize3 = blockSize1;
 163
 164   /* --------------------------
 165    * Initializations of stage1
 166    * -------------------------*/
 167
 168   /* sum = x[0] * y[srcBlen - 1]
 169    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
 170    * ....
 171    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
 172    */
 173
 174   /* In this stage the MAC operations are increased by 1 for every iteration.
 175      The count variable holds the number of MAC operations performed */
 176   count = 1u;
 177
 178   /* Working pointer of inputA */
 179   px = pIn1;
 180
 181   /* Working pointer of inputB */
 182   pSrc1 = pIn2 + (srcBLen - 1u);
 183   py = pSrc1;
 184
 185   /* ------------------------
 186    * Stage1 process
 187    * ----------------------*/
 188
 189   /* The first stage starts here */
 190   while(blockSize1 > 0u)
 191   {
 192     /* Accumulator is made zero for every iteration */
 193     sum = 0;
 194
 195     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 196     k = count >> 2;
 197
 198     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 199      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 200     while(k > 0u)
 201     {
 202       /* x[0] * y[srcBLen - 4] */
 203       sum += (q63_t) * px++ * (*py++);
 204       /* x[1] * y[srcBLen - 3] */
 205       sum += (q63_t) * px++ * (*py++);
 206       /* x[2] * y[srcBLen - 2] */
 207       sum += (q63_t) * px++ * (*py++);
 208       /* x[3] * y[srcBLen - 1] */
 209       sum += (q63_t) * px++ * (*py++);
 210
 211       /* Decrement the loop counter */
 212       k--;
 213     }
 214
 215     /* If the count is not a multiple of 4, compute any remaining MACs here.
 216      ** No loop unrolling is used. */
 217     k = count % 0x4u;
 218
 219     while(k > 0u)
 220     {
 221       /* Perform the multiply-accumulates */
 222       /* x[0] * y[srcBLen - 1] */
 223       sum += (q63_t) * px++ * (*py++);
 224
 225       /* Decrement the loop counter */
 226       k--;
 227     }
 228
 229     /* Store the result in the accumulator in the destination buffer. */
 230     *pOut = (q31_t) (sum >> 31);
 231     /* Destination pointer is updated according to the address modifier, inc */
 232     pOut += inc;
 233
 234     /* Update the inputA and inputB pointers for next MAC calculation */
 235     py = pSrc1 - count;
 236     px = pIn1;
 237
 238     /* Increment the MAC count */
 239     count++;
 240
 241     /* Decrement the loop counter */
 242     blockSize1--;
 243   }
 244
 245   /* --------------------------
 246    * Initializations of stage2
 247    * ------------------------*/
 248
 249   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
 250    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
 251    * ....
 252    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 253    */
 254
 255   /* Working pointer of inputA */
 256   px = pIn1;
 257
 258   /* Working pointer of inputB */
 259   py = pIn2;
 260
 261   /* count is index by which the pointer pIn1 to be incremented */
 262   count = 1u;
 263
 264   /* -------------------
 265    * Stage2 process
 266    * ------------------*/
 267
 268   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 269    * So, to loop unroll over blockSize2,
 270    * srcBLen should be greater than or equal to 4 */
 271   if(srcBLen >= 4u)
 272   {
 273     /* Loop unroll over blockSize2, by 4 */
 274     blkCnt = blockSize2 >> 2u;
 275
 276     while(blkCnt > 0u)
 277     {
 278       /* Set all accumulators to zero */
 279       acc0 = 0;
 280       acc1 = 0;
 281       acc2 = 0;
 282       acc3 = 0;
 283
 284       /* read x[0], x[1], x[2] samples */
 285       x0 = *(px++);
 286       x1 = *(px++);
 287       x2 = *(px++);
 288
 289       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 290       k = srcBLen >> 2u;
 291
 292       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 293        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 294       do
 295       {
 296         /* Read y[0] sample */
 297         c0 = *(py++);
 298
 299         /* Read x[3] sample */
 300         x3 = *(px++);
 301
 302         /* Perform the multiply-accumulate */
 303         /* acc0 +=  x[0] * y[0] */
 304         acc0 += ((q63_t) x0 * c0);
 305         /* acc1 +=  x[1] * y[0] */
 306         acc1 += ((q63_t) x1 * c0);
 307         /* acc2 +=  x[2] * y[0] */
 308         acc2 += ((q63_t) x2 * c0);
 309         /* acc3 +=  x[3] * y[0] */
 310         acc3 += ((q63_t) x3 * c0);
 311
 312         /* Read y[1] sample */
 313         c0 = *(py++);
 314
 315         /* Read x[4] sample */
 316         x0 = *(px++);
 317
 318         /* Perform the multiply-accumulates */
 319         /* acc0 +=  x[1] * y[1] */
 320         acc0 += ((q63_t) x1 * c0);
 321         /* acc1 +=  x[2] * y[1] */
 322         acc1 += ((q63_t) x2 * c0);
 323         /* acc2 +=  x[3] * y[1] */
 324         acc2 += ((q63_t) x3 * c0);
 325         /* acc3 +=  x[4] * y[1] */
 326         acc3 += ((q63_t) x0 * c0);
 327         /* Read y[2] sample */
 328         c0 = *(py++);
 329
 330         /* Read x[5] sample */
 331         x1 = *(px++);
 332
 333         /* Perform the multiply-accumulates */
 334         /* acc0 +=  x[2] * y[2] */
 335         acc0 += ((q63_t) x2 * c0);
 336         /* acc1 +=  x[3] * y[2] */
 337         acc1 += ((q63_t) x3 * c0);
 338         /* acc2 +=  x[4] * y[2] */
 339         acc2 += ((q63_t) x0 * c0);
 340         /* acc3 +=  x[5] * y[2] */
 341         acc3 += ((q63_t) x1 * c0);
 342
 343         /* Read y[3] sample */
 344         c0 = *(py++);
 345
 346         /* Read x[6] sample */
 347         x2 = *(px++);
 348
 349         /* Perform the multiply-accumulates */
 350         /* acc0 +=  x[3] * y[3] */
 351         acc0 += ((q63_t) x3 * c0);
 352         /* acc1 +=  x[4] * y[3] */
 353         acc1 += ((q63_t) x0 * c0);
 354         /* acc2 +=  x[5] * y[3] */
 355         acc2 += ((q63_t) x1 * c0);
 356         /* acc3 +=  x[6] * y[3] */
 357         acc3 += ((q63_t) x2 * c0);
 358
 359
 360       } while(--k);
 361
 362       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 363        ** No loop unrolling is used. */
 364       k = srcBLen % 0x4u;
 365
 366       while(k > 0u)
 367       {
 368         /* Read y[4] sample */
 369         c0 = *(py++);
 370
 371         /* Read x[7] sample */
 372         x3 = *(px++);
 373
 374         /* Perform the multiply-accumulates */
 375         /* acc0 +=  x[4] * y[4] */
 376         acc0 += ((q63_t) x0 * c0);
 377         /* acc1 +=  x[5] * y[4] */
 378         acc1 += ((q63_t) x1 * c0);
 379         /* acc2 +=  x[6] * y[4] */
 380         acc2 += ((q63_t) x2 * c0);
 381         /* acc3 +=  x[7] * y[4] */
 382         acc3 += ((q63_t) x3 * c0);
 383
 384         /* Reuse the present samples for the next MAC */
 385         x0 = x1;
 386         x1 = x2;
 387         x2 = x3;
 388
 389         /* Decrement the loop counter */
 390         k--;
 391       }
 392
 393       /* Store the result in the accumulator in the destination buffer. */
 394       *pOut = (q31_t) (acc0 >> 31);
 395       /* Destination pointer is updated according to the address modifier, inc */
 396       pOut += inc;
 397
 398       *pOut = (q31_t) (acc1 >> 31);
 399       pOut += inc;
 400
 401       *pOut = (q31_t) (acc2 >> 31);
 402       pOut += inc;
 403
 404       *pOut = (q31_t) (acc3 >> 31);
 405       pOut += inc;
 406
 407       /* Update the inputA and inputB pointers for next MAC calculation */
 408       px = pIn1 + (count * 4u);
 409       py = pIn2;
 410
 411       /* Increment the pointer pIn1 index, count by 1 */
 412       count++;
 413
 414       /* Decrement the loop counter */
 415       blkCnt--;
 416     }
 417
 418     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 419      ** No loop unrolling is used. */
 420     blkCnt = blockSize2 % 0x4u;
 421
 422     while(blkCnt > 0u)
 423     {
 424       /* Accumulator is made zero for every iteration */
 425       sum = 0;
 426
 427       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 428       k = srcBLen >> 2u;
 429
 430       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 431        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 432       while(k > 0u)
 433       {
 434         /* Perform the multiply-accumulates */
 435         sum += (q63_t) * px++ * (*py++);
 436         sum += (q63_t) * px++ * (*py++);
 437         sum += (q63_t) * px++ * (*py++);
 438         sum += (q63_t) * px++ * (*py++);
 439
 440         /* Decrement the loop counter */
 441         k--;
 442       }
 443
 444       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 445        ** No loop unrolling is used. */
 446       k = srcBLen % 0x4u;
 447
 448       while(k > 0u)
 449       {
 450         /* Perform the multiply-accumulate */
 451         sum += (q63_t) * px++ * (*py++);
 452
 453         /* Decrement the loop counter */
 454         k--;
 455       }
 456
 457       /* Store the result in the accumulator in the destination buffer. */
 458       *pOut = (q31_t) (sum >> 31);
 459       /* Destination pointer is updated according to the address modifier, inc */
 460       pOut += inc;
 461
 462       /* Update the inputA and inputB pointers for next MAC calculation */
 463       px = pIn1 + count;
 464       py = pIn2;
 465
 466       /* Increment the MAC count */
 467       count++;
 468
 469       /* Decrement the loop counter */
 470       blkCnt--;
 471     }
 472   }
 473   else
 474   {
 475     /* If the srcBLen is not a multiple of 4,
 476      * the blockSize2 loop cannot be unrolled by 4 */
 477     blkCnt = blockSize2;
 478
 479     while(blkCnt > 0u)
 480     {
 481       /* Accumulator is made zero for every iteration */
 482       sum = 0;
 483
 484       /* Loop over srcBLen */
 485       k = srcBLen;
 486
 487       while(k > 0u)
 488       {
 489         /* Perform the multiply-accumulate */
 490         sum += (q63_t) * px++ * (*py++);
 491
 492         /* Decrement the loop counter */
 493         k--;
 494       }
 495
 496       /* Store the result in the accumulator in the destination buffer. */
 497       *pOut = (q31_t) (sum >> 31);
 498       /* Destination pointer is updated according to the address modifier, inc */
 499       pOut += inc;
 500
 501       /* Update the inputA and inputB pointers for next MAC calculation */
 502       px = pIn1 + count;
 503       py = pIn2;
 504
 505       /* Increment the MAC count */
 506       count++;
 507
 508       /* Decrement the loop counter */
 509       blkCnt--;
 510     }
 511   }
 512
 513   /* --------------------------
 514    * Initializations of stage3
 515    * -------------------------*/
 516
 517   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 518    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 519    * ....
 520    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
 521    * sum +=  x[srcALen-1] * y[0]
 522    */
 523
 524   /* In this stage the MAC operations are decreased by 1 for every iteration.
 525      The count variable holds the number of MAC operations performed */
 526   count = srcBLen - 1u;
 527
 528   /* Working pointer of inputA */
 529   pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
 530   px = pSrc1;
 531
 532   /* Working pointer of inputB */
 533   py = pIn2;
 534
 535   /* -------------------
 536    * Stage3 process
 537    * ------------------*/
 538
 539   while(blockSize3 > 0u)
 540   {
 541     /* Accumulator is made zero for every iteration */
 542     sum = 0;
 543
 544     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 545     k = count >> 2u;
 546
 547     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 548      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 549     while(k > 0u)
 550     {
 551       /* Perform the multiply-accumulates */
 552       /* sum += x[srcALen - srcBLen + 4] * y[3] */
 553       sum += (q63_t) * px++ * (*py++);
 554       /* sum += x[srcALen - srcBLen + 3] * y[2] */
 555       sum += (q63_t) * px++ * (*py++);
 556       /* sum += x[srcALen - srcBLen + 2] * y[1] */
 557       sum += (q63_t) * px++ * (*py++);
 558       /* sum += x[srcALen - srcBLen + 1] * y[0] */
 559       sum += (q63_t) * px++ * (*py++);
 560
 561       /* Decrement the loop counter */
 562       k--;
 563     }
 564
 565     /* If the count is not a multiple of 4, compute any remaining MACs here.
 566      ** No loop unrolling is used. */
 567     k = count % 0x4u;
 568
 569     while(k > 0u)
 570     {
 571       /* Perform the multiply-accumulates */
 572       sum += (q63_t) * px++ * (*py++);
 573
 574       /* Decrement the loop counter */
 575       k--;
 576     }
 577
 578     /* Store the result in the accumulator in the destination buffer. */
 579     *pOut = (q31_t) (sum >> 31);
 580     /* Destination pointer is updated according to the address modifier, inc */
 581     pOut += inc;
 582
 583     /* Update the inputA and inputB pointers for next MAC calculation */
 584     px = ++pSrc1;
 585     py = pIn2;
 586
 587     /* Decrement the MAC count */
 588     count--;
 589
 590     /* Decrement the loop counter */
 591     blockSize3--;
 592   }
 593
 594 #else
 595
 596   /* Run the below code for Cortex-M0 */
 597
 598   q31_t *pIn1 = pSrcA;                           /* inputA pointer               */
 599   q31_t *pIn2 = pSrcB + (srcBLen - 1u);          /* inputB pointer               */
 600   q63_t sum;                                     /* Accumulators                  */
 601   uint32_t i = 0u, j;                            /* loop counters */
 602   uint32_t inv = 0u;                             /* Reverse order flag */
 603   uint32_t tot = 0u;                             /* Length */
 604
 605   /* The algorithm implementation is based on the lengths of the inputs. */
 606   /* srcB is always made to slide across srcA. */
 607   /* So srcBLen is always considered as shorter or equal to srcALen */
 608   /* But CORR(x, y) is reverse of CORR(y, x) */
 609   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 610   /* and a varaible, inv is set to 1 */
 611   /* If lengths are not equal then zero pad has to be done to  make the two
 612    * inputs of same length. But to improve the performance, we include zeroes
 613    * in the output instead of zero padding either of the the inputs*/
 614   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
 615    * starting of the output buffer */
 616   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
 617    * ending of the output buffer */
 618   /* Once the zero padding is done the remaining of the output is calcualted
 619    * using convolution but with the shorter signal time shifted. */
 620
 621   /* Calculate the length of the remaining sequence */
 622   tot = ((srcALen + srcBLen) - 2u);
 623
 624   if(srcALen > srcBLen)
 625   {
 626     /* Calculating the number of zeros to be padded to the output */
 627     j = srcALen - srcBLen;
 628
 629     /* Initialise the pointer after zero padding */
 630     pDst += j;
 631   }
 632
 633   else if(srcALen < srcBLen)
 634   {
 635     /* Initialization to inputB pointer */
 636     pIn1 = pSrcB;
 637
 638     /* Initialization to the end of inputA pointer */
 639     pIn2 = pSrcA + (srcALen - 1u);
 640
 641     /* Initialisation of the pointer after zero padding */
 642     pDst = pDst + tot;
 643
 644     /* Swapping the lengths */
 645     j = srcALen;
 646     srcALen = srcBLen;
 647     srcBLen = j;
 648
 649     /* Setting the reverse flag */
 650     inv = 1;
 651
 652   }
 653
 654   /* Loop to calculate convolution for output length number of times */
 655   for (i = 0u; i <= tot; i++)
 656   {
 657     /* Initialize sum with zero to carry on MAC operations */
 658     sum = 0;
 659
 660     /* Loop to perform MAC operations according to convolution equation */
 661     for (j = 0u; j <= i; j++)
 662     {
 663       /* Check the array limitations */
 664       if((((i - j) < srcBLen) && (j < srcALen)))
 665       {
 666         /* z[i] += x[i-j] * y[j] */
 667         sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
 668       }
 669     }
 670     /* Store the output in the destination buffer */
 671     if(inv == 1)
 672       *pDst-- = (q31_t) (sum >> 31u);
 673     else
 674       *pDst++ = (q31_t) (sum >> 31u);
 675   }
 676
 677 #endif /*   #ifndef ARM_MATH_CM0 */
 678
 679 }
 680
 681 /**
 682  * @} end of Corr group
 683  */