git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_fast_q15.c
   9 *
  10 * Description:  Fast Q15 Convolution.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated.
  28 * -------------------------------------------------------------------- */
  29
  30 #include "arm_math.h"
  31
  32 /**
  33  * @ingroup groupFilters
  34  */
  35
  36 /**
  37  * @addtogroup Conv
  38  * @{
  39  */
  40
  41 /**
  42  * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
  43  * @param[in] *pSrcA points to the first input sequence.
  44  * @param[in] srcALen length of the first input sequence.
  45  * @param[in] *pSrcB points to the second input sequence.
  46  * @param[in] srcBLen length of the second input sequence.
  47  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
  48  * @return none.
  49  *
  50  * <b>Scaling and Overflow Behavior:</b>
  51  *
  52  * \par
  53  * This fast version uses a 32-bit accumulator with 2.30 format.
  54  * The accumulator maintains full precision of the intermediate multiplication results
  55  * but provides only a single guard bit. There is no saturation on intermediate additions.
  56  * Thus, if the accumulator overflows it wraps around and distorts the result.
  57  * The input signals should be scaled down to avoid intermediate overflows.
  58  * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
  59  * as maximum of min(srcALen, srcBLen) number of additions are carried internally.
  60  * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
  61  *
  62  * \par
  63  * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
  64  */
  65
  66 void arm_conv_fast_q15(
  67   q15_t * pSrcA,
  68   uint32_t srcALen,
  69   q15_t * pSrcB,
  70   uint32_t srcBLen,
  71   q15_t * pDst)
  72 {
  73   q15_t *pIn1;                                   /* inputA pointer */
  74   q15_t *pIn2;                                   /* inputB pointer */
  75   q15_t *pOut = pDst;                            /* output pointer */
  76   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
  77   q15_t *px;                                     /* Intermediate inputA pointer  */
  78   q15_t *py;                                     /* Intermediate inputB pointer  */
  79   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
  80   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
  81   uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* loop counter */
  82   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */
  83
  84
  85   /* The algorithm implementation is based on the lengths of the inputs. */
  86   /* srcB is always made to slide across srcA. */
  87   /* So srcBLen is always considered as shorter or equal to srcALen */
  88   if(srcALen >= srcBLen)
  89   {
  90     /* Initialization of inputA pointer */
  91     pIn1 = pSrcA;
  92
  93     /* Initialization of inputB pointer */
  94     pIn2 = pSrcB;
  95   }
  96   else
  97   {
  98     /* Initialization of inputA pointer */
  99     pIn1 = pSrcB;
 100
 101     /* Initialization of inputB pointer */
 102     pIn2 = pSrcA;
 103
 104     /* srcBLen is always considered as shorter or equal to srcALen */
 105     j = srcBLen;
 106     srcBLen = srcALen;
 107     srcALen = j;
 108   }
 109
 110   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 111   /* The function is internally
 112    * divided into three stages according to the number of multiplications that has to be
 113    * taken place between inputA samples and inputB samples. In the first stage of the
 114    * algorithm, the multiplications increase by one for every iteration.
 115    * In the second stage of the algorithm, srcBLen number of multiplications are done.
 116    * In the third stage of the algorithm, the multiplications decrease by one
 117    * for every iteration. */
 118
 119   /* The algorithm is implemented in three stages.
 120      The loop counters of each stage is initiated here. */
 121   blockSize1 = srcBLen - 1u;
 122   blockSize2 = srcALen - (srcBLen - 1u);
 123   blockSize3 = blockSize1;
 124
 125   /* --------------------------
 126    * Initializations of stage1
 127    * -------------------------*/
 128
 129   /* sum = x[0] * y[0]
 130    * sum = x[0] * y[1] + x[1] * y[0]
 131    * ....
 132    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 133    */
 134
 135   /* In this stage the MAC operations are increased by 1 for every iteration.
 136      The count variable holds the number of MAC operations performed */
 137   count = 1u;
 138
 139   /* Working pointer of inputA */
 140   px = pIn1;
 141
 142   /* Working pointer of inputB */
 143   py = pIn2;
 144
 145
 146   /* ------------------------
 147    * Stage1 process
 148    * ----------------------*/
 149
 150   /* For loop unrolling by 4, this stage is divided into two. */
 151   /* First part of this stage computes the MAC operations less than 4 */
 152   /* Second part of this stage computes the MAC operations greater than or equal to 4 */
 153
 154   /* The first part of the stage starts here */
 155   while((count < 4u) && (blockSize1 > 0u))
 156   {
 157     /* Accumulator is made zero for every iteration */
 158     sum = 0;
 159
 160     /* Loop over number of MAC operations between
 161      * inputA samples and inputB samples */
 162     k = count;
 163
 164     while(k > 0u)
 165     {
 166       /* Perform the multiply-accumulates */
 167       sum = __SMLAD(*px++, *py--, sum);
 168
 169       /* Decrement the loop counter */
 170       k--;
 171     }
 172
 173     /* Store the result in the accumulator in the destination buffer. */
 174     *pOut++ = (q15_t) (sum >> 15);
 175
 176     /* Update the inputA and inputB pointers for next MAC calculation */
 177     py = pIn2 + count;
 178     px = pIn1;
 179
 180     /* Increment the MAC count */
 181     count++;
 182
 183     /* Decrement the loop counter */
 184     blockSize1--;
 185   }
 186
 187   /* The second part of the stage starts here */
 188   /* The internal loop, over count, is unrolled by 4 */
 189   /* To, read the last two inputB samples using SIMD:
 190    * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
 191   py = py - 1;
 192
 193   while(blockSize1 > 0u)
 194   {
 195     /* Accumulator is made zero for every iteration */
 196     sum = 0;
 197
 198     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 199     k = count >> 2u;
 200
 201     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 202      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 203     while(k > 0u)
 204     {
 205       /* Perform the multiply-accumulates */
 206       /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
 207       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 208       /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
 209       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 210
 211       /* Decrement the loop counter */
 212       k--;
 213     }
 214
 215     /* For the next MAC operations, the pointer py is used without SIMD
 216      * So, py is incremented by 1 */
 217     py = py + 1u;
 218
 219     /* If the count is not a multiple of 4, compute any remaining MACs here.
 220      ** No loop unrolling is used. */
 221     k = count % 0x4u;
 222
 223     while(k > 0u)
 224     {
 225       /* Perform the multiply-accumulates */
 226       sum = __SMLAD(*px++, *py--, sum);
 227
 228       /* Decrement the loop counter */
 229       k--;
 230     }
 231
 232     /* Store the result in the accumulator in the destination buffer. */
 233     *pOut++ = (q15_t) (sum >> 15);
 234
 235     /* Update the inputA and inputB pointers for next MAC calculation */
 236     py = pIn2 + (count - 1u);
 237     px = pIn1;
 238
 239     /* Increment the MAC count */
 240     count++;
 241
 242     /* Decrement the loop counter */
 243     blockSize1--;
 244   }
 245
 246   /* --------------------------
 247    * Initializations of stage2
 248    * ------------------------*/
 249
 250   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 251    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 252    * ....
 253    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 254    */
 255
 256   /* Working pointer of inputA */
 257   px = pIn1;
 258
 259   /* Working pointer of inputB */
 260   pSrc2 = pIn2 + (srcBLen - 1u);
 261   py = pSrc2;
 262
 263   /* Initialize inputB pointer of type q31 */
 264   pb = (q31_t *) (py - 1u);
 265
 266   /* count is the index by which the pointer pIn1 to be incremented */
 267   count = 1u;
 268
 269
 270   /* --------------------
 271    * Stage2 process
 272    * -------------------*/
 273
 274   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 275    * So, to loop unroll over blockSize2,
 276    * srcBLen should be greater than or equal to 4 */
 277   if(srcBLen >= 4u)
 278   {
 279     /* Loop unroll over blockSize2, by 4 */
 280     blkCnt = blockSize2 >> 2u;
 281
 282     while(blkCnt > 0u)
 283     {
 284       /* Set all accumulators to zero */
 285       acc0 = 0;
 286       acc1 = 0;
 287       acc2 = 0;
 288       acc3 = 0;
 289
 290
 291       /* read x[0], x[1] samples */
 292       x0 = *(q31_t *) (px++);
 293       /* read x[1], x[2] samples */
 294       x1 = *(q31_t *) (px++);
 295
 296
 297       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 298       k = srcBLen >> 2u;
 299
 300       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 301        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 302       do
 303       {
 304         /* Read the last two inputB samples using SIMD:
 305          * y[srcBLen - 1] and y[srcBLen - 2] */
 306         c0 = *(pb--);
 307
 308         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
 309         acc0 = __SMLADX(x0, c0, acc0);
 310
 311         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
 312         acc1 = __SMLADX(x1, c0, acc1);
 313
 314         /* Read x[2], x[3] */
 315         x2 = *(q31_t *) (px++);
 316
 317         /* Read x[3], x[4] */
 318         x3 = *(q31_t *) (px++);
 319
 320         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
 321         acc2 = __SMLADX(x2, c0, acc2);
 322
 323         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
 324         acc3 = __SMLADX(x3, c0, acc3);
 325
 326         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
 327         c0 = *(pb--);
 328
 329         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
 330         acc0 = __SMLADX(x2, c0, acc0);
 331
 332         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
 333         acc1 = __SMLADX(x3, c0, acc1);
 334
 335         /* Read x[4], x[5] */
 336         x0 = *(q31_t *) (px++);
 337
 338         /* Read x[5], x[6] */
 339         x1 = *(q31_t *) (px++);
 340
 341         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
 342         acc2 = __SMLADX(x0, c0, acc2);
 343
 344         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
 345         acc3 = __SMLADX(x1, c0, acc3);
 346
 347       } while(--k);
 348
 349       /* For the next MAC operations, SIMD is not used
 350        * So, the 16 bit pointer if inputB, py is updated */
 351       py = (q15_t *) pb;
 352       py = py + 1;
 353
 354       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 355        ** No loop unrolling is used. */
 356       k = srcBLen % 0x4u;
 357
 358       if(k == 1u)
 359       {
 360         /* Read y[srcBLen - 5] */
 361         c0 = *(py);
 362 #ifdef  ARM_MATH_BIG_ENDIAN
 363
 364 //          c0 = unallign_rev(p, c0);
 365         c0 = c0 << 16;
 366 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 367
 368         /* Read x[7] */
 369         x3 = *(q31_t *) px++;
 370
 371         /* Perform the multiply-accumulates */
 372         acc0 = __SMLAD(x0, c0, acc0);
 373         acc1 = __SMLAD(x1, c0, acc1);
 374         acc2 = __SMLADX(x1, c0, acc2);
 375         acc3 = __SMLADX(x3, c0, acc3);
 376       }
 377
 378       if(k == 2u)
 379       {
 380         /* Read y[srcBLen - 5], y[srcBLen - 6] */
 381         c0 = *(pb);
 382
 383         /* Read x[7], x[8] */
 384         x3 = *(q31_t *) px++;
 385
 386         /* Read x[9] */
 387         x2 = *(q31_t *) px++;
 388
 389         /* Perform the multiply-accumulates */
 390         acc0 = __SMLADX(x0, c0, acc0);
 391         acc1 = __SMLADX(x1, c0, acc1);
 392         acc2 = __SMLADX(x3, c0, acc2);
 393         acc3 = __SMLADX(x2, c0, acc3);
 394       }
 395
 396       if(k == 3u)
 397       {
 398         /* Read y[srcBLen - 5], y[srcBLen - 6] */
 399         c0 = *pb--;
 400
 401         /* Read x[7], x[8] */
 402         x3 = *(q31_t *) px++;
 403
 404         /* Read x[9] */
 405         x2 = *(q31_t *) px++;
 406
 407         /* Perform the multiply-accumulates */
 408         acc0 = __SMLADX(x0, c0, acc0);
 409         acc1 = __SMLADX(x1, c0, acc1);
 410         acc2 = __SMLADX(x3, c0, acc2);
 411         acc3 = __SMLADX(x2, c0, acc3);
 412
 413         /* Read y[srcBLen - 7] */
 414 #ifdef  ARM_MATH_BIG_ENDIAN
 415
 416         c0 = (*pb);
 417 //              c0 = (c0 & 0x0000FFFF)<<16;
 418         c0 = (c0) << 16;
 419
 420 #else
 421
 422         c0 = (q15_t) (*pb >> 16);
 423
 424 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 425
 426         /* Read x[10] */
 427         x3 = *(q31_t *) px++;
 428
 429         /* Perform the multiply-accumulates */
 430         acc0 = __SMLADX(x1, c0, acc0);
 431         acc1 = __SMLAD(x2, c0, acc1);
 432         acc2 = __SMLADX(x2, c0, acc2);
 433         acc3 = __SMLADX(x3, c0, acc3);
 434       }
 435
 436       /* Store the results in the accumulators in the destination buffer. */
 437 #ifndef ARM_MATH_BIG_ENDIAN
 438
 439       *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16);
 440       *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16);
 441
 442 #else
 443
 444       *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16);
 445       *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16);
 446
 447 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
 448       /* Update the inputA and inputB pointers for next MAC calculation */
 449       px = pIn1 + (count * 4u);
 450       py = pSrc2;
 451       pb = (q31_t *) (py - 1);
 452
 453       /* Increment the pointer pIn1 index, count by 1 */
 454       count++;
 455
 456       /* Decrement the loop counter */
 457       blkCnt--;
 458     }
 459
 460     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 461      ** No loop unrolling is used. */
 462     blkCnt = blockSize2 % 0x4u;
 463
 464     while(blkCnt > 0u)
 465     {
 466       /* Accumulator is made zero for every iteration */
 467       sum = 0;
 468
 469       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 470       k = srcBLen >> 2u;
 471
 472       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 473        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 474       while(k > 0u)
 475       {
 476         /* Perform the multiply-accumulates */
 477         sum += ((q31_t) * px++ * *py--);
 478         sum += ((q31_t) * px++ * *py--);
 479         sum += ((q31_t) * px++ * *py--);
 480         sum += ((q31_t) * px++ * *py--);
 481
 482         /* Decrement the loop counter */
 483         k--;
 484       }
 485
 486       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 487        ** No loop unrolling is used. */
 488       k = srcBLen % 0x4u;
 489
 490       while(k > 0u)
 491       {
 492         /* Perform the multiply-accumulates */
 493         sum += ((q31_t) * px++ * *py--);
 494
 495         /* Decrement the loop counter */
 496         k--;
 497       }
 498
 499       /* Store the result in the accumulator in the destination buffer. */
 500       *pOut++ = (q15_t) (sum >> 15);
 501
 502       /* Update the inputA and inputB pointers for next MAC calculation */
 503       px = pIn1 + count;
 504       py = pSrc2;
 505
 506       /* Increment the pointer pIn1 index, count by 1 */
 507       count++;
 508
 509       /* Decrement the loop counter */
 510       blkCnt--;
 511     }
 512   }
 513   else
 514   {
 515     /* If the srcBLen is not a multiple of 4,
 516      * the blockSize2 loop cannot be unrolled by 4 */
 517     blkCnt = blockSize2;
 518
 519     while(blkCnt > 0u)
 520     {
 521       /* Accumulator is made zero for every iteration */
 522       sum = 0;
 523
 524       /* srcBLen number of MACS should be performed */
 525       k = srcBLen;
 526
 527       while(k > 0u)
 528       {
 529         /* Perform the multiply-accumulate */
 530         sum += ((q31_t) * px++ * *py--);
 531
 532         /* Decrement the loop counter */
 533         k--;
 534       }
 535
 536       /* Store the result in the accumulator in the destination buffer. */
 537       *pOut++ = (q15_t) (sum >> 15);
 538
 539       /* Update the inputA and inputB pointers for next MAC calculation */
 540       px = pIn1 + count;
 541       py = pSrc2;
 542
 543       /* Increment the MAC count */
 544       count++;
 545
 546       /* Decrement the loop counter */
 547       blkCnt--;
 548     }
 549   }
 550
 551
 552   /* --------------------------
 553    * Initializations of stage3
 554    * -------------------------*/
 555
 556   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 557    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 558    * ....
 559    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 560    * sum +=  x[srcALen-1] * y[srcBLen-1]
 561    */
 562
 563   /* In this stage the MAC operations are decreased by 1 for every iteration.
 564      The blockSize3 variable holds the number of MAC operations performed */
 565
 566   /* Working pointer of inputA */
 567   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 568   px = pSrc1;
 569
 570   /* Working pointer of inputB */
 571   pSrc2 = pIn2 + (srcBLen - 1u);
 572   pIn2 = pSrc2 - 1u;
 573   py = pIn2;
 574
 575   /* -------------------
 576    * Stage3 process
 577    * ------------------*/
 578
 579   /* For loop unrolling by 4, this stage is divided into two. */
 580   /* First part of this stage computes the MAC operations greater than 4 */
 581   /* Second part of this stage computes the MAC operations less than or equal to 4 */
 582
 583   /* The first part of the stage starts here */
 584   j = blockSize3 >> 2u;
 585
 586   while((j > 0u) && (blockSize3 > 0u))
 587   {
 588     /* Accumulator is made zero for every iteration */
 589     sum = 0;
 590
 591     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 592     k = blockSize3 >> 2u;
 593
 594     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 595      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 596     while(k > 0u)
 597     {
 598       /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
 599        * with y[srcBLen - 1], y[srcBLen - 2] respectively */
 600       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 601       /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
 602        * with y[srcBLen - 3], y[srcBLen - 4] respectively */
 603       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 604
 605       /* Decrement the loop counter */
 606       k--;
 607     }
 608
 609     /* For the next MAC operations, the pointer py is used without SIMD
 610      * So, py is incremented by 1 */
 611     py = py + 1u;
 612
 613     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
 614      ** No loop unrolling is used. */
 615     k = blockSize3 % 0x4u;
 616
 617     while(k > 0u)
 618     {
 619       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
 620       sum = __SMLAD(*px++, *py--, sum);
 621
 622       /* Decrement the loop counter */
 623       k--;
 624     }
 625
 626     /* Store the result in the accumulator in the destination buffer. */
 627     *pOut++ = (q15_t) (sum >> 15);
 628
 629     /* Update the inputA and inputB pointers for next MAC calculation */
 630     px = ++pSrc1;
 631     py = pIn2;
 632
 633     /* Decrement the loop counter */
 634     blockSize3--;
 635
 636     j--;
 637   }
 638
 639   /* The second part of the stage starts here */
 640   /* SIMD is not used for the next MAC operations,
 641    * so pointer py is updated to read only one sample at a time */
 642   py = py + 1u;
 643
 644   while(blockSize3 > 0u)
 645   {
 646     /* Accumulator is made zero for every iteration */
 647     sum = 0;
 648
 649     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 650     k = blockSize3;
 651
 652     while(k > 0u)
 653     {
 654       /* Perform the multiply-accumulates */
 655       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
 656       sum = __SMLAD(*px++, *py--, sum);
 657
 658       /* Decrement the loop counter */
 659       k--;
 660     }
 661
 662     /* Store the result in the accumulator in the destination buffer. */
 663     *pOut++ = (q15_t) (sum >> 15);
 664
 665     /* Update the inputA and inputB pointers for next MAC calculation */
 666     px = ++pSrc1;
 667     py = pSrc2;
 668
 669     /* Decrement the loop counter */
 670     blockSize3--;
 671   }
 672
 673 }
 674
 675 /**
 676  * @} end of Conv group
 677  */