git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_partial_fast_q15.c
   9 *
  10 * Description:  Fast Q15 Partial convolution.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated.
  28 * -------------------------------------------------------------------- */
  29
  30 #include "arm_math.h"
  31
  32 /**
  33  * @ingroup groupFilters
  34  */
  35
  36 /**
  37  * @addtogroup PartialConv
  38  * @{
  39  */
  40
  41 /**
  42  * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
  43  * @param[in]       *pSrcA points to the first input sequence.
  44  * @param[in]       srcALen length of the first input sequence.
  45  * @param[in]       *pSrcB points to the second input sequence.
  46  * @param[in]       srcBLen length of the second input sequence.
  47  * @param[out]      *pDst points to the location where the output result is written.
  48  * @param[in]       firstIndex is the first output sample to start with.
  49  * @param[in]       numPoints is the number of output points to be computed.
  50  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
  51  *
  52  * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
  53  */
  54
  55
  56 arm_status arm_conv_partial_fast_q15(
  57   q15_t * pSrcA,
  58   uint32_t srcALen,
  59   q15_t * pSrcB,
  60   uint32_t srcBLen,
  61   q15_t * pDst,
  62   uint32_t firstIndex,
  63   uint32_t numPoints)
  64 {
  65   q15_t *pIn1;                                   /* inputA pointer               */
  66   q15_t *pIn2;                                   /* inputB pointer               */
  67   q15_t *pOut = pDst;                            /* output pointer               */
  68   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
  69   q15_t *px;                                     /* Intermediate inputA pointer  */
  70   q15_t *py;                                     /* Intermediate inputB pointer  */
  71   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
  72   q31_t x0, x1, x2, x3, c0;
  73   uint32_t j, k, count, check, blkCnt;
  74   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */
  75   arm_status status;                             /* status of Partial convolution */
  76   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */
  77
  78   /* Check for range of output samples to be calculated */
  79   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
  80   {
  81     /* Set status as ARM_MATH_ARGUMENT_ERROR */
  82     status = ARM_MATH_ARGUMENT_ERROR;
  83   }
  84   else
  85   {
  86
  87     /* The algorithm implementation is based on the lengths of the inputs. */
  88     /* srcB is always made to slide across srcA. */
  89     /* So srcBLen is always considered as shorter or equal to srcALen */
  90     if(srcALen >= srcBLen)
  91     {
  92       /* Initialization of inputA pointer */
  93       pIn1 = pSrcA;
  94
  95       /* Initialization of inputB pointer */
  96       pIn2 = pSrcB;
  97     }
  98     else
  99     {
 100       /* Initialization of inputA pointer */
 101       pIn1 = pSrcB;
 102
 103       /* Initialization of inputB pointer */
 104       pIn2 = pSrcA;
 105
 106       /* srcBLen is always considered as shorter or equal to srcALen */
 107       j = srcBLen;
 108       srcBLen = srcALen;
 109       srcALen = j;
 110     }
 111
 112     /* Conditions to check which loopCounter holds
 113      * the first and last indices of the output samples to be calculated. */
 114     check = firstIndex + numPoints;
 115     blockSize3 = ((int32_t) check - (int32_t) srcALen);
 116     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
 117     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
 118     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
 119                                      (int32_t) numPoints) : 0;
 120     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
 121                                     (int32_t) firstIndex);
 122     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 123
 124     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 125     /* The function is internally
 126      * divided into three stages according to the number of multiplications that has to be
 127      * taken place between inputA samples and inputB samples. In the first stage of the
 128      * algorithm, the multiplications increase by one for every iteration.
 129      * In the second stage of the algorithm, srcBLen number of multiplications are done.
 130      * In the third stage of the algorithm, the multiplications decrease by one
 131      * for every iteration. */
 132
 133     /* Set the output pointer to point to the firstIndex
 134      * of the output sample to be calculated. */
 135     pOut = pDst + firstIndex;
 136
 137     /* --------------------------
 138      * Initializations of stage1
 139      * -------------------------*/
 140
 141     /* sum = x[0] * y[0]
 142      * sum = x[0] * y[1] + x[1] * y[0]
 143      * ....
 144      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 145      */
 146
 147     /* In this stage the MAC operations are increased by 1 for every iteration.
 148        The count variable holds the number of MAC operations performed.
 149        Since the partial convolution starts from firstIndex
 150        Number of Macs to be performed is firstIndex + 1 */
 151     count = 1u + firstIndex;
 152
 153     /* Working pointer of inputA */
 154     px = pIn1;
 155
 156     /* Working pointer of inputB */
 157     pSrc2 = pIn2 + firstIndex;
 158     py = pSrc2;
 159
 160     /* ------------------------
 161      * Stage1 process
 162      * ----------------------*/
 163
 164     /* For loop unrolling by 4, this stage is divided into two. */
 165     /* First part of this stage computes the MAC operations less than 4 */
 166     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
 167
 168     /* The first part of the stage starts here */
 169     while((count < 4u) && (blockSize1 > 0))
 170     {
 171       /* Accumulator is made zero for every iteration */
 172       sum = 0;
 173
 174       /* Loop over number of MAC operations between
 175        * inputA samples and inputB samples */
 176       k = count;
 177
 178       while(k > 0u)
 179       {
 180         /* Perform the multiply-accumulates */
 181         sum = __SMLAD(*px++, *py--, sum);
 182
 183         /* Decrement the loop counter */
 184         k--;
 185       }
 186
 187       /* Store the result in the accumulator in the destination buffer. */
 188       *pOut++ = (q15_t) (sum >> 15);
 189
 190       /* Update the inputA and inputB pointers for next MAC calculation */
 191       py = ++pSrc2;
 192       px = pIn1;
 193
 194       /* Increment the MAC count */
 195       count++;
 196
 197       /* Decrement the loop counter */
 198       blockSize1--;
 199     }
 200
 201     /* The second part of the stage starts here */
 202     /* The internal loop, over count, is unrolled by 4 */
 203     /* To, read the last two inputB samples using SIMD:
 204      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
 205     py = py - 1;
 206
 207     while(blockSize1 > 0)
 208     {
 209       /* Accumulator is made zero for every iteration */
 210       sum = 0;
 211
 212       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 213       k = count >> 2u;
 214
 215       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 216        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 217       while(k > 0u)
 218       {
 219         /* Perform the multiply-accumulates */
 220         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
 221         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 222         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
 223         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 224
 225         /* Decrement the loop counter */
 226         k--;
 227       }
 228
 229       /* For the next MAC operations, the pointer py is used without SIMD
 230        * So, py is incremented by 1 */
 231       py = py + 1u;
 232
 233       /* If the count is not a multiple of 4, compute any remaining MACs here.
 234        ** No loop unrolling is used. */
 235       k = count % 0x4u;
 236
 237       while(k > 0u)
 238       {
 239         /* Perform the multiply-accumulates */
 240         sum = __SMLAD(*px++, *py--, sum);
 241
 242         /* Decrement the loop counter */
 243         k--;
 244       }
 245
 246       /* Store the result in the accumulator in the destination buffer. */
 247       *pOut++ = (q15_t) (sum >> 15);
 248
 249       /* Update the inputA and inputB pointers for next MAC calculation */
 250       py = ++pSrc2 - 1u;
 251       px = pIn1;
 252
 253       /* Increment the MAC count */
 254       count++;
 255
 256       /* Decrement the loop counter */
 257       blockSize1--;
 258     }
 259
 260     /* --------------------------
 261      * Initializations of stage2
 262      * ------------------------*/
 263
 264     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 265      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 266      * ....
 267      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 268      */
 269
 270     /* Working pointer of inputA */
 271     px = pIn1;
 272
 273     /* Working pointer of inputB */
 274     pSrc2 = pIn2 + (srcBLen - 1u);
 275     py = pSrc2;
 276
 277     /* Initialize inputB pointer of type q31 */
 278     pb = (q31_t *) (py - 1u);
 279
 280     /* count is the index by which the pointer pIn1 to be incremented */
 281     count = 1u;
 282
 283
 284     /* --------------------
 285      * Stage2 process
 286      * -------------------*/
 287
 288     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 289      * So, to loop unroll over blockSize2,
 290      * srcBLen should be greater than or equal to 4 */
 291     if(srcBLen >= 4u)
 292     {
 293       /* Loop unroll over blockSize2, by 4 */
 294       blkCnt = ((uint32_t) blockSize2 >> 2u);
 295
 296       while(blkCnt > 0u)
 297       {
 298         /* Set all accumulators to zero */
 299         acc0 = 0;
 300         acc1 = 0;
 301         acc2 = 0;
 302         acc3 = 0;
 303
 304
 305         /* read x[0], x[1] samples */
 306         x0 = *(q31_t *) (px++);
 307         /* read x[1], x[2] samples */
 308         x1 = *(q31_t *) (px++);
 309
 310
 311         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 312         k = srcBLen >> 2u;
 313
 314         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 315          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 316         do
 317         {
 318           /* Read the last two inputB samples using SIMD:
 319            * y[srcBLen - 1] and y[srcBLen - 2] */
 320           c0 = *(pb--);
 321
 322           /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
 323           acc0 = __SMLADX(x0, c0, acc0);
 324
 325           /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
 326           acc1 = __SMLADX(x1, c0, acc1);
 327
 328           /* Read x[2], x[3] */
 329           x2 = *(q31_t *) (px++);
 330
 331           /* Read x[3], x[4] */
 332           x3 = *(q31_t *) (px++);
 333
 334           /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
 335           acc2 = __SMLADX(x2, c0, acc2);
 336
 337           /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
 338           acc3 = __SMLADX(x3, c0, acc3);
 339
 340           /* Read y[srcBLen - 3] and y[srcBLen - 4] */
 341           c0 = *(pb--);
 342
 343           /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
 344           acc0 = __SMLADX(x2, c0, acc0);
 345
 346           /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
 347           acc1 = __SMLADX(x3, c0, acc1);
 348
 349           /* Read x[4], x[5] */
 350           x0 = *(q31_t *) (px++);
 351
 352           /* Read x[5], x[6] */
 353           x1 = *(q31_t *) (px++);
 354
 355           /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
 356           acc2 = __SMLADX(x0, c0, acc2);
 357
 358           /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
 359           acc3 = __SMLADX(x1, c0, acc3);
 360
 361         } while(--k);
 362
 363         /* For the next MAC operations, SIMD is not used
 364          * So, the 16 bit pointer if inputB, py is updated */
 365         py = (q15_t *) pb;
 366         py = py + 1;
 367
 368         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 369          ** No loop unrolling is used. */
 370         k = srcBLen % 0x4u;
 371
 372         if(k == 1u)
 373         {
 374           /* Read y[srcBLen - 5] */
 375           c0 = *(py);
 376 #ifdef  ARM_MATH_BIG_ENDIAN
 377
 378           c0 = c0 << 16;
 379
 380 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 381
 382           /* Read x[7] */
 383           x3 = *(q31_t *) px++;
 384
 385           /* Perform the multiply-accumulates */
 386           acc0 = __SMLAD(x0, c0, acc0);
 387           acc1 = __SMLAD(x1, c0, acc1);
 388           acc2 = __SMLADX(x1, c0, acc2);
 389           acc3 = __SMLADX(x3, c0, acc3);
 390         }
 391
 392         if(k == 2u)
 393         {
 394           /* Read y[srcBLen - 5], y[srcBLen - 6] */
 395           c0 = *(pb);
 396
 397           /* Read x[7], x[8] */
 398           x3 = *(q31_t *) px++;
 399
 400           /* Read x[9] */
 401           x2 = *(q31_t *) px++;
 402
 403           /* Perform the multiply-accumulates */
 404           acc0 = __SMLADX(x0, c0, acc0);
 405           acc1 = __SMLADX(x1, c0, acc1);
 406           acc2 = __SMLADX(x3, c0, acc2);
 407           acc3 = __SMLADX(x2, c0, acc3);
 408         }
 409
 410         if(k == 3u)
 411         {
 412           /* Read y[srcBLen - 5], y[srcBLen - 6] */
 413           c0 = *pb--;
 414
 415           /* Read x[7], x[8] */
 416           x3 = *(q31_t *) px++;
 417
 418           /* Read x[9] */
 419           x2 = *(q31_t *) px++;
 420
 421           /* Perform the multiply-accumulates */
 422           acc0 = __SMLADX(x0, c0, acc0);
 423           acc1 = __SMLADX(x1, c0, acc1);
 424           acc2 = __SMLADX(x3, c0, acc2);
 425           acc3 = __SMLADX(x2, c0, acc3);
 426
 427           /* Read y[srcBLen - 7] */
 428 #ifdef  ARM_MATH_BIG_ENDIAN
 429
 430           c0 = (*pb);
 431           c0 = (c0) << 16;
 432
 433 #else
 434
 435           c0 = (q15_t) (*pb >> 16);
 436
 437 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 438
 439           /* Read x[10] */
 440           x3 = *(q31_t *) px++;
 441
 442           /* Perform the multiply-accumulates */
 443           acc0 = __SMLADX(x1, c0, acc0);
 444           acc1 = __SMLAD(x2, c0, acc1);
 445           acc2 = __SMLADX(x2, c0, acc2);
 446           acc3 = __SMLADX(x3, c0, acc3);
 447         }
 448
 449         /* Store the results in the accumulators in the destination buffer. */
 450 #ifndef ARM_MATH_BIG_ENDIAN
 451
 452         *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16);
 453         *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16);
 454
 455 #else
 456
 457         *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16);
 458         *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16);
 459
 460 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
 461
 462         /* Update the inputA and inputB pointers for next MAC calculation */
 463         px = pIn1 + (count * 4u);
 464         py = pSrc2;
 465         pb = (q31_t *) (py - 1);
 466
 467         /* Increment the pointer pIn1 index, count by 1 */
 468         count++;
 469
 470         /* Decrement the loop counter */
 471         blkCnt--;
 472       }
 473
 474       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 475        ** No loop unrolling is used. */
 476       blkCnt = (uint32_t) blockSize2 % 0x4u;
 477
 478       while(blkCnt > 0u)
 479       {
 480         /* Accumulator is made zero for every iteration */
 481         sum = 0;
 482
 483         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 484         k = srcBLen >> 2u;
 485
 486         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 487          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 488         while(k > 0u)
 489         {
 490           /* Perform the multiply-accumulates */
 491           sum += ((q31_t) * px++ * *py--);
 492           sum += ((q31_t) * px++ * *py--);
 493           sum += ((q31_t) * px++ * *py--);
 494           sum += ((q31_t) * px++ * *py--);
 495
 496           /* Decrement the loop counter */
 497           k--;
 498         }
 499
 500         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 501          ** No loop unrolling is used. */
 502         k = srcBLen % 0x4u;
 503
 504         while(k > 0u)
 505         {
 506           /* Perform the multiply-accumulates */
 507           sum += ((q31_t) * px++ * *py--);
 508
 509           /* Decrement the loop counter */
 510           k--;
 511         }
 512
 513         /* Store the result in the accumulator in the destination buffer. */
 514         *pOut++ = (q15_t) (sum >> 15);
 515
 516         /* Update the inputA and inputB pointers for next MAC calculation */
 517         px = pIn1 + count;
 518         py = pSrc2;
 519
 520         /* Increment the pointer pIn1 index, count by 1 */
 521         count++;
 522
 523         /* Decrement the loop counter */
 524         blkCnt--;
 525       }
 526     }
 527     else
 528     {
 529       /* If the srcBLen is not a multiple of 4,
 530        * the blockSize2 loop cannot be unrolled by 4 */
 531       blkCnt = (uint32_t) blockSize2;
 532
 533       while(blkCnt > 0u)
 534       {
 535         /* Accumulator is made zero for every iteration */
 536         sum = 0;
 537
 538         /* srcBLen number of MACS should be performed */
 539         k = srcBLen;
 540
 541         while(k > 0u)
 542         {
 543           /* Perform the multiply-accumulate */
 544           sum += ((q31_t) * px++ * *py--);
 545
 546           /* Decrement the loop counter */
 547           k--;
 548         }
 549
 550         /* Store the result in the accumulator in the destination buffer. */
 551         *pOut++ = (q15_t) (sum >> 15);
 552
 553         /* Update the inputA and inputB pointers for next MAC calculation */
 554         px = pIn1 + count;
 555         py = pSrc2;
 556
 557         /* Increment the MAC count */
 558         count++;
 559
 560         /* Decrement the loop counter */
 561         blkCnt--;
 562       }
 563     }
 564
 565
 566     /* --------------------------
 567      * Initializations of stage3
 568      * -------------------------*/
 569
 570     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 571      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 572      * ....
 573      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 574      * sum +=  x[srcALen-1] * y[srcBLen-1]
 575      */
 576
 577     /* In this stage the MAC operations are decreased by 1 for every iteration.
 578        The count variable holds the number of MAC operations performed */
 579     count = srcBLen - 1u;
 580
 581     /* Working pointer of inputA */
 582     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 583     px = pSrc1;
 584
 585     /* Working pointer of inputB */
 586     pSrc2 = pIn2 + (srcBLen - 1u);
 587     pIn2 = pSrc2 - 1u;
 588     py = pIn2;
 589
 590     /* -------------------
 591      * Stage3 process
 592      * ------------------*/
 593
 594     /* For loop unrolling by 4, this stage is divided into two. */
 595     /* First part of this stage computes the MAC operations greater than 4 */
 596     /* Second part of this stage computes the MAC operations less than or equal to 4 */
 597
 598     /* The first part of the stage starts here */
 599     j = count >> 2u;
 600
 601     while((j > 0u) && (blockSize3 > 0))
 602     {
 603       /* Accumulator is made zero for every iteration */
 604       sum = 0;
 605
 606       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 607       k = count >> 2u;
 608
 609       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 610        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 611       while(k > 0u)
 612       {
 613         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
 614          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
 615         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 616         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
 617          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
 618         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 619
 620         /* Decrement the loop counter */
 621         k--;
 622       }
 623
 624       /* For the next MAC operations, the pointer py is used without SIMD
 625        * So, py is incremented by 1 */
 626       py = py + 1u;
 627
 628       /* If the count is not a multiple of 4, compute any remaining MACs here.
 629        ** No loop unrolling is used. */
 630       k = count % 0x4u;
 631
 632       while(k > 0u)
 633       {
 634         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
 635         sum = __SMLAD(*px++, *py--, sum);
 636
 637         /* Decrement the loop counter */
 638         k--;
 639       }
 640
 641       /* Store the result in the accumulator in the destination buffer. */
 642       *pOut++ = (q15_t) (sum >> 15);
 643
 644       /* Update the inputA and inputB pointers for next MAC calculation */
 645       px = ++pSrc1;
 646       py = pIn2;
 647
 648       /* Decrement the MAC count */
 649       count--;
 650
 651       /* Decrement the loop counter */
 652       blockSize3--;
 653
 654       j--;
 655     }
 656
 657     /* The second part of the stage starts here */
 658     /* SIMD is not used for the next MAC operations,
 659      * so pointer py is updated to read only one sample at a time */
 660     py = py + 1u;
 661
 662     while(blockSize3 > 0)
 663     {
 664       /* Accumulator is made zero for every iteration */
 665       sum = 0;
 666
 667       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 668       k = count;
 669
 670       while(k > 0u)
 671       {
 672         /* Perform the multiply-accumulates */
 673         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
 674         sum = __SMLAD(*px++, *py--, sum);
 675
 676         /* Decrement the loop counter */
 677         k--;
 678       }
 679
 680       /* Store the result in the accumulator in the destination buffer. */
 681       *pOut++ = (q15_t) (sum >> 15);
 682
 683       /* Update the inputA and inputB pointers for next MAC calculation */
 684       px = ++pSrc1;
 685       py = pSrc2;
 686
 687       /* Decrement the MAC count */
 688       count--;
 689
 690       /* Decrement the loop counter */
 691       blockSize3--;
 692     }
 693
 694     /* set status as ARM_MATH_SUCCESS */
 695     status = ARM_MATH_SUCCESS;
 696   }
 697
 698   /* Return to application */
 699   return (status);
 700
 701 }
 702
 703 /**
 704  * @} end of PartialConv group
 705  */