git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_partial_q15.c
   9 *
  10 * Description:  Partial convolution of Q15 sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated
  28 *
  29 * Version 0.0.7  2010/06/10
  30 *    Misra-C changes done
  31 *
  32 * -------------------------------------------------------------------- */
  33
  34 #include "arm_math.h"
  35
  36 /**
  37  * @ingroup groupFilters
  38  */
  39
  40 /**
  41  * @addtogroup PartialConv
  42  * @{
  43  */
  44
  45 /**
  46  * @brief Partial convolution of Q15 sequences.
  47  * @param[in]       *pSrcA points to the first input sequence.
  48  * @param[in]       srcALen length of the first input sequence.
  49  * @param[in]       *pSrcB points to the second input sequence.
  50  * @param[in]       srcBLen length of the second input sequence.
  51  * @param[out]      *pDst points to the location where the output result is written.
  52  * @param[in]       firstIndex is the first output sample to start with.
  53  * @param[in]       numPoints is the number of output points to be computed.
  54  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
  55  *
  56  * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
  57  */
  58
  59
  60 arm_status arm_conv_partial_q15(
  61   q15_t * pSrcA,
  62   uint32_t srcALen,
  63   q15_t * pSrcB,
  64   uint32_t srcBLen,
  65   q15_t * pDst,
  66   uint32_t firstIndex,
  67   uint32_t numPoints)
  68 {
  69
  70
  71 #ifndef ARM_MATH_CM0
  72
  73   /* Run the below code for Cortex-M4 and Cortex-M3 */
  74
  75   q15_t *pIn1;                                   /* inputA pointer               */
  76   q15_t *pIn2;                                   /* inputB pointer               */
  77   q15_t *pOut = pDst;                            /* output pointer               */
  78   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
  79   q15_t *px;                                     /* Intermediate inputA pointer  */
  80   q15_t *py;                                     /* Intermediate inputB pointer  */
  81   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
  82   q31_t x0, x1, x2, x3, c0;                      /* Temporary input variables */
  83   uint32_t j, k, count, check, blkCnt;
  84   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter                 */
  85   arm_status status;                             /* status of Partial convolution */
  86   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */
  87
  88   /* Check for range of output samples to be calculated */
  89   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
  90   {
  91     /* Set status as ARM_MATH_ARGUMENT_ERROR */
  92     status = ARM_MATH_ARGUMENT_ERROR;
  93   }
  94   else
  95   {
  96
  97     /* The algorithm implementation is based on the lengths of the inputs. */
  98     /* srcB is always made to slide across srcA. */
  99     /* So srcBLen is always considered as shorter or equal to srcALen */
 100     if(srcALen >= srcBLen)
 101     {
 102       /* Initialization of inputA pointer */
 103       pIn1 = pSrcA;
 104
 105       /* Initialization of inputB pointer */
 106       pIn2 = pSrcB;
 107     }
 108     else
 109     {
 110       /* Initialization of inputA pointer */
 111       pIn1 = pSrcB;
 112
 113       /* Initialization of inputB pointer */
 114       pIn2 = pSrcA;
 115
 116       /* srcBLen is always considered as shorter or equal to srcALen */
 117       j = srcBLen;
 118       srcBLen = srcALen;
 119       srcALen = j;
 120     }
 121
 122     /* Conditions to check which loopCounter holds
 123      * the first and last indices of the output samples to be calculated. */
 124     check = firstIndex + numPoints;
 125     blockSize3 = ((int32_t) check - (int32_t) srcALen);
 126     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
 127     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
 128     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
 129                                      (int32_t) numPoints) : 0;
 130     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
 131                                     (int32_t) firstIndex);
 132     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 133
 134     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 135     /* The function is internally
 136      * divided into three stages according to the number of multiplications that has to be
 137      * taken place between inputA samples and inputB samples. In the first stage of the
 138      * algorithm, the multiplications increase by one for every iteration.
 139      * In the second stage of the algorithm, srcBLen number of multiplications are done.
 140      * In the third stage of the algorithm, the multiplications decrease by one
 141      * for every iteration. */
 142
 143     /* Set the output pointer to point to the firstIndex
 144      * of the output sample to be calculated. */
 145     pOut = pDst + firstIndex;
 146
 147     /* --------------------------
 148      * Initializations of stage1
 149      * -------------------------*/
 150
 151     /* sum = x[0] * y[0]
 152      * sum = x[0] * y[1] + x[1] * y[0]
 153      * ....
 154      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 155      */
 156
 157     /* In this stage the MAC operations are increased by 1 for every iteration.
 158        The count variable holds the number of MAC operations performed.
 159        Since the partial convolution starts from firstIndex
 160        Number of Macs to be performed is firstIndex + 1 */
 161     count = 1u + firstIndex;
 162
 163     /* Working pointer of inputA */
 164     px = pIn1;
 165
 166     /* Working pointer of inputB */
 167     pSrc2 = pIn2 + firstIndex;
 168     py = pSrc2;
 169
 170     /* ------------------------
 171      * Stage1 process
 172      * ----------------------*/
 173
 174     /* For loop unrolling by 4, this stage is divided into two. */
 175     /* First part of this stage computes the MAC operations less than 4 */
 176     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
 177
 178     /* The first part of the stage starts here */
 179     while((count < 4u) && (blockSize1 > 0))
 180     {
 181       /* Accumulator is made zero for every iteration */
 182       sum = 0;
 183
 184       /* Loop over number of MAC operations between
 185        * inputA samples and inputB samples */
 186       k = count;
 187
 188       while(k > 0u)
 189       {
 190         /* Perform the multiply-accumulates */
 191         sum = __SMLALD(*px++, *py--, sum);
 192
 193         /* Decrement the loop counter */
 194         k--;
 195       }
 196
 197       /* Store the result in the accumulator in the destination buffer. */
 198       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
 199
 200       /* Update the inputA and inputB pointers for next MAC calculation */
 201       py = ++pSrc2;
 202       px = pIn1;
 203
 204       /* Increment the MAC count */
 205       count++;
 206
 207       /* Decrement the loop counter */
 208       blockSize1--;
 209     }
 210
 211     /* The second part of the stage starts here */
 212     /* The internal loop, over count, is unrolled by 4 */
 213     /* To, read the last two inputB samples using SIMD:
 214      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
 215     py = py - 1;
 216
 217     while(blockSize1 > 0)
 218     {
 219       /* Accumulator is made zero for every iteration */
 220       sum = 0;
 221
 222       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 223       k = count >> 2u;
 224
 225       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 226        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 227       while(k > 0u)
 228       {
 229         /* Perform the multiply-accumulates */
 230         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
 231         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 232         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
 233         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 234
 235         /* Decrement the loop counter */
 236         k--;
 237       }
 238
 239       /* For the next MAC operations, the pointer py is used without SIMD
 240        * So, py is incremented by 1 */
 241       py = py + 1u;
 242
 243       /* If the count is not a multiple of 4, compute any remaining MACs here.
 244        ** No loop unrolling is used. */
 245       k = count % 0x4u;
 246
 247       while(k > 0u)
 248       {
 249         /* Perform the multiply-accumulates */
 250         sum = __SMLALD(*px++, *py--, sum);
 251
 252         /* Decrement the loop counter */
 253         k--;
 254       }
 255
 256       /* Store the result in the accumulator in the destination buffer. */
 257       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
 258
 259       /* Update the inputA and inputB pointers for next MAC calculation */
 260       py = ++pSrc2 - 1u;
 261       px = pIn1;
 262
 263       /* Increment the MAC count */
 264       count++;
 265
 266       /* Decrement the loop counter */
 267       blockSize1--;
 268     }
 269
 270     /* --------------------------
 271      * Initializations of stage2
 272      * ------------------------*/
 273
 274     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 275      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 276      * ....
 277      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 278      */
 279
 280     /* Working pointer of inputA */
 281     px = pIn1;
 282
 283     /* Working pointer of inputB */
 284     pSrc2 = pIn2 + (srcBLen - 1u);
 285     py = pSrc2;
 286
 287     /* Initialize inputB pointer of type q31 */
 288     pb = (q31_t *) (py - 1u);
 289
 290     /* count is the index by which the pointer pIn1 to be incremented */
 291     count = 1u;
 292
 293
 294     /* --------------------
 295      * Stage2 process
 296      * -------------------*/
 297
 298     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 299      * So, to loop unroll over blockSize2,
 300      * srcBLen should be greater than or equal to 4 */
 301     if(srcBLen >= 4u)
 302     {
 303       /* Loop unroll over blockSize2, by 4 */
 304       blkCnt = ((uint32_t) blockSize2 >> 2u);
 305
 306       while(blkCnt > 0u)
 307       {
 308         /* Set all accumulators to zero */
 309         acc0 = 0;
 310         acc1 = 0;
 311         acc2 = 0;
 312         acc3 = 0;
 313
 314
 315         /* read x[0], x[1] samples */
 316         x0 = *(q31_t *) (px++);
 317         /* read x[1], x[2] samples */
 318         x1 = *(q31_t *) (px++);
 319
 320
 321         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 322         k = srcBLen >> 2u;
 323
 324         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 325          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 326         do
 327         {
 328           /* Read the last two inputB samples using SIMD:
 329            * y[srcBLen - 1] and y[srcBLen - 2] */
 330           c0 = *(pb--);
 331
 332           /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
 333           acc0 = __SMLALDX(x0, c0, acc0);
 334
 335           /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
 336           acc1 = __SMLALDX(x1, c0, acc1);
 337
 338           /* Read x[2], x[3] */
 339           x2 = *(q31_t *) (px++);
 340
 341           /* Read x[3], x[4] */
 342           x3 = *(q31_t *) (px++);
 343
 344           /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
 345           acc2 = __SMLALDX(x2, c0, acc2);
 346
 347           /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
 348           acc3 = __SMLALDX(x3, c0, acc3);
 349
 350           /* Read y[srcBLen - 3] and y[srcBLen - 4] */
 351           c0 = *(pb--);
 352
 353           /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
 354           acc0 = __SMLALDX(x2, c0, acc0);
 355
 356           /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
 357           acc1 = __SMLALDX(x3, c0, acc1);
 358
 359           /* Read x[4], x[5] */
 360           x0 = *(q31_t *) (px++);
 361
 362           /* Read x[5], x[6] */
 363           x1 = *(q31_t *) (px++);
 364
 365           /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
 366           acc2 = __SMLALDX(x0, c0, acc2);
 367
 368           /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
 369           acc3 = __SMLALDX(x1, c0, acc3);
 370
 371         } while(--k);
 372
 373         /* For the next MAC operations, SIMD is not used
 374          * So, the 16 bit pointer if inputB, py is updated */
 375         py = (q15_t *) pb;
 376         py = py + 1;
 377
 378         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 379          ** No loop unrolling is used. */
 380         k = srcBLen % 0x4u;
 381
 382         if(k == 1u)
 383         {
 384           /* Read y[srcBLen - 5] */
 385           c0 = *(py);
 386
 387 #ifdef  ARM_MATH_BIG_ENDIAN
 388
 389           c0 = c0 << 16u;
 390
 391 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 392           /* Read x[7] */
 393           x3 = *(q31_t *) px++;
 394
 395           /* Perform the multiply-accumulates */
 396           acc0 = __SMLALD(x0, c0, acc0);
 397           acc1 = __SMLALD(x1, c0, acc1);
 398           acc2 = __SMLALDX(x1, c0, acc2);
 399           acc3 = __SMLALDX(x3, c0, acc3);
 400         }
 401
 402         if(k == 2u)
 403         {
 404           /* Read y[srcBLen - 5], y[srcBLen - 6] */
 405           c0 = *(pb);
 406
 407           /* Read x[7], x[8] */
 408           x3 = *(q31_t *) px++;
 409
 410           /* Read x[9] */
 411           x2 = *(q31_t *) px++;
 412
 413           /* Perform the multiply-accumulates */
 414           acc0 = __SMLALDX(x0, c0, acc0);
 415           acc1 = __SMLALDX(x1, c0, acc1);
 416           acc2 = __SMLALDX(x3, c0, acc2);
 417           acc3 = __SMLALDX(x2, c0, acc3);
 418         }
 419
 420         if(k == 3u)
 421         {
 422           /* Read y[srcBLen - 5], y[srcBLen - 6] */
 423           c0 = *pb--;
 424
 425           /* Read x[7], x[8] */
 426           x3 = *(q31_t *) px++;
 427
 428           /* Read x[9] */
 429           x2 = *(q31_t *) px++;
 430
 431           /* Perform the multiply-accumulates */
 432           acc0 = __SMLALDX(x0, c0, acc0);
 433           acc1 = __SMLALDX(x1, c0, acc1);
 434           acc2 = __SMLALDX(x3, c0, acc2);
 435           acc3 = __SMLALDX(x2, c0, acc3);
 436
 437 #ifdef  ARM_MATH_BIG_ENDIAN
 438
 439           /* Read y[srcBLen - 7] */
 440           c0 = (*pb);
 441           c0 = (c0) << 16;
 442
 443 #else
 444
 445           /* Read y[srcBLen - 7] */
 446           c0 = (q15_t) (*pb >> 16);
 447
 448 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 449
 450           /* Read x[10] */
 451           x3 = *(q31_t *) px++;
 452
 453           /* Perform the multiply-accumulates */
 454           acc0 = __SMLALDX(x1, c0, acc0);
 455           acc1 = __SMLALD(x2, c0, acc1);
 456           acc2 = __SMLALDX(x2, c0, acc2);
 457           acc3 = __SMLALDX(x3, c0, acc3);
 458         }
 459
 460         /* Store the results in the accumulators in the destination buffer. */
 461 #ifndef  ARM_MATH_BIG_ENDIAN
 462
 463         *__SIMD32(pOut)++ =
 464           __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
 465         *__SIMD32(pOut)++ =
 466           __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
 467
 468 #else
 469
 470         *__SIMD32(pOut)++ =
 471           __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
 472         *__SIMD32(pOut)++ =
 473           __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
 474
 475 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
 476
 477         /* Update the inputA and inputB pointers for next MAC calculation */
 478         px = pIn1 + (count * 4u);
 479         py = pSrc2;
 480         pb = (q31_t *) (py - 1);
 481
 482         /* Increment the pointer pIn1 index, count by 1 */
 483         count++;
 484
 485         /* Decrement the loop counter */
 486         blkCnt--;
 487       }
 488
 489       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 490        ** No loop unrolling is used. */
 491       blkCnt = (uint32_t) blockSize2 % 0x4u;
 492
 493       while(blkCnt > 0u)
 494       {
 495         /* Accumulator is made zero for every iteration */
 496         sum = 0;
 497
 498         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 499         k = srcBLen >> 2u;
 500
 501         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 502          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 503         while(k > 0u)
 504         {
 505           /* Perform the multiply-accumulates */
 506           sum += (q63_t) ((q31_t) * px++ * *py--);
 507           sum += (q63_t) ((q31_t) * px++ * *py--);
 508           sum += (q63_t) ((q31_t) * px++ * *py--);
 509           sum += (q63_t) ((q31_t) * px++ * *py--);
 510
 511           /* Decrement the loop counter */
 512           k--;
 513         }
 514
 515         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 516          ** No loop unrolling is used. */
 517         k = srcBLen % 0x4u;
 518
 519         while(k > 0u)
 520         {
 521           /* Perform the multiply-accumulates */
 522           sum += (q63_t) ((q31_t) * px++ * *py--);
 523
 524           /* Decrement the loop counter */
 525           k--;
 526         }
 527
 528         /* Store the result in the accumulator in the destination buffer. */
 529         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
 530
 531         /* Update the inputA and inputB pointers for next MAC calculation */
 532         px = pIn1 + count;
 533         py = pSrc2;
 534
 535         /* Increment the pointer pIn1 index, count by 1 */
 536         count++;
 537
 538         /* Decrement the loop counter */
 539         blkCnt--;
 540       }
 541     }
 542     else
 543     {
 544       /* If the srcBLen is not a multiple of 4,
 545        * the blockSize2 loop cannot be unrolled by 4 */
 546       blkCnt = (uint32_t) blockSize2;
 547
 548       while(blkCnt > 0u)
 549       {
 550         /* Accumulator is made zero for every iteration */
 551         sum = 0;
 552
 553         /* srcBLen number of MACS should be performed */
 554         k = srcBLen;
 555
 556         while(k > 0u)
 557         {
 558           /* Perform the multiply-accumulate */
 559           sum += (q63_t) ((q31_t) * px++ * *py--);
 560
 561           /* Decrement the loop counter */
 562           k--;
 563         }
 564
 565         /* Store the result in the accumulator in the destination buffer. */
 566         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
 567
 568         /* Update the inputA and inputB pointers for next MAC calculation */
 569         px = pIn1 + count;
 570         py = pSrc2;
 571
 572         /* Increment the MAC count */
 573         count++;
 574
 575         /* Decrement the loop counter */
 576         blkCnt--;
 577       }
 578     }
 579
 580
 581     /* --------------------------
 582      * Initializations of stage3
 583      * -------------------------*/
 584
 585     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 586      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 587      * ....
 588      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 589      * sum +=  x[srcALen-1] * y[srcBLen-1]
 590      */
 591
 592     /* In this stage the MAC operations are decreased by 1 for every iteration.
 593        The count variable holds the number of MAC operations performed */
 594     count = srcBLen - 1u;
 595
 596     /* Working pointer of inputA */
 597     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 598     px = pSrc1;
 599
 600     /* Working pointer of inputB */
 601     pSrc2 = pIn2 + (srcBLen - 1u);
 602     pIn2 = pSrc2 - 1u;
 603     py = pIn2;
 604
 605     /* -------------------
 606      * Stage3 process
 607      * ------------------*/
 608
 609     /* For loop unrolling by 4, this stage is divided into two. */
 610     /* First part of this stage computes the MAC operations greater than 4 */
 611     /* Second part of this stage computes the MAC operations less than or equal to 4 */
 612
 613     /* The first part of the stage starts here */
 614     j = count >> 2u;
 615
 616     while((j > 0u) && (blockSize3 > 0))
 617     {
 618       /* Accumulator is made zero for every iteration */
 619       sum = 0;
 620
 621       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 622       k = count >> 2u;
 623
 624       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 625        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 626       while(k > 0u)
 627       {
 628         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
 629          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
 630         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 631         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
 632          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
 633         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 634
 635         /* Decrement the loop counter */
 636         k--;
 637       }
 638
 639       /* For the next MAC operations, the pointer py is used without SIMD
 640        * So, py is incremented by 1 */
 641       py = py + 1u;
 642
 643       /* If the count is not a multiple of 4, compute any remaining MACs here.
 644        ** No loop unrolling is used. */
 645       k = count % 0x4u;
 646
 647       while(k > 0u)
 648       {
 649         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
 650         sum = __SMLALD(*px++, *py--, sum);
 651
 652         /* Decrement the loop counter */
 653         k--;
 654       }
 655
 656       /* Store the result in the accumulator in the destination buffer. */
 657       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
 658
 659       /* Update the inputA and inputB pointers for next MAC calculation */
 660       px = ++pSrc1;
 661       py = pIn2;
 662
 663       /* Decrement the MAC count */
 664       count--;
 665
 666       /* Decrement the loop counter */
 667       blockSize3--;
 668
 669       j--;
 670     }
 671
 672     /* The second part of the stage starts here */
 673     /* SIMD is not used for the next MAC operations,
 674      * so pointer py is updated to read only one sample at a time */
 675     py = py + 1u;
 676
 677     while(blockSize3 > 0)
 678     {
 679       /* Accumulator is made zero for every iteration */
 680       sum = 0;
 681
 682       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 683       k = count;
 684
 685       while(k > 0u)
 686       {
 687         /* Perform the multiply-accumulates */
 688         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
 689         sum = __SMLALD(*px++, *py--, sum);
 690
 691         /* Decrement the loop counter */
 692         k--;
 693       }
 694
 695       /* Store the result in the accumulator in the destination buffer. */
 696       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
 697
 698       /* Update the inputA and inputB pointers for next MAC calculation */
 699       px = ++pSrc1;
 700       py = pSrc2;
 701
 702       /* Decrement the MAC count */
 703       count--;
 704
 705       /* Decrement the loop counter */
 706       blockSize3--;
 707     }
 708
 709     /* set status as ARM_MATH_SUCCESS */
 710     status = ARM_MATH_SUCCESS;
 711   }
 712
 713   /* Return to application */
 714   return (status);
 715
 716 #else
 717
 718   /* Run the below code for Cortex-M0 */
 719
 720   q15_t *pIn1 = pSrcA;                           /* inputA pointer */
 721   q15_t *pIn2 = pSrcB;                           /* inputB pointer */
 722   q63_t sum;                                     /* Accumulator */
 723   uint32_t i, j;                                 /* loop counters */
 724   arm_status status;                             /* status of Partial convolution */
 725
 726   /* Check for range of output samples to be calculated */
 727   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 728   {
 729     /* Set status as ARM_ARGUMENT_ERROR */
 730     status = ARM_MATH_ARGUMENT_ERROR;
 731   }
 732   else
 733   {
 734     /* Loop to calculate convolution for output length number of values */
 735     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
 736     {
 737       /* Initialize sum with zero to carry on MAC operations */
 738       sum = 0;
 739
 740       /* Loop to perform MAC operations according to convolution equation */
 741       for (j = 0; j <= i; j++)
 742       {
 743         /* Check the array limitations */
 744         if(((i - j) < srcBLen) && (j < srcALen))
 745         {
 746           /* z[i] += x[i-j] * y[j] */
 747           sum += ((q31_t) pIn1[j] * (pIn2[i - j]));
 748         }
 749       }
 750
 751       /* Store the output in the destination buffer */
 752       pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u);
 753     }
 754     /* set status as ARM_SUCCESS as there are no argument errors */
 755     status = ARM_MATH_SUCCESS;
 756   }
 757   return (status);
 758
 759 #endif /*     #ifndef ARM_MATH_CM0      */
 760
 761 }
 762
 763 /**
 764  * @} end of PartialConv group
 765  */