git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q7.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_q7.c
   9 *
  10 * Description:  Convolution of Q7 sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated
  28 *
  29 * Version 0.0.7  2010/06/10
  30 *    Misra-C changes done
  31 *
  32 * -------------------------------------------------------------------- */
  33
  34 #include "arm_math.h"
  35
  36 /**
  37  * @ingroup groupFilters
  38  */
  39
  40 /**
  41  * @addtogroup Conv
  42  * @{
  43  */
  44
  45 /**
  46  * @brief Convolution of Q7 sequences.
  47  * @param[in] *pSrcA points to the first input sequence.
  48  * @param[in] srcALen length of the first input sequence.
  49  * @param[in] *pSrcB points to the second input sequence.
  50  * @param[in] srcBLen length of the second input sequence.
  51  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
  52  * @return none.
  53  *
  54  * @details
  55  * <b>Scaling and Overflow Behavior:</b>
  56  *
  57  * \par
  58  * The function is implemented using a 32-bit internal accumulator.
  59  * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
  60  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
  61  * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
  62  * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
  63  */
  64
  65 void arm_conv_q7(
  66   q7_t * pSrcA,
  67   uint32_t srcALen,
  68   q7_t * pSrcB,
  69   uint32_t srcBLen,
  70   q7_t * pDst)
  71 {
  72
  73
  74 #ifndef ARM_MATH_CM0
  75
  76   /* Run the below code for Cortex-M4 and Cortex-M3 */
  77
  78   q7_t *pIn1;                                    /* inputA pointer */
  79   q7_t *pIn2;                                    /* inputB pointer */
  80   q7_t *pOut = pDst;                             /* output pointer */
  81   q7_t *px;                                      /* Intermediate inputA pointer */
  82   q7_t *py;                                      /* Intermediate inputB pointer */
  83   q7_t *pSrc1, *pSrc2;                           /* Intermediate pointers */
  84   q7_t x0, x1, x2, x3, c0, c1;                   /* Temporary variables to hold state and coefficient values */
  85   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
  86   q31_t input1, input2;                          /* Temporary input variables */
  87   q15_t in1, in2;                                /* Temporary input variables */
  88   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counter */
  89
  90
  91   /* The algorithm implementation is based on the lengths of the inputs. */
  92   /* srcB is always made to slide across srcA. */
  93   /* So srcBLen is always considered as shorter or equal to srcALen */
  94   if(srcALen >= srcBLen)
  95   {
  96     /* Initialization of inputA pointer */
  97     pIn1 = pSrcA;
  98
  99     /* Initialization of inputB pointer */
 100     pIn2 = pSrcB;
 101   }
 102   else
 103   {
 104     /* Initialization of inputA pointer */
 105     pIn1 = pSrcB;
 106
 107     /* Initialization of inputB pointer */
 108     pIn2 = pSrcA;
 109
 110     /* srcBLen is always considered as shorter or equal to srcALen */
 111     j = srcBLen;
 112     srcBLen = srcALen;
 113     srcALen = j;
 114   }
 115
 116   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 117   /* The function is internally
 118    * divided into three stages according to the number of multiplications that has to be
 119    * taken place between inputA samples and inputB samples. In the first stage of the
 120    * algorithm, the multiplications increase by one for every iteration.
 121    * In the second stage of the algorithm, srcBLen number of multiplications are done.
 122    * In the third stage of the algorithm, the multiplications decrease by one
 123    * for every iteration. */
 124
 125   /* The algorithm is implemented in three stages.
 126      The loop counters of each stage is initiated here. */
 127   blockSize1 = srcBLen - 1u;
 128   blockSize2 = (srcALen - srcBLen) + 1u;
 129   blockSize3 = blockSize1;
 130
 131   /* --------------------------
 132    * Initializations of stage1
 133    * -------------------------*/
 134
 135   /* sum = x[0] * y[0]
 136    * sum = x[0] * y[1] + x[1] * y[0]
 137    * ....
 138    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 139    */
 140
 141   /* In this stage the MAC operations are increased by 1 for every iteration.
 142      The count variable holds the number of MAC operations performed */
 143   count = 1u;
 144
 145   /* Working pointer of inputA */
 146   px = pIn1;
 147
 148   /* Working pointer of inputB */
 149   py = pIn2;
 150
 151
 152   /* ------------------------
 153    * Stage1 process
 154    * ----------------------*/
 155
 156   /* The first stage starts here */
 157   while(blockSize1 > 0u)
 158   {
 159     /* Accumulator is made zero for every iteration */
 160     sum = 0;
 161
 162     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 163     k = count >> 2u;
 164
 165     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 166      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 167     while(k > 0u)
 168     {
 169       /* x[0] , x[1] */
 170       in1 = (q15_t) * px++;
 171       in2 = (q15_t) * px++;
 172       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 173
 174       /* y[srcBLen - 1] , y[srcBLen - 2] */
 175       in1 = (q15_t) * py--;
 176       in2 = (q15_t) * py--;
 177       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 178
 179       /* x[0] * y[srcBLen - 1] */
 180       /* x[1] * y[srcBLen - 2] */
 181       sum = __SMLAD(input1, input2, sum);
 182
 183       /* x[2] , x[3] */
 184       in1 = (q15_t) * px++;
 185       in2 = (q15_t) * px++;
 186       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 187
 188       /* y[srcBLen - 3] , y[srcBLen - 4] */
 189       in1 = (q15_t) * py--;
 190       in2 = (q15_t) * py--;
 191       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 192
 193       /* x[2] * y[srcBLen - 3] */
 194       /* x[3] * y[srcBLen - 4] */
 195       sum = __SMLAD(input1, input2, sum);
 196
 197       /* Decrement the loop counter */
 198       k--;
 199     }
 200
 201     /* If the count is not a multiple of 4, compute any remaining MACs here.
 202      ** No loop unrolling is used. */
 203     k = count % 0x4u;
 204
 205     while(k > 0u)
 206     {
 207       /* Perform the multiply-accumulates */
 208       sum += ((q15_t) * px++ * *py--);
 209
 210       /* Decrement the loop counter */
 211       k--;
 212     }
 213
 214     /* Store the result in the accumulator in the destination buffer. */
 215     *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
 216
 217     /* Update the inputA and inputB pointers for next MAC calculation */
 218     py = pIn2 + count;
 219     px = pIn1;
 220
 221     /* Increment the MAC count */
 222     count++;
 223
 224     /* Decrement the loop counter */
 225     blockSize1--;
 226   }
 227
 228   /* --------------------------
 229    * Initializations of stage2
 230    * ------------------------*/
 231
 232   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 233    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 234    * ....
 235    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 236    */
 237
 238   /* Working pointer of inputA */
 239   px = pIn1;
 240
 241   /* Working pointer of inputB */
 242   pSrc2 = pIn2 + (srcBLen - 1u);
 243   py = pSrc2;
 244
 245   /* count is index by which the pointer pIn1 to be incremented */
 246   count = 1u;
 247
 248   /* -------------------
 249    * Stage2 process
 250    * ------------------*/
 251
 252   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 253    * So, to loop unroll over blockSize2,
 254    * srcBLen should be greater than or equal to 4 */
 255   if(srcBLen >= 4u)
 256   {
 257     /* Loop unroll over blockSize2, by 4 */
 258     blkCnt = blockSize2 >> 2u;
 259
 260     while(blkCnt > 0u)
 261     {
 262       /* Set all accumulators to zero */
 263       acc0 = 0;
 264       acc1 = 0;
 265       acc2 = 0;
 266       acc3 = 0;
 267
 268       /* read x[0], x[1], x[2] samples */
 269       x0 = *(px++);
 270       x1 = *(px++);
 271       x2 = *(px++);
 272
 273       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 274       k = srcBLen >> 2u;
 275
 276       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 277        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 278       do
 279       {
 280         /* Read y[srcBLen - 1] sample */
 281         c0 = *(py--);
 282         /* Read y[srcBLen - 2] sample */
 283         c1 = *(py--);
 284
 285         /* Read x[3] sample */
 286         x3 = *(px++);
 287
 288         /* x[0] and x[1] are packed */
 289         in1 = (q15_t) x0;
 290         in2 = (q15_t) x1;
 291
 292         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 293
 294         /* y[srcBLen - 1]   and y[srcBLen - 2] are packed */
 295         in1 = (q15_t) c0;
 296         in2 = (q15_t) c1;
 297
 298         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 299
 300         /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2]  */
 301         acc0 = __SMLAD(input1, input2, acc0);
 302
 303         /* x[1] and x[2] are packed */
 304         in1 = (q15_t) x1;
 305         in2 = (q15_t) x2;
 306
 307         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 308
 309         /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2]  */
 310         acc1 = __SMLAD(input1, input2, acc1);
 311
 312         /* x[2] and x[3] are packed */
 313         in1 = (q15_t) x2;
 314         in2 = (q15_t) x3;
 315
 316         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 317
 318         /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2]  */
 319         acc2 = __SMLAD(input1, input2, acc2);
 320
 321         /* Read x[4] sample */
 322         x0 = *(px++);
 323
 324         /* x[3] and x[4] are packed */
 325         in1 = (q15_t) x3;
 326         in2 = (q15_t) x0;
 327
 328         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 329
 330         /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2]  */
 331         acc3 = __SMLAD(input1, input2, acc3);
 332
 333         /* Read y[srcBLen - 3] sample */
 334         c0 = *(py--);
 335         /* Read y[srcBLen - 4] sample */
 336         c1 = *(py--);
 337
 338         /* Read x[5] sample */
 339         x1 = *(px++);
 340
 341         /* x[2] and x[3] are packed */
 342         in1 = (q15_t) x2;
 343         in2 = (q15_t) x3;
 344
 345         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 346
 347         /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
 348         in1 = (q15_t) c0;
 349         in2 = (q15_t) c1;
 350
 351         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 352
 353         /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4]  */
 354         acc0 = __SMLAD(input1, input2, acc0);
 355
 356         /* x[3] and x[4] are packed */
 357         in1 = (q15_t) x3;
 358         in2 = (q15_t) x0;
 359
 360         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 361
 362         /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4]  */
 363         acc1 = __SMLAD(input1, input2, acc1);
 364
 365         /* x[4] and x[5] are packed */
 366         in1 = (q15_t) x0;
 367         in2 = (q15_t) x1;
 368
 369         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 370
 371         /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4]  */
 372         acc2 = __SMLAD(input1, input2, acc2);
 373
 374         /* Read x[6] sample */
 375         x2 = *(px++);
 376
 377         /* x[5] and x[6] are packed */
 378         in1 = (q15_t) x1;
 379         in2 = (q15_t) x2;
 380
 381         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 382
 383         /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4]  */
 384         acc3 = __SMLAD(input1, input2, acc3);
 385
 386       } while(--k);
 387
 388       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 389        ** No loop unrolling is used. */
 390       k = srcBLen % 0x4u;
 391
 392       while(k > 0u)
 393       {
 394         /* Read y[srcBLen - 5] sample */
 395         c0 = *(py--);
 396
 397         /* Read x[7] sample */
 398         x3 = *(px++);
 399
 400         /* Perform the multiply-accumulates */
 401         /* acc0 +=  x[4] * y[srcBLen - 5] */
 402         acc0 += ((q15_t) x0 * c0);
 403         /* acc1 +=  x[5] * y[srcBLen - 5] */
 404         acc1 += ((q15_t) x1 * c0);
 405         /* acc2 +=  x[6] * y[srcBLen - 5] */
 406         acc2 += ((q15_t) x2 * c0);
 407         /* acc3 +=  x[7] * y[srcBLen - 5] */
 408         acc3 += ((q15_t) x3 * c0);
 409
 410         /* Reuse the present samples for the next MAC */
 411         x0 = x1;
 412         x1 = x2;
 413         x2 = x3;
 414
 415         /* Decrement the loop counter */
 416         k--;
 417       }
 418
 419
 420       /* Store the result in the accumulator in the destination buffer. */
 421       *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
 422       *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8));
 423       *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8));
 424       *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8));
 425
 426       /* Update the inputA and inputB pointers for next MAC calculation */
 427       px = pIn1 + (count * 4u);
 428       py = pSrc2;
 429
 430       /* Increment the pointer pIn1 index, count by 1 */
 431       count++;
 432
 433       /* Decrement the loop counter */
 434       blkCnt--;
 435     }
 436
 437     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 438      ** No loop unrolling is used. */
 439     blkCnt = blockSize2 % 0x4u;
 440
 441     while(blkCnt > 0u)
 442     {
 443       /* Accumulator is made zero for every iteration */
 444       sum = 0;
 445
 446       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 447       k = srcBLen >> 2u;
 448
 449       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 450        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 451       while(k > 0u)
 452       {
 453
 454         /* Reading two inputs of SrcA buffer and packing */
 455         in1 = (q15_t) * px++;
 456         in2 = (q15_t) * px++;
 457         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 458
 459         /* Reading two inputs of SrcB buffer and packing */
 460         in1 = (q15_t) * py--;
 461         in2 = (q15_t) * py--;
 462         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 463
 464         /* Perform the multiply-accumulates */
 465         sum = __SMLAD(input1, input2, sum);
 466
 467         /* Reading two inputs of SrcA buffer and packing */
 468         in1 = (q15_t) * px++;
 469         in2 = (q15_t) * px++;
 470         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 471
 472         /* Reading two inputs of SrcB buffer and packing */
 473         in1 = (q15_t) * py--;
 474         in2 = (q15_t) * py--;
 475         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 476
 477         /* Perform the multiply-accumulates */
 478         sum = __SMLAD(input1, input2, sum);
 479
 480         /* Decrement the loop counter */
 481         k--;
 482       }
 483
 484       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 485        ** No loop unrolling is used. */
 486       k = srcBLen % 0x4u;
 487
 488       while(k > 0u)
 489       {
 490         /* Perform the multiply-accumulates */
 491         sum += ((q15_t) * px++ * *py--);
 492
 493         /* Decrement the loop counter */
 494         k--;
 495       }
 496
 497       /* Store the result in the accumulator in the destination buffer. */
 498       *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
 499
 500       /* Update the inputA and inputB pointers for next MAC calculation */
 501       px = pIn1 + count;
 502       py = pSrc2;
 503
 504       /* Increment the pointer pIn1 index, count by 1 */
 505       count++;
 506
 507       /* Decrement the loop counter */
 508       blkCnt--;
 509     }
 510   }
 511   else
 512   {
 513     /* If the srcBLen is not a multiple of 4,
 514      * the blockSize2 loop cannot be unrolled by 4 */
 515     blkCnt = blockSize2;
 516
 517     while(blkCnt > 0u)
 518     {
 519       /* Accumulator is made zero for every iteration */
 520       sum = 0;
 521
 522       /* srcBLen number of MACS should be performed */
 523       k = srcBLen;
 524
 525       while(k > 0u)
 526       {
 527         /* Perform the multiply-accumulate */
 528         sum += ((q15_t) * px++ * *py--);
 529
 530         /* Decrement the loop counter */
 531         k--;
 532       }
 533
 534       /* Store the result in the accumulator in the destination buffer. */
 535       *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
 536
 537       /* Update the inputA and inputB pointers for next MAC calculation */
 538       px = pIn1 + count;
 539       py = pSrc2;
 540
 541       /* Increment the MAC count */
 542       count++;
 543
 544       /* Decrement the loop counter */
 545       blkCnt--;
 546     }
 547   }
 548
 549
 550   /* --------------------------
 551    * Initializations of stage3
 552    * -------------------------*/
 553
 554   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 555    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 556    * ....
 557    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 558    * sum +=  x[srcALen-1] * y[srcBLen-1]
 559    */
 560
 561   /* In this stage the MAC operations are decreased by 1 for every iteration.
 562      The blockSize3 variable holds the number of MAC operations performed */
 563
 564   /* Working pointer of inputA */
 565   pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
 566   px = pSrc1;
 567
 568   /* Working pointer of inputB */
 569   pSrc2 = pIn2 + (srcBLen - 1u);
 570   py = pSrc2;
 571
 572   /* -------------------
 573    * Stage3 process
 574    * ------------------*/
 575
 576   while(blockSize3 > 0u)
 577   {
 578     /* Accumulator is made zero for every iteration */
 579     sum = 0;
 580
 581     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 582     k = blockSize3 >> 2u;
 583
 584     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 585      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 586     while(k > 0u)
 587     {
 588       /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
 589       in1 = (q15_t) * px++;
 590       in2 = (q15_t) * px++;
 591       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 592
 593       /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
 594       in1 = (q15_t) * py--;
 595       in2 = (q15_t) * py--;
 596       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 597
 598       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
 599       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
 600       sum = __SMLAD(input1, input2, sum);
 601
 602       /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
 603       in1 = (q15_t) * px++;
 604       in2 = (q15_t) * px++;
 605       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 606
 607       /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
 608       in1 = (q15_t) * py--;
 609       in2 = (q15_t) * py--;
 610       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
 611
 612       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
 613       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
 614       sum = __SMLAD(input1, input2, sum);
 615
 616       /* Decrement the loop counter */
 617       k--;
 618     }
 619
 620     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
 621      ** No loop unrolling is used. */
 622     k = blockSize3 % 0x4u;
 623
 624     while(k > 0u)
 625     {
 626       /* Perform the multiply-accumulates */
 627       sum += ((q15_t) * px++ * *py--);
 628
 629       /* Decrement the loop counter */
 630       k--;
 631     }
 632
 633     /* Store the result in the accumulator in the destination buffer. */
 634     *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
 635
 636     /* Update the inputA and inputB pointers for next MAC calculation */
 637     px = ++pSrc1;
 638     py = pSrc2;
 639
 640     /* Decrement the loop counter */
 641     blockSize3--;
 642   }
 643
 644 #else
 645
 646   /* Run the below code for Cortex-M0 */
 647
 648   q7_t *pIn1 = pSrcA;                            /* input pointer */
 649   q7_t *pIn2 = pSrcB;                            /* coefficient pointer */
 650   q31_t sum;                                     /* Accumulator */
 651   uint32_t i, j;                                 /* loop counter */
 652
 653   /* Loop to calculate output of convolution for output length number of times */
 654   for (i = 0; i < (srcALen + srcBLen - 1); i++)
 655   {
 656     /* Initialize sum with zero to carry on MAC operations */
 657     sum = 0;
 658
 659     /* Loop to perform MAC operations according to convolution equation */
 660     for (j = 0; j <= i; j++)
 661     {
 662       /* Check the array limitations */
 663       if(((i - j) < srcBLen) && (j < srcALen))
 664       {
 665         /* z[i] += x[i-j] * y[j] */
 666         sum += (q15_t) pIn1[j] * (pIn2[i - j]);
 667       }
 668     }
 669
 670     /* Store the output in the destination buffer */
 671     pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u);
 672   }
 673
 674 #endif /*   #ifndef ARM_MATH_CM0        */
 675
 676 }
 677
 678 /**
 679  * @} end of Conv group
 680  */