git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q7.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_correlate_q7.c
   9 *
  10 * Description:  Correlation of Q7 sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated
  28 *
  29 * Version 0.0.7  2010/06/10
  30 *    Misra-C changes done
  31 *
  32 * -------------------------------------------------------------------- */
  33
  34 #include "arm_math.h"
  35
  36 /**
  37  * @ingroup groupFilters
  38  */
  39
  40 /**
  41  * @addtogroup Corr
  42  * @{
  43  */
  44
  45 /**
  46  * @brief Correlation of Q7 sequences.
  47  * @param[in] *pSrcA points to the first input sequence.
  48  * @param[in] srcALen length of the first input sequence.
  49  * @param[in] *pSrcB points to the second input sequence.
  50  * @param[in] srcBLen length of the second input sequence.
  51  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
  52  * @return none.
  53  *
  54  * @details
  55  * <b>Scaling and Overflow Behavior:</b>
  56  *
  57  * \par
  58  * The function is implemented using a 32-bit internal accumulator.
  59  * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
  60  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
  61  * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
  62  * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
  63  */
  64
  65 void arm_correlate_q7(
  66   q7_t * pSrcA,
  67   uint32_t srcALen,
  68   q7_t * pSrcB,
  69   uint32_t srcBLen,
  70   q7_t * pDst)
  71 {
  72
  73
  74 #ifndef ARM_MATH_CM0
  75
  76   /* Run the below code for Cortex-M4 and Cortex-M3 */
  77
  78   q7_t *pIn1;                                    /* inputA pointer               */
  79   q7_t *pIn2;                                    /* inputB pointer               */
  80   q7_t *pOut = pDst;                             /* output pointer               */
  81   q7_t *px;                                      /* Intermediate inputA pointer  */
  82   q7_t *py;                                      /* Intermediate inputB pointer  */
  83   q7_t *pSrc1;                                   /* Intermediate pointers        */
  84   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
  85   q31_t input1, input2;                          /* temporary variables */
  86   q15_t in1, in2;                                /* temporary variables */
  87   q7_t x0, x1, x2, x3, c0, c1;                   /* temporary variables for holding input and coefficient values */
  88   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
  89   int32_t inc = 1;
  90
  91
  92   /* The algorithm implementation is based on the lengths of the inputs. */
  93   /* srcB is always made to slide across srcA. */
  94   /* So srcBLen is always considered as shorter or equal to srcALen */
  95   /* But CORR(x, y) is reverse of CORR(y, x) */
  96   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
  97   /* and the destination pointer modifier, inc is set to -1 */
  98   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
  99   /* But to improve the performance,
 100    * we include zeroes in the output instead of zero padding either of the the inputs*/
 101   /* If srcALen > srcBLen,
 102    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
 103   /* If srcALen < srcBLen,
 104    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
 105   if(srcALen >= srcBLen)
 106   {
 107     /* Initialization of inputA pointer */
 108     pIn1 = (pSrcA);
 109
 110     /* Initialization of inputB pointer */
 111     pIn2 = (pSrcB);
 112
 113     /* Number of output samples is calculated */
 114     outBlockSize = (2u * srcALen) - 1u;
 115
 116     /* When srcALen > srcBLen, zero padding is done to srcB
 117      * to make their lengths equal.
 118      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
 119      * number of output samples are made zero */
 120     j = outBlockSize - (srcALen + (srcBLen - 1u));
 121
 122     /* Updating the pointer position to non zero value */
 123     pOut += j;
 124
 125   }
 126   else
 127   {
 128     /* Initialization of inputA pointer */
 129     pIn1 = (pSrcB);
 130
 131     /* Initialization of inputB pointer */
 132     pIn2 = (pSrcA);
 133
 134     /* srcBLen is always considered as shorter or equal to srcALen */
 135     j = srcBLen;
 136     srcBLen = srcALen;
 137     srcALen = j;
 138
 139     /* CORR(x, y) = Reverse order(CORR(y, x)) */
 140     /* Hence set the destination pointer to point to the last output sample */
 141     pOut = pDst + ((srcALen + srcBLen) - 2u);
 142
 143     /* Destination address modifier is set to -1 */
 144     inc = -1;
 145
 146   }
 147
 148   /* The function is internally
 149    * divided into three parts according to the number of multiplications that has to be
 150    * taken place between inputA samples and inputB samples. In the first part of the
 151    * algorithm, the multiplications increase by one for every iteration.
 152    * In the second part of the algorithm, srcBLen number of multiplications are done.
 153    * In the third part of the algorithm, the multiplications decrease by one
 154    * for every iteration.*/
 155   /* The algorithm is implemented in three stages.
 156    * The loop counters of each stage is initiated here. */
 157   blockSize1 = srcBLen - 1u;
 158   blockSize2 = srcALen - (srcBLen - 1u);
 159   blockSize3 = blockSize1;
 160
 161   /* --------------------------
 162    * Initializations of stage1
 163    * -------------------------*/
 164
 165   /* sum = x[0] * y[srcBlen - 1]
 166    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
 167    * ....
 168    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
 169    */
 170
 171   /* In this stage the MAC operations are increased by 1 for every iteration.
 172      The count variable holds the number of MAC operations performed */
 173   count = 1u;
 174
 175   /* Working pointer of inputA */
 176   px = pIn1;
 177
 178   /* Working pointer of inputB */
 179   pSrc1 = pIn2 + (srcBLen - 1u);
 180   py = pSrc1;
 181
 182   /* ------------------------
 183    * Stage1 process
 184    * ----------------------*/
 185
 186   /* The first stage starts here */
 187   while(blockSize1 > 0u)
 188   {
 189     /* Accumulator is made zero for every iteration */
 190     sum = 0;
 191
 192     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 193     k = count >> 2;
 194
 195     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 196      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 197     while(k > 0u)
 198     {
 199       /* x[0] , x[1] */
 200       in1 = (q15_t) * px++;
 201       in2 = (q15_t) * px++;
 202       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 203
 204       /* y[srcBLen - 4] , y[srcBLen - 3] */
 205       in1 = (q15_t) * py++;
 206       in2 = (q15_t) * py++;
 207       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 208
 209       /* x[0] * y[srcBLen - 4] */
 210       /* x[1] * y[srcBLen - 3] */
 211       sum = __SMLAD(input1, input2, sum);
 212
 213       /* x[2] , x[3] */
 214       in1 = (q15_t) * px++;
 215       in2 = (q15_t) * px++;
 216       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 217
 218       /* y[srcBLen - 2] , y[srcBLen - 1] */
 219       in1 = (q15_t) * py++;
 220       in2 = (q15_t) * py++;
 221       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 222
 223       /* x[2] * y[srcBLen - 2] */
 224       /* x[3] * y[srcBLen - 1] */
 225       sum = __SMLAD(input1, input2, sum);
 226
 227
 228       /* Decrement the loop counter */
 229       k--;
 230     }
 231
 232     /* If the count is not a multiple of 4, compute any remaining MACs here.
 233      ** No loop unrolling is used. */
 234     k = count % 0x4u;
 235
 236     while(k > 0u)
 237     {
 238       /* Perform the multiply-accumulates */
 239       /* x[0] * y[srcBLen - 1] */
 240       sum += (q31_t) ((q15_t) * px++ * *py++);
 241
 242       /* Decrement the loop counter */
 243       k--;
 244     }
 245
 246     /* Store the result in the accumulator in the destination buffer. */
 247     *pOut = (q7_t) (__SSAT(sum >> 7, 8));
 248     /* Destination pointer is updated according to the address modifier, inc */
 249     pOut += inc;
 250
 251     /* Update the inputA and inputB pointers for next MAC calculation */
 252     py = pSrc1 - count;
 253     px = pIn1;
 254
 255     /* Increment the MAC count */
 256     count++;
 257
 258     /* Decrement the loop counter */
 259     blockSize1--;
 260   }
 261
 262   /* --------------------------
 263    * Initializations of stage2
 264    * ------------------------*/
 265
 266   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
 267    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
 268    * ....
 269    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 270    */
 271
 272   /* Working pointer of inputA */
 273   px = pIn1;
 274
 275   /* Working pointer of inputB */
 276   py = pIn2;
 277
 278   /* count is index by which the pointer pIn1 to be incremented */
 279   count = 1u;
 280
 281   /* -------------------
 282    * Stage2 process
 283    * ------------------*/
 284
 285   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 286    * So, to loop unroll over blockSize2,
 287    * srcBLen should be greater than or equal to 4 */
 288   if(srcBLen >= 4u)
 289   {
 290     /* Loop unroll over blockSize2, by 4 */
 291     blkCnt = blockSize2 >> 2u;
 292
 293     while(blkCnt > 0u)
 294     {
 295       /* Set all accumulators to zero */
 296       acc0 = 0;
 297       acc1 = 0;
 298       acc2 = 0;
 299       acc3 = 0;
 300
 301       /* read x[0], x[1], x[2] samples */
 302       x0 = *px++;
 303       x1 = *px++;
 304       x2 = *px++;
 305
 306       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 307       k = srcBLen >> 2u;
 308
 309       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 310        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 311       do
 312       {
 313         /* Read y[0] sample */
 314         c0 = *py++;
 315         /* Read y[1] sample */
 316         c1 = *py++;
 317
 318         /* Read x[3] sample */
 319         x3 = *px++;
 320
 321         /* x[0] and x[1] are packed */
 322         in1 = (q15_t) x0;
 323         in2 = (q15_t) x1;
 324
 325         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 326
 327         /* y[0] and y[1] are packed */
 328         in1 = (q15_t) c0;
 329         in2 = (q15_t) c1;
 330
 331         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 332
 333         /* acc0 += x[0] * y[0] + x[1] * y[1]  */
 334         acc0 = __SMLAD(input1, input2, acc0);
 335
 336         /* x[1] and x[2] are packed */
 337         in1 = (q15_t) x1;
 338         in2 = (q15_t) x2;
 339
 340         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 341
 342         /* acc1 += x[1] * y[0] + x[2] * y[1] */
 343         acc1 = __SMLAD(input1, input2, acc1);
 344
 345         /* x[2] and x[3] are packed */
 346         in1 = (q15_t) x2;
 347         in2 = (q15_t) x3;
 348
 349         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 350
 351         /* acc2 += x[2] * y[0] + x[3] * y[1]  */
 352         acc2 = __SMLAD(input1, input2, acc2);
 353
 354         /* Read x[4] sample */
 355         x0 = *(px++);
 356
 357         /* x[3] and x[4] are packed */
 358         in1 = (q15_t) x3;
 359         in2 = (q15_t) x0;
 360
 361         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 362
 363         /* acc3 += x[3] * y[0] + x[4] * y[1]  */
 364         acc3 = __SMLAD(input1, input2, acc3);
 365
 366         /* Read y[2] sample */
 367         c0 = *py++;
 368         /* Read y[3] sample */
 369         c1 = *py++;
 370
 371         /* Read x[5] sample */
 372         x1 = *px++;
 373
 374         /* x[2] and x[3] are packed */
 375         in1 = (q15_t) x2;
 376         in2 = (q15_t) x3;
 377
 378         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 379
 380         /* y[2] and y[3] are packed */
 381         in1 = (q15_t) c0;
 382         in2 = (q15_t) c1;
 383
 384         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 385
 386         /* acc0 += x[2] * y[2] + x[3] * y[3]  */
 387         acc0 = __SMLAD(input1, input2, acc0);
 388
 389         /* x[3] and x[4] are packed */
 390         in1 = (q15_t) x3;
 391         in2 = (q15_t) x0;
 392
 393         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 394
 395         /* acc1 += x[3] * y[2] + x[4] * y[3]  */
 396         acc1 = __SMLAD(input1, input2, acc1);
 397
 398         /* x[4] and x[5] are packed */
 399         in1 = (q15_t) x0;
 400         in2 = (q15_t) x1;
 401
 402         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 403
 404         /* acc2 += x[4] * y[2] + x[5] * y[3]  */
 405         acc2 = __SMLAD(input1, input2, acc2);
 406
 407         /* Read x[6] sample */
 408         x2 = *px++;
 409
 410         /* x[5] and x[6] are packed */
 411         in1 = (q15_t) x1;
 412         in2 = (q15_t) x2;
 413
 414         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 415
 416         /* acc3 += x[5] * y[2] + x[6] * y[3]  */
 417         acc3 = __SMLAD(input1, input2, acc3);
 418
 419       } while(--k);
 420
 421       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 422        ** No loop unrolling is used. */
 423       k = srcBLen % 0x4u;
 424
 425       while(k > 0u)
 426       {
 427         /* Read y[4] sample */
 428         c0 = *py++;
 429
 430         /* Read x[7] sample */
 431         x3 = *px++;
 432
 433         /* Perform the multiply-accumulates */
 434         /* acc0 +=  x[4] * y[4] */
 435         acc0 += ((q15_t) x0 * c0);
 436         /* acc1 +=  x[5] * y[4] */
 437         acc1 += ((q15_t) x1 * c0);
 438         /* acc2 +=  x[6] * y[4] */
 439         acc2 += ((q15_t) x2 * c0);
 440         /* acc3 +=  x[7] * y[4] */
 441         acc3 += ((q15_t) x3 * c0);
 442
 443         /* Reuse the present samples for the next MAC */
 444         x0 = x1;
 445         x1 = x2;
 446         x2 = x3;
 447
 448         /* Decrement the loop counter */
 449         k--;
 450       }
 451
 452       /* Store the result in the accumulator in the destination buffer. */
 453       *pOut = (q7_t) (__SSAT(acc0 >> 7, 8));
 454       /* Destination pointer is updated according to the address modifier, inc */
 455       pOut += inc;
 456
 457       *pOut = (q7_t) (__SSAT(acc1 >> 7, 8));
 458       pOut += inc;
 459
 460       *pOut = (q7_t) (__SSAT(acc2 >> 7, 8));
 461       pOut += inc;
 462
 463       *pOut = (q7_t) (__SSAT(acc3 >> 7, 8));
 464       pOut += inc;
 465
 466       /* Update the inputA and inputB pointers for next MAC calculation */
 467       px = pIn1 + (count * 4u);
 468       py = pIn2;
 469
 470       /* Increment the pointer pIn1 index, count by 1 */
 471       count++;
 472
 473       /* Decrement the loop counter */
 474       blkCnt--;
 475     }
 476
 477     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 478      ** No loop unrolling is used. */
 479     blkCnt = blockSize2 % 0x4u;
 480
 481     while(blkCnt > 0u)
 482     {
 483       /* Accumulator is made zero for every iteration */
 484       sum = 0;
 485
 486       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 487       k = srcBLen >> 2u;
 488
 489       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 490        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 491       while(k > 0u)
 492       {
 493         /* Reading two inputs of SrcA buffer and packing */
 494         in1 = (q15_t) * px++;
 495         in2 = (q15_t) * px++;
 496         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 497
 498         /* Reading two inputs of SrcB buffer and packing */
 499         in1 = (q15_t) * py++;
 500         in2 = (q15_t) * py++;
 501         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 502
 503         /* Perform the multiply-accumulates */
 504         sum = __SMLAD(input1, input2, sum);
 505
 506         /* Reading two inputs of SrcA buffer and packing */
 507         in1 = (q15_t) * px++;
 508         in2 = (q15_t) * px++;
 509         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 510
 511         /* Reading two inputs of SrcB buffer and packing */
 512         in1 = (q15_t) * py++;
 513         in2 = (q15_t) * py++;
 514         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 515
 516         /* Perform the multiply-accumulates */
 517         sum = __SMLAD(input1, input2, sum);
 518
 519         /* Decrement the loop counter */
 520         k--;
 521       }
 522
 523       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 524        ** No loop unrolling is used. */
 525       k = srcBLen % 0x4u;
 526
 527       while(k > 0u)
 528       {
 529         /* Perform the multiply-accumulates */
 530         sum += ((q15_t) * px++ * *py++);
 531
 532         /* Decrement the loop counter */
 533         k--;
 534       }
 535
 536       /* Store the result in the accumulator in the destination buffer. */
 537       *pOut = (q7_t) (__SSAT(sum >> 7, 8));
 538       /* Destination pointer is updated according to the address modifier, inc */
 539       pOut += inc;
 540
 541       /* Update the inputA and inputB pointers for next MAC calculation */
 542       px = pIn1 + count;
 543       py = pIn2;
 544
 545       /* Increment the pointer pIn1 index, count by 1 */
 546       count++;
 547
 548       /* Decrement the loop counter */
 549       blkCnt--;
 550     }
 551   }
 552   else
 553   {
 554     /* If the srcBLen is not a multiple of 4,
 555      * the blockSize2 loop cannot be unrolled by 4 */
 556     blkCnt = blockSize2;
 557
 558     while(blkCnt > 0u)
 559     {
 560       /* Accumulator is made zero for every iteration */
 561       sum = 0;
 562
 563       /* Loop over srcBLen */
 564       k = srcBLen;
 565
 566       while(k > 0u)
 567       {
 568         /* Perform the multiply-accumulate */
 569         sum += ((q15_t) * px++ * *py++);
 570
 571         /* Decrement the loop counter */
 572         k--;
 573       }
 574
 575       /* Store the result in the accumulator in the destination buffer. */
 576       *pOut = (q7_t) (__SSAT(sum >> 7, 8));
 577       /* Destination pointer is updated according to the address modifier, inc */
 578       pOut += inc;
 579
 580       /* Update the inputA and inputB pointers for next MAC calculation */
 581       px = pIn1 + count;
 582       py = pIn2;
 583
 584       /* Increment the MAC count */
 585       count++;
 586
 587       /* Decrement the loop counter */
 588       blkCnt--;
 589     }
 590   }
 591
 592   /* --------------------------
 593    * Initializations of stage3
 594    * -------------------------*/
 595
 596   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 597    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 598    * ....
 599    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
 600    * sum +=  x[srcALen-1] * y[0]
 601    */
 602
 603   /* In this stage the MAC operations are decreased by 1 for every iteration.
 604      The count variable holds the number of MAC operations performed */
 605   count = srcBLen - 1u;
 606
 607   /* Working pointer of inputA */
 608   pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
 609   px = pSrc1;
 610
 611   /* Working pointer of inputB */
 612   py = pIn2;
 613
 614   /* -------------------
 615    * Stage3 process
 616    * ------------------*/
 617
 618   while(blockSize3 > 0u)
 619   {
 620     /* Accumulator is made zero for every iteration */
 621     sum = 0;
 622
 623     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 624     k = count >> 2u;
 625
 626     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 627      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 628     while(k > 0u)
 629     {
 630       /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2]  */
 631       in1 = (q15_t) * px++;
 632       in2 = (q15_t) * px++;
 633       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 634
 635       /* y[0] , y[1] */
 636       in1 = (q15_t) * py++;
 637       in2 = (q15_t) * py++;
 638       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 639
 640       /* sum += x[srcALen - srcBLen + 1] * y[0] */
 641       /* sum += x[srcALen - srcBLen + 2] * y[1] */
 642       sum = __SMLAD(input1, input2, sum);
 643
 644       /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */
 645       in1 = (q15_t) * px++;
 646       in2 = (q15_t) * px++;
 647       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 648
 649       /* y[2] , y[3] */
 650       in1 = (q15_t) * py++;
 651       in2 = (q15_t) * py++;
 652       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
 653
 654       /* sum += x[srcALen - srcBLen + 3] * y[2] */
 655       /* sum += x[srcALen - srcBLen + 4] * y[3] */
 656       sum = __SMLAD(input1, input2, sum);
 657
 658       /* Decrement the loop counter */
 659       k--;
 660     }
 661
 662     /* If the count is not a multiple of 4, compute any remaining MACs here.
 663      ** No loop unrolling is used. */
 664     k = count % 0x4u;
 665
 666     while(k > 0u)
 667     {
 668       /* Perform the multiply-accumulates */
 669       sum += ((q15_t) * px++ * *py++);
 670
 671       /* Decrement the loop counter */
 672       k--;
 673     }
 674
 675     /* Store the result in the accumulator in the destination buffer. */
 676     *pOut = (q7_t) (__SSAT(sum >> 7, 8));
 677     /* Destination pointer is updated according to the address modifier, inc */
 678     pOut += inc;
 679
 680     /* Update the inputA and inputB pointers for next MAC calculation */
 681     px = ++pSrc1;
 682     py = pIn2;
 683
 684     /* Decrement the MAC count */
 685     count--;
 686
 687     /* Decrement the loop counter */
 688     blockSize3--;
 689   }
 690
 691 #else
 692
 693 /* Run the below code for Cortex-M0 */
 694
 695   q7_t *pIn1 = pSrcA;                            /* inputA pointer */
 696   q7_t *pIn2 = pSrcB + (srcBLen - 1u);           /* inputB pointer */
 697   q31_t sum;                                     /* Accumulator */
 698   uint32_t i = 0u, j;                            /* loop counters */
 699   uint32_t inv = 0u;                             /* Reverse order flag */
 700   uint32_t tot = 0u;                             /* Length */
 701
 702   /* The algorithm implementation is based on the lengths of the inputs. */
 703   /* srcB is always made to slide across srcA. */
 704   /* So srcBLen is always considered as shorter or equal to srcALen */
 705   /* But CORR(x, y) is reverse of CORR(y, x) */
 706   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 707   /* and a varaible, inv is set to 1 */
 708   /* If lengths are not equal then zero pad has to be done to  make the two
 709    * inputs of same length. But to improve the performance, we include zeroes
 710    * in the output instead of zero padding either of the the inputs*/
 711   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
 712    * starting of the output buffer */
 713   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
 714    * ending of the output buffer */
 715   /* Once the zero padding is done the remaining of the output is calcualted
 716    * using convolution but with the shorter signal time shifted. */
 717
 718   /* Calculate the length of the remaining sequence */
 719   tot = ((srcALen + srcBLen) - 2u);
 720
 721   if(srcALen > srcBLen)
 722   {
 723     /* Calculating the number of zeros to be padded to the output */
 724     j = srcALen - srcBLen;
 725
 726     /* Initialise the pointer after zero padding */
 727     pDst += j;
 728   }
 729
 730   else if(srcALen < srcBLen)
 731   {
 732     /* Initialization to inputB pointer */
 733     pIn1 = pSrcB;
 734
 735     /* Initialization to the end of inputA pointer */
 736     pIn2 = pSrcA + (srcALen - 1u);
 737
 738     /* Initialisation of the pointer after zero padding */
 739     pDst = pDst + tot;
 740
 741     /* Swapping the lengths */
 742     j = srcALen;
 743     srcALen = srcBLen;
 744     srcBLen = j;
 745
 746     /* Setting the reverse flag */
 747     inv = 1;
 748
 749   }
 750
 751   /* Loop to calculate convolution for output length number of times */
 752   for (i = 0u; i <= tot; i++)
 753   {
 754     /* Initialize sum with zero to carry on MAC operations */
 755     sum = 0;
 756
 757     /* Loop to perform MAC operations according to convolution equation */
 758     for (j = 0u; j <= i; j++)
 759     {
 760       /* Check the array limitations */
 761       if((((i - j) < srcBLen) && (j < srcALen)))
 762       {
 763         /* z[i] += x[i-j] * y[j] */
 764         sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
 765       }
 766     }
 767     /* Store the output in the destination buffer */
 768     if(inv == 1)
 769       *pDst-- = (q7_t) __SSAT((sum >> 7u), 8u);
 770     else
 771       *pDst++ = (q7_t) __SSAT((sum >> 7u), 8u);
 772   }
 773
 774 #endif /*   #ifndef ARM_MATH_CM0 */
 775
 776 }
 777
 778 /**
 779  * @} end of Corr group
 780  */