git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_correlate_fast_q15.c
   9 *
  10 * Description:  Fast Q15 Correlation.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated.
  28 * -------------------------------------------------------------------- */
  29
  30 #include "arm_math.h"
  31
  32 /**
  33  * @ingroup groupFilters
  34  */
  35
  36 /**
  37  * @addtogroup Corr
  38  * @{
  39  */
  40
  41 /**
  42  * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
  43  * @param[in] *pSrcA points to the first input sequence.
  44  * @param[in] srcALen length of the first input sequence.
  45  * @param[in] *pSrcB points to the second input sequence.
  46  * @param[in] srcBLen length of the second input sequence.
  47  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
  48  * @return none.
  49  *
  50  * <b>Scaling and Overflow Behavior:</b>
  51  *
  52  * \par
  53  * This fast version uses a 32-bit accumulator with 2.30 format.
  54  * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
  55  * There is no saturation on intermediate additions.
  56  * Thus, if the accumulator overflows it wraps around and distorts the result.
  57  * The input signals should be scaled down to avoid intermediate overflows.
  58  * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
  59  * maximum of min(srcALen, srcBLen) number of additions is carried internally.
  60  * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
  61  *
  62  * \par
  63  * See <code>arm_correlate_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
  64  */
  65
  66 void arm_correlate_fast_q15(
  67   q15_t * pSrcA,
  68   uint32_t srcALen,
  69   q15_t * pSrcB,
  70   uint32_t srcBLen,
  71   q15_t * pDst)
  72 {
  73   q15_t *pIn1;                                   /* inputA pointer               */
  74   q15_t *pIn2;                                   /* inputB pointer               */
  75   q15_t *pOut = pDst;                            /* output pointer               */
  76   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
  77   q15_t *px;                                     /* Intermediate inputA pointer  */
  78   q15_t *py;                                     /* Intermediate inputB pointer  */
  79   q15_t *pSrc1;                                  /* Intermediate pointers        */
  80   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
  81   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
  82   int32_t inc = 1;                               /* Destination address modifier */
  83   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */
  84
  85
  86   /* The algorithm implementation is based on the lengths of the inputs. */
  87   /* srcB is always made to slide across srcA. */
  88   /* So srcBLen is always considered as shorter or equal to srcALen */
  89   /* But CORR(x, y) is reverse of CORR(y, x) */
  90   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
  91   /* and the destination pointer modifier, inc is set to -1 */
  92   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
  93   /* But to improve the performance,
  94    * we include zeroes in the output instead of zero padding either of the the inputs*/
  95   /* If srcALen > srcBLen,
  96    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
  97   /* If srcALen < srcBLen,
  98    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
  99   if(srcALen >= srcBLen)
 100   {
 101     /* Initialization of inputA pointer */
 102     pIn1 = (pSrcA);
 103
 104     /* Initialization of inputB pointer */
 105     pIn2 = (pSrcB);
 106
 107     /* Number of output samples is calculated */
 108     outBlockSize = (2u * srcALen) - 1u;
 109
 110     /* When srcALen > srcBLen, zero padding is done to srcB
 111      * to make their lengths equal.
 112      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
 113      * number of output samples are made zero */
 114     j = outBlockSize - (srcALen + (srcBLen - 1u));
 115
 116     /* Updating the pointer position to non zero value */
 117     pOut += j;
 118
 119   }
 120   else
 121   {
 122     /* Initialization of inputA pointer */
 123     pIn1 = (pSrcB);
 124
 125     /* Initialization of inputB pointer */
 126     pIn2 = (pSrcA);
 127
 128     /* srcBLen is always considered as shorter or equal to srcALen */
 129     j = srcBLen;
 130     srcBLen = srcALen;
 131     srcALen = j;
 132
 133     /* CORR(x, y) = Reverse order(CORR(y, x)) */
 134     /* Hence set the destination pointer to point to the last output sample */
 135     pOut = pDst + ((srcALen + srcBLen) - 2u);
 136
 137     /* Destination address modifier is set to -1 */
 138     inc = -1;
 139
 140   }
 141
 142   /* The function is internally
 143    * divided into three parts according to the number of multiplications that has to be
 144    * taken place between inputA samples and inputB samples. In the first part of the
 145    * algorithm, the multiplications increase by one for every iteration.
 146    * In the second part of the algorithm, srcBLen number of multiplications are done.
 147    * In the third part of the algorithm, the multiplications decrease by one
 148    * for every iteration.*/
 149   /* The algorithm is implemented in three stages.
 150    * The loop counters of each stage is initiated here. */
 151   blockSize1 = srcBLen - 1u;
 152   blockSize2 = srcALen - (srcBLen - 1u);
 153   blockSize3 = blockSize1;
 154
 155   /* --------------------------
 156    * Initializations of stage1
 157    * -------------------------*/
 158
 159   /* sum = x[0] * y[srcBlen - 1]
 160    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
 161    * ....
 162    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
 163    */
 164
 165   /* In this stage the MAC operations are increased by 1 for every iteration.
 166      The count variable holds the number of MAC operations performed */
 167   count = 1u;
 168
 169   /* Working pointer of inputA */
 170   px = pIn1;
 171
 172   /* Working pointer of inputB */
 173   pSrc1 = pIn2 + (srcBLen - 1u);
 174   py = pSrc1;
 175
 176   /* ------------------------
 177    * Stage1 process
 178    * ----------------------*/
 179
 180   /* The first loop starts here */
 181   while(blockSize1 > 0u)
 182   {
 183     /* Accumulator is made zero for every iteration */
 184     sum = 0;
 185
 186     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 187     k = count >> 2;
 188
 189     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 190      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 191     while(k > 0u)
 192     {
 193       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
 194       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 195       /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
 196       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 197
 198       /* Decrement the loop counter */
 199       k--;
 200     }
 201
 202     /* If the count is not a multiple of 4, compute any remaining MACs here.
 203      ** No loop unrolling is used. */
 204     k = count % 0x4u;
 205
 206     while(k > 0u)
 207     {
 208       /* Perform the multiply-accumulates */
 209       /* x[0] * y[srcBLen - 1] */
 210       sum = __SMLAD(*px++, *py++, sum);
 211
 212       /* Decrement the loop counter */
 213       k--;
 214     }
 215
 216     /* Store the result in the accumulator in the destination buffer. */
 217     *pOut = (q15_t) (sum >> 15);
 218     /* Destination pointer is updated according to the address modifier, inc */
 219     pOut += inc;
 220
 221     /* Update the inputA and inputB pointers for next MAC calculation */
 222     py = pSrc1 - count;
 223     px = pIn1;
 224
 225     /* Increment the MAC count */
 226     count++;
 227
 228     /* Decrement the loop counter */
 229     blockSize1--;
 230   }
 231
 232   /* --------------------------
 233    * Initializations of stage2
 234    * ------------------------*/
 235
 236   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
 237    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
 238    * ....
 239    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 240    */
 241
 242   /* Working pointer of inputA */
 243   px = pIn1;
 244
 245   /* Working pointer of inputB */
 246   py = pIn2;
 247
 248   /* Initialize inputB pointer of type q31 */
 249   pb = (q31_t *) (py);
 250
 251   /* count is index by which the pointer pIn1 to be incremented */
 252   count = 0u;
 253
 254   /* -------------------
 255    * Stage2 process
 256    * ------------------*/
 257
 258   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 259    * So, to loop unroll over blockSize2,
 260    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
 261   if(srcBLen >= 4u)
 262   {
 263     /* Loop unroll over blockSize2, by 4 */
 264     blkCnt = blockSize2 >> 2u;
 265
 266     while(blkCnt > 0u)
 267     {
 268       /* Set all accumulators to zero */
 269       acc0 = 0;
 270       acc1 = 0;
 271       acc2 = 0;
 272       acc3 = 0;
 273
 274       /* read x[0], x[1] samples */
 275       x0 = *(q31_t *) (px++);
 276       /* read x[1], x[2] samples */
 277       x1 = *(q31_t *) (px++);
 278
 279       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 280       k = srcBLen >> 2u;
 281
 282       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 283        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 284       do
 285       {
 286         /* Read the first two inputB samples using SIMD:
 287          * y[0] and y[1] */
 288         c0 = *(pb++);
 289
 290         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */
 291         acc0 = __SMLAD(x0, c0, acc0);
 292
 293         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */
 294         acc1 = __SMLAD(x1, c0, acc1);
 295
 296         /* Read x[2], x[3] */
 297         x2 = *(q31_t *) (px++);
 298
 299         /* Read x[3], x[4] */
 300         x3 = *(q31_t *) (px++);
 301
 302         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */
 303         acc2 = __SMLAD(x2, c0, acc2);
 304
 305         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */
 306         acc3 = __SMLAD(x3, c0, acc3);
 307
 308         /* Read y[2] and y[3] */
 309         c0 = *(pb++);
 310
 311         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */
 312         acc0 = __SMLAD(x2, c0, acc0);
 313
 314         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */
 315         acc1 = __SMLAD(x3, c0, acc1);
 316
 317         /* Read x[4], x[5] */
 318         x0 = *(q31_t *) (px++);
 319
 320         /* Read x[5], x[6] */
 321         x1 = *(q31_t *) (px++);
 322
 323         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */
 324         acc2 = __SMLAD(x0, c0, acc2);
 325
 326         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */
 327         acc3 = __SMLAD(x1, c0, acc3);
 328
 329       } while(--k);
 330
 331       /* For the next MAC operations, SIMD is not used
 332        * So, the 16 bit pointer if inputB, py is updated */
 333       py = (q15_t *) (pb);
 334
 335       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 336        ** No loop unrolling is used. */
 337       k = srcBLen % 0x4u;
 338
 339       if(k == 1u)
 340       {
 341         /* Read y[4] */
 342         c0 = *py;
 343 #ifdef  ARM_MATH_BIG_ENDIAN
 344
 345         c0 = c0 << 16u;
 346
 347 #else
 348
 349         c0 = c0 & 0x0000FFFF;
 350
 351 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 352
 353         /* Read x[7] */
 354         x3 = *(q31_t *) px++;
 355
 356         /* Perform the multiply-accumulates */
 357         acc0 = __SMLAD(x0, c0, acc0);
 358         acc1 = __SMLAD(x1, c0, acc1);
 359         acc2 = __SMLADX(x1, c0, acc2);
 360         acc3 = __SMLADX(x3, c0, acc3);
 361       }
 362
 363       if(k == 2u)
 364       {
 365         /* Read y[4], y[5] */
 366         c0 = *(pb);
 367
 368         /* Read x[7], x[8] */
 369         x3 = *(q31_t *) px++;
 370
 371         /* Read x[9] */
 372         x2 = *(q31_t *) px++;
 373
 374         /* Perform the multiply-accumulates */
 375         acc0 = __SMLAD(x0, c0, acc0);
 376         acc1 = __SMLAD(x1, c0, acc1);
 377         acc2 = __SMLAD(x3, c0, acc2);
 378         acc3 = __SMLAD(x2, c0, acc3);
 379       }
 380
 381       if(k == 3u)
 382       {
 383         /* Read y[4], y[5] */
 384         c0 = *pb++;
 385
 386         /* Read x[7], x[8] */
 387         x3 = *(q31_t *) px++;
 388
 389         /* Read x[9] */
 390         x2 = *(q31_t *) px++;
 391
 392         /* Perform the multiply-accumulates */
 393         acc0 = __SMLAD(x0, c0, acc0);
 394         acc1 = __SMLAD(x1, c0, acc1);
 395         acc2 = __SMLAD(x3, c0, acc2);
 396         acc3 = __SMLAD(x2, c0, acc3);
 397
 398         /* Read y[6] */
 399 #ifdef  ARM_MATH_BIG_ENDIAN
 400         c0 = (*pb);
 401         c0 = c0 & 0xFFFF0000;
 402
 403 #else
 404         c0 = (q15_t) (*pb);
 405         c0 = c0 & 0x0000FFFF;
 406
 407 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 408
 409         /* Read x[10] */
 410         x3 = *(q31_t *) px++;
 411
 412         /* Perform the multiply-accumulates */
 413         acc0 = __SMLADX(x1, c0, acc0);
 414         acc1 = __SMLAD(x2, c0, acc1);
 415         acc2 = __SMLADX(x2, c0, acc2);
 416         acc3 = __SMLADX(x3, c0, acc3);
 417       }
 418
 419       /* Store the result in the accumulator in the destination buffer. */
 420       *pOut = (q15_t) (acc0 >> 15);
 421       /* Destination pointer is updated according to the address modifier, inc */
 422       pOut += inc;
 423
 424       *pOut = (q15_t) (acc1 >> 15);
 425       pOut += inc;
 426
 427       *pOut = (q15_t) (acc2 >> 15);
 428       pOut += inc;
 429
 430       *pOut = (q15_t) (acc3 >> 15);
 431       pOut += inc;
 432
 433       /* Increment the pointer pIn1 index, count by 1 */
 434       count += 4u;
 435
 436       /* Update the inputA and inputB pointers for next MAC calculation */
 437       px = pIn1 + count;
 438       py = pIn2;
 439       pb = (q31_t *) (py);
 440
 441
 442       /* Decrement the loop counter */
 443       blkCnt--;
 444     }
 445
 446     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 447      ** No loop unrolling is used. */
 448     blkCnt = blockSize2 % 0x4u;
 449
 450     while(blkCnt > 0u)
 451     {
 452       /* Accumulator is made zero for every iteration */
 453       sum = 0;
 454
 455       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 456       k = srcBLen >> 2u;
 457
 458       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 459        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 460       while(k > 0u)
 461       {
 462         /* Perform the multiply-accumulates */
 463         sum += ((q31_t) * px++ * *py++);
 464         sum += ((q31_t) * px++ * *py++);
 465         sum += ((q31_t) * px++ * *py++);
 466         sum += ((q31_t) * px++ * *py++);
 467
 468         /* Decrement the loop counter */
 469         k--;
 470       }
 471
 472       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 473        ** No loop unrolling is used. */
 474       k = srcBLen % 0x4u;
 475
 476       while(k > 0u)
 477       {
 478         /* Perform the multiply-accumulates */
 479         sum += ((q31_t) * px++ * *py++);
 480
 481         /* Decrement the loop counter */
 482         k--;
 483       }
 484
 485       /* Store the result in the accumulator in the destination buffer. */
 486       *pOut = (q15_t) (sum >> 15);
 487       /* Destination pointer is updated according to the address modifier, inc */
 488       pOut += inc;
 489
 490       /* Increment the pointer pIn1 index, count by 1 */
 491       count++;
 492
 493       /* Update the inputA and inputB pointers for next MAC calculation */
 494       px = pIn1 + count;
 495       py = pIn2;
 496
 497       /* Decrement the loop counter */
 498       blkCnt--;
 499     }
 500   }
 501   else
 502   {
 503     /* If the srcBLen is not a multiple of 4,
 504      * the blockSize2 loop cannot be unrolled by 4 */
 505     blkCnt = blockSize2;
 506
 507     while(blkCnt > 0u)
 508     {
 509       /* Accumulator is made zero for every iteration */
 510       sum = 0;
 511
 512       /* Loop over srcBLen */
 513       k = srcBLen;
 514
 515       while(k > 0u)
 516       {
 517         /* Perform the multiply-accumulate */
 518         sum += ((q31_t) * px++ * *py++);
 519
 520         /* Decrement the loop counter */
 521         k--;
 522       }
 523
 524       /* Store the result in the accumulator in the destination buffer. */
 525       *pOut = (q15_t) (sum >> 15);
 526       /* Destination pointer is updated according to the address modifier, inc */
 527       pOut += inc;
 528
 529       /* Increment the MAC count */
 530       count++;
 531
 532       /* Update the inputA and inputB pointers for next MAC calculation */
 533       px = pIn1 + count;
 534       py = pIn2;
 535
 536       /* Decrement the loop counter */
 537       blkCnt--;
 538     }
 539   }
 540
 541   /* --------------------------
 542    * Initializations of stage3
 543    * -------------------------*/
 544
 545   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 546    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 547    * ....
 548    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
 549    * sum +=  x[srcALen-1] * y[0]
 550    */
 551
 552   /* In this stage the MAC operations are decreased by 1 for every iteration.
 553      The count variable holds the number of MAC operations performed */
 554   count = srcBLen - 1u;
 555
 556   /* Working pointer of inputA */
 557   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 558   px = pSrc1;
 559
 560   /* Working pointer of inputB */
 561   py = pIn2;
 562
 563   /* -------------------
 564    * Stage3 process
 565    * ------------------*/
 566
 567   while(blockSize3 > 0u)
 568   {
 569     /* Accumulator is made zero for every iteration */
 570     sum = 0;
 571
 572     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 573     k = count >> 2u;
 574
 575     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 576      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 577     while(k > 0u)
 578     {
 579       /* Perform the multiply-accumulates */
 580       /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
 581       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 582       /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
 583       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 584
 585       /* Decrement the loop counter */
 586       k--;
 587     }
 588
 589     /* If the count is not a multiple of 4, compute any remaining MACs here.
 590      ** No loop unrolling is used. */
 591     k = count % 0x4u;
 592
 593     while(k > 0u)
 594     {
 595       /* Perform the multiply-accumulates */
 596       sum = __SMLAD(*px++, *py++, sum);
 597
 598       /* Decrement the loop counter */
 599       k--;
 600     }
 601
 602     /* Store the result in the accumulator in the destination buffer. */
 603     *pOut = (q15_t) (sum >> 15);
 604     /* Destination pointer is updated according to the address modifier, inc */
 605     pOut += inc;
 606
 607     /* Update the inputA and inputB pointers for next MAC calculation */
 608     px = ++pSrc1;
 609     py = pIn2;
 610
 611     /* Decrement the MAC count */
 612     count--;
 613
 614     /* Decrement the loop counter */
 615     blockSize3--;
 616   }
 617
 618 }
 619
 620 /**
 621  * @} end of Corr group
 622  */