git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_correlate_fast_q31.c
   9 *
  10 * Description:  Fast Q31 Correlation.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated.
  28 * -------------------------------------------------------------------- */
  29
  30 #include "arm_math.h"
  31
  32 /**
  33  * @ingroup groupFilters
  34  */
  35
  36 /**
  37  * @addtogroup Corr
  38  * @{
  39  */
  40
  41 /**
  42  * @brief Correlation of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.
  43  * @param[in] *pSrcA points to the first input sequence.
  44  * @param[in] srcALen length of the first input sequence.
  45  * @param[in] *pSrcB points to the second input sequence.
  46  * @param[in] srcBLen length of the second input sequence.
  47  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
  48  * @return none.
  49  *
  50  * @details
  51  * <b>Scaling and Overflow Behavior:</b>
  52  *
  53  * \par
  54  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
  55  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
  56  * These intermediate results are accumulated in a 32-bit register in 2.30 format.
  57  * Finally, the accumulator is saturated and converted to a 1.31 result.
  58  *
  59  * \par
  60  * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
  61  * In order to avoid overflows completely the input signals must be scaled down.
  62  * The input signals should be scaled down to avoid intermediate overflows.
  63  * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a
  64  * maximum of min(srcALen, srcBLen) number of additions is carried internally.
  65  *
  66  * \par
  67  * See <code>arm_correlate_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
  68  */
  69
  70 void arm_correlate_fast_q31(
  71   q31_t * pSrcA,
  72   uint32_t srcALen,
  73   q31_t * pSrcB,
  74   uint32_t srcBLen,
  75   q31_t * pDst)
  76 {
  77   q31_t *pIn1;                                   /* inputA pointer               */
  78   q31_t *pIn2;                                   /* inputB pointer               */
  79   q31_t *pOut = pDst;                            /* output pointer               */
  80   q31_t *px;                                     /* Intermediate inputA pointer  */
  81   q31_t *py;                                     /* Intermediate inputB pointer  */
  82   q31_t *pSrc1;                                  /* Intermediate pointers        */
  83   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
  84   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
  85   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
  86   int32_t inc = 1;                               /* Destination address modifier */
  87
  88
  89   /* The algorithm implementation is based on the lengths of the inputs. */
  90   /* srcB is always made to slide across srcA. */
  91   /* So srcBLen is always considered as shorter or equal to srcALen */
  92   if(srcALen >= srcBLen)
  93   {
  94     /* Initialization of inputA pointer */
  95     pIn1 = (pSrcA);
  96
  97     /* Initialization of inputB pointer */
  98     pIn2 = (pSrcB);
  99
 100     /* Number of output samples is calculated */
 101     outBlockSize = (2u * srcALen) - 1u;
 102
 103     /* When srcALen > srcBLen, zero padding is done to srcB
 104      * to make their lengths equal.
 105      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
 106      * number of output samples are made zero */
 107     j = outBlockSize - (srcALen + (srcBLen - 1u));
 108
 109     /* Updating the pointer position to non zero value */
 110     pOut += j;
 111
 112   }
 113   else
 114   {
 115     /* Initialization of inputA pointer */
 116     pIn1 = (pSrcB);
 117
 118     /* Initialization of inputB pointer */
 119     pIn2 = (pSrcA);
 120
 121     /* srcBLen is always considered as shorter or equal to srcALen */
 122     j = srcBLen;
 123     srcBLen = srcALen;
 124     srcALen = j;
 125
 126     /* CORR(x, y) = Reverse order(CORR(y, x)) */
 127     /* Hence set the destination pointer to point to the last output sample */
 128     pOut = pDst + ((srcALen + srcBLen) - 2u);
 129
 130     /* Destination address modifier is set to -1 */
 131     inc = -1;
 132
 133   }
 134
 135   /* The function is internally
 136    * divided into three parts according to the number of multiplications that has to be
 137    * taken place between inputA samples and inputB samples. In the first part of the
 138    * algorithm, the multiplications increase by one for every iteration.
 139    * In the second part of the algorithm, srcBLen number of multiplications are done.
 140    * In the third part of the algorithm, the multiplications decrease by one
 141    * for every iteration.*/
 142   /* The algorithm is implemented in three stages.
 143    * The loop counters of each stage is initiated here. */
 144   blockSize1 = srcBLen - 1u;
 145   blockSize2 = srcALen - (srcBLen - 1u);
 146   blockSize3 = blockSize1;
 147
 148   /* --------------------------
 149    * Initializations of stage1
 150    * -------------------------*/
 151
 152   /* sum = x[0] * y[srcBlen - 1]
 153    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
 154    * ....
 155    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
 156    */
 157
 158   /* In this stage the MAC operations are increased by 1 for every iteration.
 159      The count variable holds the number of MAC operations performed */
 160   count = 1u;
 161
 162   /* Working pointer of inputA */
 163   px = pIn1;
 164
 165   /* Working pointer of inputB */
 166   pSrc1 = pIn2 + (srcBLen - 1u);
 167   py = pSrc1;
 168
 169   /* ------------------------
 170    * Stage1 process
 171    * ----------------------*/
 172
 173   /* The first stage starts here */
 174   while(blockSize1 > 0u)
 175   {
 176     /* Accumulator is made zero for every iteration */
 177     sum = 0;
 178
 179     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 180     k = count >> 2;
 181
 182     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 183      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 184     while(k > 0u)
 185     {
 186       /* x[0] * y[srcBLen - 4] */
 187       sum = (q31_t) ((((q63_t) sum << 32) +
 188                       ((q63_t) * px++ * (*py++))) >> 32);
 189       /* x[1] * y[srcBLen - 3] */
 190       sum = (q31_t) ((((q63_t) sum << 32) +
 191                       ((q63_t) * px++ * (*py++))) >> 32);
 192       /* x[2] * y[srcBLen - 2] */
 193       sum = (q31_t) ((((q63_t) sum << 32) +
 194                       ((q63_t) * px++ * (*py++))) >> 32);
 195       /* x[3] * y[srcBLen - 1] */
 196       sum = (q31_t) ((((q63_t) sum << 32) +
 197                       ((q63_t) * px++ * (*py++))) >> 32);
 198
 199       /* Decrement the loop counter */
 200       k--;
 201     }
 202
 203     /* If the count is not a multiple of 4, compute any remaining MACs here.
 204      ** No loop unrolling is used. */
 205     k = count % 0x4u;
 206
 207     while(k > 0u)
 208     {
 209       /* Perform the multiply-accumulates */
 210       /* x[0] * y[srcBLen - 1] */
 211       sum = (q31_t) ((((q63_t) sum << 32) +
 212                       ((q63_t) * px++ * (*py++))) >> 32);
 213
 214       /* Decrement the loop counter */
 215       k--;
 216     }
 217
 218     /* Store the result in the accumulator in the destination buffer. */
 219     *pOut = sum << 1;
 220     /* Destination pointer is updated according to the address modifier, inc */
 221     pOut += inc;
 222
 223     /* Update the inputA and inputB pointers for next MAC calculation */
 224     py = pSrc1 - count;
 225     px = pIn1;
 226
 227     /* Increment the MAC count */
 228     count++;
 229
 230     /* Decrement the loop counter */
 231     blockSize1--;
 232   }
 233
 234   /* --------------------------
 235    * Initializations of stage2
 236    * ------------------------*/
 237
 238   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
 239    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
 240    * ....
 241    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 242    */
 243
 244   /* Working pointer of inputA */
 245   px = pIn1;
 246
 247   /* Working pointer of inputB */
 248   py = pIn2;
 249
 250   /* count is index by which the pointer pIn1 to be incremented */
 251   count = 1u;
 252
 253   /* -------------------
 254    * Stage2 process
 255    * ------------------*/
 256
 257   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 258    * So, to loop unroll over blockSize2,
 259    * srcBLen should be greater than or equal to 4 */
 260   if(srcBLen >= 4u)
 261   {
 262     /* Loop unroll over blockSize2, by 4 */
 263     blkCnt = blockSize2 >> 2u;
 264
 265     while(blkCnt > 0u)
 266     {
 267       /* Set all accumulators to zero */
 268       acc0 = 0;
 269       acc1 = 0;
 270       acc2 = 0;
 271       acc3 = 0;
 272
 273       /* read x[0], x[1], x[2] samples */
 274       x0 = *(px++);
 275       x1 = *(px++);
 276       x2 = *(px++);
 277
 278       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 279       k = srcBLen >> 2u;
 280
 281       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 282        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 283       do
 284       {
 285         /* Read y[0] sample */
 286         c0 = *(py++);
 287
 288         /* Read x[3] sample */
 289         x3 = *(px++);
 290
 291         /* Perform the multiply-accumulate */
 292         /* acc0 +=  x[0] * y[0] */
 293         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 294         /* acc1 +=  x[1] * y[0] */
 295         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 296         /* acc2 +=  x[2] * y[0] */
 297         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
 298         /* acc3 +=  x[3] * y[0] */
 299         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
 300
 301         /* Read y[1] sample */
 302         c0 = *(py++);
 303
 304         /* Read x[4] sample */
 305         x0 = *(px++);
 306
 307         /* Perform the multiply-accumulates */
 308         /* acc0 +=  x[1] * y[1] */
 309         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
 310         /* acc1 +=  x[2] * y[1] */
 311         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
 312         /* acc2 +=  x[3] * y[1] */
 313         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
 314         /* acc3 +=  x[4] * y[1] */
 315         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
 316
 317         /* Read y[2] sample */
 318         c0 = *(py++);
 319
 320         /* Read x[5] sample */
 321         x1 = *(px++);
 322
 323         /* Perform the multiply-accumulates */
 324         /* acc0 +=  x[2] * y[2] */
 325         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
 326         /* acc1 +=  x[3] * y[2] */
 327         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
 328         /* acc2 +=  x[4] * y[2] */
 329         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
 330         /* acc3 +=  x[5] * y[2] */
 331         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
 332
 333         /* Read y[3] sample */
 334         c0 = *(py++);
 335
 336         /* Read x[6] sample */
 337         x2 = *(px++);
 338
 339         /* Perform the multiply-accumulates */
 340         /* acc0 +=  x[3] * y[3] */
 341         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
 342         /* acc1 +=  x[4] * y[3] */
 343         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
 344         /* acc2 +=  x[5] * y[3] */
 345         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
 346         /* acc3 +=  x[6] * y[3] */
 347         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
 348
 349
 350       } while(--k);
 351
 352       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 353        ** No loop unrolling is used. */
 354       k = srcBLen % 0x4u;
 355
 356       while(k > 0u)
 357       {
 358         /* Read y[4] sample */
 359         c0 = *(py++);
 360
 361         /* Read x[7] sample */
 362         x3 = *(px++);
 363
 364         /* Perform the multiply-accumulates */
 365         /* acc0 +=  x[4] * y[4] */
 366         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 367         /* acc1 +=  x[5] * y[4] */
 368         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 369         /* acc2 +=  x[6] * y[4] */
 370         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
 371         /* acc3 +=  x[7] * y[4] */
 372         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
 373
 374         /* Reuse the present samples for the next MAC */
 375         x0 = x1;
 376         x1 = x2;
 377         x2 = x3;
 378
 379         /* Decrement the loop counter */
 380         k--;
 381       }
 382
 383       /* Store the result in the accumulator in the destination buffer. */
 384       *pOut = (q31_t) (acc0 << 1);
 385       /* Destination pointer is updated according to the address modifier, inc */
 386       pOut += inc;
 387
 388       *pOut = (q31_t) (acc1 << 1);
 389       pOut += inc;
 390
 391       *pOut = (q31_t) (acc2 << 1);
 392       pOut += inc;
 393
 394       *pOut = (q31_t) (acc3 << 1);
 395       pOut += inc;
 396
 397       /* Update the inputA and inputB pointers for next MAC calculation */
 398       px = pIn1 + (count * 4u);
 399       py = pIn2;
 400
 401       /* Increment the pointer pIn1 index, count by 1 */
 402       count++;
 403
 404       /* Decrement the loop counter */
 405       blkCnt--;
 406     }
 407
 408     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 409      ** No loop unrolling is used. */
 410     blkCnt = blockSize2 % 0x4u;
 411
 412     while(blkCnt > 0u)
 413     {
 414       /* Accumulator is made zero for every iteration */
 415       sum = 0;
 416
 417       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 418       k = srcBLen >> 2u;
 419
 420       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 421        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 422       while(k > 0u)
 423       {
 424         /* Perform the multiply-accumulates */
 425         sum = (q31_t) ((((q63_t) sum << 32) +
 426                         ((q63_t) * px++ * (*py++))) >> 32);
 427         sum = (q31_t) ((((q63_t) sum << 32) +
 428                         ((q63_t) * px++ * (*py++))) >> 32);
 429         sum = (q31_t) ((((q63_t) sum << 32) +
 430                         ((q63_t) * px++ * (*py++))) >> 32);
 431         sum = (q31_t) ((((q63_t) sum << 32) +
 432                         ((q63_t) * px++ * (*py++))) >> 32);
 433
 434         /* Decrement the loop counter */
 435         k--;
 436       }
 437
 438       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 439        ** No loop unrolling is used. */
 440       k = srcBLen % 0x4u;
 441
 442       while(k > 0u)
 443       {
 444         /* Perform the multiply-accumulate */
 445         sum = (q31_t) ((((q63_t) sum << 32) +
 446                         ((q63_t) * px++ * (*py++))) >> 32);
 447
 448         /* Decrement the loop counter */
 449         k--;
 450       }
 451
 452       /* Store the result in the accumulator in the destination buffer. */
 453       *pOut = sum << 1;
 454       /* Destination pointer is updated according to the address modifier, inc */
 455       pOut += inc;
 456
 457       /* Update the inputA and inputB pointers for next MAC calculation */
 458       px = pIn1 + count;
 459       py = pIn2;
 460
 461       /* Increment the MAC count */
 462       count++;
 463
 464       /* Decrement the loop counter */
 465       blkCnt--;
 466     }
 467   }
 468   else
 469   {
 470     /* If the srcBLen is not a multiple of 4,
 471      * the blockSize2 loop cannot be unrolled by 4 */
 472     blkCnt = blockSize2;
 473
 474     while(blkCnt > 0u)
 475     {
 476       /* Accumulator is made zero for every iteration */
 477       sum = 0;
 478
 479       /* Loop over srcBLen */
 480       k = srcBLen;
 481
 482       while(k > 0u)
 483       {
 484         /* Perform the multiply-accumulate */
 485         sum = (q31_t) ((((q63_t) sum << 32) +
 486                         ((q63_t) * px++ * (*py++))) >> 32);
 487
 488         /* Decrement the loop counter */
 489         k--;
 490       }
 491
 492       /* Store the result in the accumulator in the destination buffer. */
 493       *pOut = sum << 1;
 494       /* Destination pointer is updated according to the address modifier, inc */
 495       pOut += inc;
 496
 497       /* Update the inputA and inputB pointers for next MAC calculation */
 498       px = pIn1 + count;
 499       py = pIn2;
 500
 501       /* Increment the MAC count */
 502       count++;
 503
 504       /* Decrement the loop counter */
 505       blkCnt--;
 506     }
 507   }
 508
 509   /* --------------------------
 510    * Initializations of stage3
 511    * -------------------------*/
 512
 513   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 514    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 515    * ....
 516    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
 517    * sum +=  x[srcALen-1] * y[0]
 518    */
 519
 520   /* In this stage the MAC operations are decreased by 1 for every iteration.
 521      The count variable holds the number of MAC operations performed */
 522   count = srcBLen - 1u;
 523
 524   /* Working pointer of inputA */
 525   pSrc1 = ((pIn1 + srcALen) - srcBLen) + 1u;
 526   px = pSrc1;
 527
 528   /* Working pointer of inputB */
 529   py = pIn2;
 530
 531   /* -------------------
 532    * Stage3 process
 533    * ------------------*/
 534
 535   while(blockSize3 > 0u)
 536   {
 537     /* Accumulator is made zero for every iteration */
 538     sum = 0;
 539
 540     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 541     k = count >> 2u;
 542
 543     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 544      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 545     while(k > 0u)
 546     {
 547       /* Perform the multiply-accumulates */
 548       /* sum += x[srcALen - srcBLen + 4] * y[3] */
 549       sum = (q31_t) ((((q63_t) sum << 32) +
 550                       ((q63_t) * px++ * (*py++))) >> 32);
 551       /* sum += x[srcALen - srcBLen + 3] * y[2] */
 552       sum = (q31_t) ((((q63_t) sum << 32) +
 553                       ((q63_t) * px++ * (*py++))) >> 32);
 554       /* sum += x[srcALen - srcBLen + 2] * y[1] */
 555       sum = (q31_t) ((((q63_t) sum << 32) +
 556                       ((q63_t) * px++ * (*py++))) >> 32);
 557       /* sum += x[srcALen - srcBLen + 1] * y[0] */
 558       sum = (q31_t) ((((q63_t) sum << 32) +
 559                       ((q63_t) * px++ * (*py++))) >> 32);
 560
 561       /* Decrement the loop counter */
 562       k--;
 563     }
 564
 565     /* If the count is not a multiple of 4, compute any remaining MACs here.
 566      ** No loop unrolling is used. */
 567     k = count % 0x4u;
 568
 569     while(k > 0u)
 570     {
 571       /* Perform the multiply-accumulates */
 572       sum = (q31_t) ((((q63_t) sum << 32) +
 573                       ((q63_t) * px++ * (*py++))) >> 32);
 574
 575       /* Decrement the loop counter */
 576       k--;
 577     }
 578
 579     /* Store the result in the accumulator in the destination buffer. */
 580     *pOut = sum << 1;
 581     /* Destination pointer is updated according to the address modifier, inc */
 582     pOut += inc;
 583
 584     /* Update the inputA and inputB pointers for next MAC calculation */
 585     px = ++pSrc1;
 586     py = pIn2;
 587
 588     /* Decrement the MAC count */
 589     count--;
 590
 591     /* Decrement the loop counter */
 592     blockSize3--;
 593   }
 594
 595 }
 596
 597 /**
 598  * @} end of Corr group
 599  */