git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_fast_q31.c
   9 *
  10 * Description:  Q31 Convolution (fast version).
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated.
  28 * -------------------------------------------------------------------- */
  29
  30 #include "arm_math.h"
  31
  32 /**
  33  * @ingroup groupFilters
  34  */
  35
  36 /**
  37  * @addtogroup Conv
  38  * @{
  39  */
  40
  41 /**
  42  * @param[in] *pSrcA points to the first input sequence.
  43  * @param[in] srcALen length of the first input sequence.
  44  * @param[in] *pSrcB points to the second input sequence.
  45  * @param[in] srcBLen length of the second input sequence.
  46  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
  47  * @return none.
  48  *
  49  * @details
  50  * <b>Scaling and Overflow Behavior:</b>
  51  *
  52  * \par
  53  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
  54  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
  55  * These intermediate results are accumulated in a 32-bit register in 2.30 format.
  56  * Finally, the accumulator is saturated and converted to a 1.31 result.
  57  *
  58  * \par
  59  * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
  60  * In order to avoid overflows completely the input signals must be scaled down.
  61  * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
  62  * as maximum of min(srcALen, srcBLen) number of additions are carried internally.
  63  *
  64  * \par
  65  * See <code>arm_conv_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
  66  */
  67
  68 void arm_conv_fast_q31(
  69   q31_t * pSrcA,
  70   uint32_t srcALen,
  71   q31_t * pSrcB,
  72   uint32_t srcBLen,
  73   q31_t * pDst)
  74 {
  75   q31_t *pIn1;                                   /* inputA pointer */
  76   q31_t *pIn2;                                   /* inputB pointer */
  77   q31_t *pOut = pDst;                            /* output pointer */
  78   q31_t *px;                                     /* Intermediate inputA pointer  */
  79   q31_t *py;                                     /* Intermediate inputB pointer  */
  80   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
  81   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
  82   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
  83   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counter */
  84
  85
  86   /* The algorithm implementation is based on the lengths of the inputs. */
  87   /* srcB is always made to slide across srcA. */
  88   /* So srcBLen is always considered as shorter or equal to srcALen */
  89   if(srcALen >= srcBLen)
  90   {
  91     /* Initialization of inputA pointer */
  92     pIn1 = pSrcA;
  93
  94     /* Initialization of inputB pointer */
  95     pIn2 = pSrcB;
  96   }
  97   else
  98   {
  99     /* Initialization of inputA pointer */
 100     pIn1 = pSrcB;
 101
 102     /* Initialization of inputB pointer */
 103     pIn2 = pSrcA;
 104
 105     /* srcBLen is always considered as shorter or equal to srcALen */
 106     j = srcBLen;
 107     srcBLen = srcALen;
 108     srcALen = j;
 109   }
 110
 111   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 112   /* The function is internally
 113    * divided into three stages according to the number of multiplications that has to be
 114    * taken place between inputA samples and inputB samples. In the first stage of the
 115    * algorithm, the multiplications increase by one for every iteration.
 116    * In the second stage of the algorithm, srcBLen number of multiplications are done.
 117    * In the third stage of the algorithm, the multiplications decrease by one
 118    * for every iteration. */
 119
 120   /* The algorithm is implemented in three stages.
 121      The loop counters of each stage is initiated here. */
 122   blockSize1 = srcBLen - 1u;
 123   blockSize2 = srcALen - (srcBLen - 1u);
 124   blockSize3 = blockSize1;
 125
 126   /* --------------------------
 127    * Initializations of stage1
 128    * -------------------------*/
 129
 130   /* sum = x[0] * y[0]
 131    * sum = x[0] * y[1] + x[1] * y[0]
 132    * ....
 133    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 134    */
 135
 136   /* In this stage the MAC operations are increased by 1 for every iteration.
 137      The count variable holds the number of MAC operations performed */
 138   count = 1u;
 139
 140   /* Working pointer of inputA */
 141   px = pIn1;
 142
 143   /* Working pointer of inputB */
 144   py = pIn2;
 145
 146
 147   /* ------------------------
 148    * Stage1 process
 149    * ----------------------*/
 150
 151   /* The first stage starts here */
 152   while(blockSize1 > 0u)
 153   {
 154     /* Accumulator is made zero for every iteration */
 155     sum = 0;
 156
 157     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 158     k = count >> 2u;
 159
 160     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 161      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 162     while(k > 0u)
 163     {
 164       /* x[0] * y[srcBLen - 1] */
 165       sum = (q31_t) ((((q63_t) sum << 32) +
 166                       ((q63_t) * px++ * (*py--))) >> 32);
 167
 168       /* x[1] * y[srcBLen - 2] */
 169       sum = (q31_t) ((((q63_t) sum << 32) +
 170                       ((q63_t) * px++ * (*py--))) >> 32);
 171
 172       /* x[2] * y[srcBLen - 3] */
 173       sum = (q31_t) ((((q63_t) sum << 32) +
 174                       ((q63_t) * px++ * (*py--))) >> 32);
 175
 176       /* x[3] * y[srcBLen - 4] */
 177       sum = (q31_t) ((((q63_t) sum << 32) +
 178                       ((q63_t) * px++ * (*py--))) >> 32);
 179
 180       /* Decrement the loop counter */
 181       k--;
 182     }
 183
 184     /* If the count is not a multiple of 4, compute any remaining MACs here.
 185      ** No loop unrolling is used. */
 186     k = count % 0x4u;
 187
 188     while(k > 0u)
 189     {
 190       /* Perform the multiply-accumulate */
 191       sum = (q31_t) ((((q63_t) sum << 32) +
 192                       ((q63_t) * px++ * (*py--))) >> 32);
 193
 194       /* Decrement the loop counter */
 195       k--;
 196     }
 197
 198     /* Store the result in the accumulator in the destination buffer. */
 199     *pOut++ = sum << 1;
 200
 201     /* Update the inputA and inputB pointers for next MAC calculation */
 202     py = pIn2 + count;
 203     px = pIn1;
 204
 205     /* Increment the MAC count */
 206     count++;
 207
 208     /* Decrement the loop counter */
 209     blockSize1--;
 210   }
 211
 212   /* --------------------------
 213    * Initializations of stage2
 214    * ------------------------*/
 215
 216   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 217    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 218    * ....
 219    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 220    */
 221
 222   /* Working pointer of inputA */
 223   px = pIn1;
 224
 225   /* Working pointer of inputB */
 226   pSrc2 = pIn2 + (srcBLen - 1u);
 227   py = pSrc2;
 228
 229   /* count is index by which the pointer pIn1 to be incremented */
 230   count = 1u;
 231
 232   /* -------------------
 233    * Stage2 process
 234    * ------------------*/
 235
 236   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 237    * So, to loop unroll over blockSize2,
 238    * srcBLen should be greater than or equal to 4 */
 239   if(srcBLen >= 4u)
 240   {
 241     /* Loop unroll over blockSize2, by 4 */
 242     blkCnt = blockSize2 >> 2u;
 243
 244     while(blkCnt > 0u)
 245     {
 246       /* Set all accumulators to zero */
 247       acc0 = 0;
 248       acc1 = 0;
 249       acc2 = 0;
 250       acc3 = 0;
 251
 252       /* read x[0], x[1], x[2] samples */
 253       x0 = *(px++);
 254       x1 = *(px++);
 255       x2 = *(px++);
 256
 257       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 258       k = srcBLen >> 2u;
 259
 260       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 261        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 262       do
 263       {
 264         /* Read y[srcBLen - 1] sample */
 265         c0 = *(py--);
 266
 267         /* Read x[3] sample */
 268         x3 = *(px++);
 269
 270         /* Perform the multiply-accumulates */
 271         /* acc0 +=  x[0] * y[srcBLen - 1] */
 272         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 273
 274         /* acc1 +=  x[1] * y[srcBLen - 1] */
 275         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 276
 277         /* acc2 +=  x[2] * y[srcBLen - 1] */
 278         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
 279
 280         /* acc3 +=  x[3] * y[srcBLen - 1] */
 281         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
 282
 283         /* Read y[srcBLen - 2] sample */
 284         c0 = *(py--);
 285
 286         /* Read x[4] sample */
 287         x0 = *(px++);
 288
 289         /* Perform the multiply-accumulate */
 290         /* acc0 +=  x[1] * y[srcBLen - 2] */
 291         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
 292         /* acc1 +=  x[2] * y[srcBLen - 2] */
 293         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
 294         /* acc2 +=  x[3] * y[srcBLen - 2] */
 295         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
 296         /* acc3 +=  x[4] * y[srcBLen - 2] */
 297         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
 298
 299         /* Read y[srcBLen - 3] sample */
 300         c0 = *(py--);
 301
 302         /* Read x[5] sample */
 303         x1 = *(px++);
 304
 305         /* Perform the multiply-accumulates */
 306         /* acc0 +=  x[2] * y[srcBLen - 3] */
 307         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
 308         /* acc1 +=  x[3] * y[srcBLen - 2] */
 309         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
 310         /* acc2 +=  x[4] * y[srcBLen - 2] */
 311         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
 312         /* acc3 +=  x[5] * y[srcBLen - 2] */
 313         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
 314
 315         /* Read y[srcBLen - 4] sample */
 316         c0 = *(py--);
 317
 318         /* Read x[6] sample */
 319         x2 = *(px++);
 320
 321         /* Perform the multiply-accumulates */
 322         /* acc0 +=  x[3] * y[srcBLen - 4] */
 323         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
 324         /* acc1 +=  x[4] * y[srcBLen - 4] */
 325         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
 326         /* acc2 +=  x[5] * y[srcBLen - 4] */
 327         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
 328         /* acc3 +=  x[6] * y[srcBLen - 4] */
 329         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
 330
 331
 332       } while(--k);
 333
 334       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 335        ** No loop unrolling is used. */
 336       k = srcBLen % 0x4u;
 337
 338       while(k > 0u)
 339       {
 340         /* Read y[srcBLen - 5] sample */
 341         c0 = *(py--);
 342
 343         /* Read x[7] sample */
 344         x3 = *(px++);
 345
 346         /* Perform the multiply-accumulates */
 347         /* acc0 +=  x[4] * y[srcBLen - 5] */
 348         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 349         /* acc1 +=  x[5] * y[srcBLen - 5] */
 350         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 351         /* acc2 +=  x[6] * y[srcBLen - 5] */
 352         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
 353         /* acc3 +=  x[7] * y[srcBLen - 5] */
 354         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
 355
 356         /* Reuse the present samples for the next MAC */
 357         x0 = x1;
 358         x1 = x2;
 359         x2 = x3;
 360
 361         /* Decrement the loop counter */
 362         k--;
 363       }
 364
 365       /* Store the results in the accumulators in the destination buffer. */
 366       *pOut++ = (q31_t) (acc0 << 1);
 367       *pOut++ = (q31_t) (acc1 << 1);
 368       *pOut++ = (q31_t) (acc2 << 1);
 369       *pOut++ = (q31_t) (acc3 << 1);
 370
 371       /* Update the inputA and inputB pointers for next MAC calculation */
 372       px = pIn1 + (count * 4u);
 373       py = pSrc2;
 374
 375       /* Increment the pointer pIn1 index, count by 1 */
 376       count++;
 377
 378       /* Decrement the loop counter */
 379       blkCnt--;
 380     }
 381
 382     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 383      ** No loop unrolling is used. */
 384     blkCnt = blockSize2 % 0x4u;
 385
 386     while(blkCnt > 0u)
 387     {
 388       /* Accumulator is made zero for every iteration */
 389       sum = 0;
 390
 391       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 392       k = srcBLen >> 2u;
 393
 394       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 395        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 396       while(k > 0u)
 397       {
 398         /* Perform the multiply-accumulates */
 399         sum = (q31_t) ((((q63_t) sum << 32) +
 400                         ((q63_t) * px++ * (*py--))) >> 32);
 401         sum = (q31_t) ((((q63_t) sum << 32) +
 402                         ((q63_t) * px++ * (*py--))) >> 32);
 403         sum = (q31_t) ((((q63_t) sum << 32) +
 404                         ((q63_t) * px++ * (*py--))) >> 32);
 405         sum = (q31_t) ((((q63_t) sum << 32) +
 406                         ((q63_t) * px++ * (*py--))) >> 32);
 407
 408         /* Decrement the loop counter */
 409         k--;
 410       }
 411
 412       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 413        ** No loop unrolling is used. */
 414       k = srcBLen % 0x4u;
 415
 416       while(k > 0u)
 417       {
 418         /* Perform the multiply-accumulate */
 419         sum = (q31_t) ((((q63_t) sum << 32) +
 420                         ((q63_t) * px++ * (*py--))) >> 32);
 421
 422         /* Decrement the loop counter */
 423         k--;
 424       }
 425
 426       /* Store the result in the accumulator in the destination buffer. */
 427       *pOut++ = sum << 1;
 428
 429       /* Update the inputA and inputB pointers for next MAC calculation */
 430       px = pIn1 + count;
 431       py = pSrc2;
 432
 433       /* Increment the MAC count */
 434       count++;
 435
 436       /* Decrement the loop counter */
 437       blkCnt--;
 438     }
 439   }
 440   else
 441   {
 442     /* If the srcBLen is not a multiple of 4,
 443      * the blockSize2 loop cannot be unrolled by 4 */
 444     blkCnt = blockSize2;
 445
 446     while(blkCnt > 0u)
 447     {
 448       /* Accumulator is made zero for every iteration */
 449       sum = 0;
 450
 451       /* srcBLen number of MACS should be performed */
 452       k = srcBLen;
 453
 454       while(k > 0u)
 455       {
 456         /* Perform the multiply-accumulate */
 457         sum = (q31_t) ((((q63_t) sum << 32) +
 458                         ((q63_t) * px++ * (*py--))) >> 32);
 459
 460         /* Decrement the loop counter */
 461         k--;
 462       }
 463
 464       /* Store the result in the accumulator in the destination buffer. */
 465       *pOut++ = sum << 1;
 466
 467       /* Update the inputA and inputB pointers for next MAC calculation */
 468       px = pIn1 + count;
 469       py = pSrc2;
 470
 471       /* Increment the MAC count */
 472       count++;
 473
 474       /* Decrement the loop counter */
 475       blkCnt--;
 476     }
 477   }
 478
 479
 480   /* --------------------------
 481    * Initializations of stage3
 482    * -------------------------*/
 483
 484   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 485    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 486    * ....
 487    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 488    * sum +=  x[srcALen-1] * y[srcBLen-1]
 489    */
 490
 491   /* In this stage the MAC operations are decreased by 1 for every iteration.
 492      The blockSize3 variable holds the number of MAC operations performed */
 493
 494   /* Working pointer of inputA */
 495   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 496   px = pSrc1;
 497
 498   /* Working pointer of inputB */
 499   pSrc2 = pIn2 + (srcBLen - 1u);
 500   py = pSrc2;
 501
 502   /* -------------------
 503    * Stage3 process
 504    * ------------------*/
 505
 506   while(blockSize3 > 0u)
 507   {
 508     /* Accumulator is made zero for every iteration */
 509     sum = 0;
 510
 511     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 512     k = blockSize3 >> 2u;
 513
 514     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 515      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 516     while(k > 0u)
 517     {
 518       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
 519       sum = (q31_t) ((((q63_t) sum << 32) +
 520                       ((q63_t) * px++ * (*py--))) >> 32);
 521
 522       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
 523       sum = (q31_t) ((((q63_t) sum << 32) +
 524                       ((q63_t) * px++ * (*py--))) >> 32);
 525
 526       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
 527       sum = (q31_t) ((((q63_t) sum << 32) +
 528                       ((q63_t) * px++ * (*py--))) >> 32);
 529
 530       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
 531       sum = (q31_t) ((((q63_t) sum << 32) +
 532                       ((q63_t) * px++ * (*py--))) >> 32);
 533
 534       /* Decrement the loop counter */
 535       k--;
 536     }
 537
 538     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
 539      ** No loop unrolling is used. */
 540     k = blockSize3 % 0x4u;
 541
 542     while(k > 0u)
 543     {
 544       /* Perform the multiply-accumulate */
 545       sum = (q31_t) ((((q63_t) sum << 32) +
 546                       ((q63_t) * px++ * (*py--))) >> 32);
 547
 548       /* Decrement the loop counter */
 549       k--;
 550     }
 551
 552     /* Store the result in the accumulator in the destination buffer. */
 553     *pOut++ = sum << 1;
 554
 555     /* Update the inputA and inputB pointers for next MAC calculation */
 556     px = ++pSrc1;
 557     py = pSrc2;
 558
 559     /* Decrement the loop counter */
 560     blockSize3--;
 561   }
 562
 563 }
 564
 565 /**
 566  * @} end of Conv group
 567  */