git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_f32.c

   1 /* ----------------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_f32.c
   9 *
  10 * Description:  Convolution of floating-point sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated
  28 *
  29 * Version 0.0.7  2010/06/10
  30 *    Misra-C changes done
  31 *
  32 * -------------------------------------------------------------------------- */
  33
  34 #include "arm_math.h"
  35
  36 /**
  37  * @ingroup groupFilters
  38  */
  39
  40 /**
  41  * @defgroup Conv Convolution
  42  *
  43  * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
  44  * Convolution is similar to correlation and is frequently used in filtering and data analysis.
  45  * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
  46  * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3.
  47  *
  48  * \par Algorithm
  49  * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
  50  * Then the convolution
  51  *
  52  * <pre>
  53  *                   c[n] = a[n] * b[n]
  54  * </pre>
  55  *
  56  * \par
  57  * is defined as
  58  * \image html ConvolutionEquation.gif
  59  * \par
  60  * Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.
  61  * <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and
  62  * <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
  63  * The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.
  64  *
  65  * \par
  66  * Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,
  67  * the signal <code>b[n]</code> slides over <code>a[n]</code>.
  68  * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
  69  *
  70  * \par
  71  * Note that convolution is a commutative operation:
  72  *
  73  * <pre>
  74  *                   a[n] * b[n] = b[n] * a[n].
  75  * </pre>
  76  *
  77  * \par
  78  * This means that switching the A and B arguments to the convolution functions has no effect.
  79  *
  80  * <b>Fixed-Point Behavior</b>
  81  *
  82  * \par
  83  * Convolution requires summing up a large number of intermediate products.
  84  * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
  85  * Refer to the function specific documentation below for further details of the particular algorithm used.
  86  */
  87
  88 /**
  89  * @addtogroup Conv
  90  * @{
  91  */
  92
  93 /**
  94  * @brief Convolution of floating-point sequences.
  95  * @param[in] *pSrcA points to the first input sequence.
  96  * @param[in] srcALen length of the first input sequence.
  97  * @param[in] *pSrcB points to the second input sequence.
  98  * @param[in] srcBLen length of the second input sequence.
  99  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
 100  * @return none.
 101  */
 102
 103 void arm_conv_f32(
 104   float32_t * pSrcA,
 105   uint32_t srcALen,
 106   float32_t * pSrcB,
 107   uint32_t srcBLen,
 108   float32_t * pDst)
 109 {
 110
 111
 112 #ifndef ARM_MATH_CM0
 113
 114   /* Run the below code for Cortex-M4 and Cortex-M3 */
 115
 116   float32_t *pIn1;                               /* inputA pointer */
 117   float32_t *pIn2;                               /* inputB pointer */
 118   float32_t *pOut = pDst;                        /* output pointer */
 119   float32_t *px;                                 /* Intermediate inputA pointer */
 120   float32_t *py;                                 /* Intermediate inputB pointer */
 121   float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */
 122   float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulator */
 123   float32_t x0, x1, x2, x3, c0;                  /* Temporary variables to hold state and coefficient values */
 124   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counters */
 125
 126   /* The algorithm implementation is based on the lengths of the inputs. */
 127   /* srcB is always made to slide across srcA. */
 128   /* So srcBLen is always considered as shorter or equal to srcALen */
 129   if(srcALen >= srcBLen)
 130   {
 131     /* Initialization of inputA pointer */
 132     pIn1 = pSrcA;
 133
 134     /* Initialization of inputB pointer */
 135     pIn2 = pSrcB;
 136   }
 137   else
 138   {
 139     /* Initialization of inputA pointer */
 140     pIn1 = pSrcB;
 141
 142     /* Initialization of inputB pointer */
 143     pIn2 = pSrcA;
 144
 145     /* srcBLen is always considered as shorter or equal to srcALen */
 146     j = srcBLen;
 147     srcBLen = srcALen;
 148     srcALen = j;
 149   }
 150
 151   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 152   /* The function is internally
 153    * divided into three stages according to the number of multiplications that has to be
 154    * taken place between inputA samples and inputB samples. In the first stage of the
 155    * algorithm, the multiplications increase by one for every iteration.
 156    * In the second stage of the algorithm, srcBLen number of multiplications are done.
 157    * In the third stage of the algorithm, the multiplications decrease by one
 158    * for every iteration. */
 159
 160   /* The algorithm is implemented in three stages.
 161      The loop counters of each stage is initiated here. */
 162   blockSize1 = srcBLen - 1u;
 163   blockSize2 = srcALen - (srcBLen - 1u);
 164   blockSize3 = blockSize1;
 165
 166   /* --------------------------
 167    * initializations of stage1
 168    * -------------------------*/
 169
 170   /* sum = x[0] * y[0]
 171    * sum = x[0] * y[1] + x[1] * y[0]
 172    * ....
 173    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 174    */
 175
 176   /* In this stage the MAC operations are increased by 1 for every iteration.
 177      The count variable holds the number of MAC operations performed */
 178   count = 1u;
 179
 180   /* Working pointer of inputA */
 181   px = pIn1;
 182
 183   /* Working pointer of inputB */
 184   py = pIn2;
 185
 186
 187   /* ------------------------
 188    * Stage1 process
 189    * ----------------------*/
 190
 191   /* The first stage starts here */
 192   while(blockSize1 > 0u)
 193   {
 194     /* Accumulator is made zero for every iteration */
 195     sum = 0.0f;
 196
 197     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 198     k = count >> 2u;
 199
 200     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 201      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 202     while(k > 0u)
 203     {
 204       /* x[0] * y[srcBLen - 1] */
 205       sum += *px++ * *py--;
 206
 207       /* x[1] * y[srcBLen - 2] */
 208       sum += *px++ * *py--;
 209
 210       /* x[2] * y[srcBLen - 3] */
 211       sum += *px++ * *py--;
 212
 213       /* x[3] * y[srcBLen - 4] */
 214       sum += *px++ * *py--;
 215
 216       /* Decrement the loop counter */
 217       k--;
 218     }
 219
 220     /* If the count is not a multiple of 4, compute any remaining MACs here.
 221      ** No loop unrolling is used. */
 222     k = count % 0x4u;
 223
 224     while(k > 0u)
 225     {
 226       /* Perform the multiply-accumulate */
 227       sum += *px++ * *py--;
 228
 229       /* Decrement the loop counter */
 230       k--;
 231     }
 232
 233     /* Store the result in the accumulator in the destination buffer. */
 234     *pOut++ = sum;
 235
 236     /* Update the inputA and inputB pointers for next MAC calculation */
 237     py = pIn2 + count;
 238     px = pIn1;
 239
 240     /* Increment the MAC count */
 241     count++;
 242
 243     /* Decrement the loop counter */
 244     blockSize1--;
 245   }
 246
 247   /* --------------------------
 248    * Initializations of stage2
 249    * ------------------------*/
 250
 251   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 252    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 253    * ....
 254    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 255    */
 256
 257   /* Working pointer of inputA */
 258   px = pIn1;
 259
 260   /* Working pointer of inputB */
 261   pSrc2 = pIn2 + (srcBLen - 1u);
 262   py = pSrc2;
 263
 264   /* count is index by which the pointer pIn1 to be incremented */
 265   count = 1u;
 266
 267   /* -------------------
 268    * Stage2 process
 269    * ------------------*/
 270
 271   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 272    * So, to loop unroll over blockSize2,
 273    * srcBLen should be greater than or equal to 4 */
 274   if(srcBLen >= 4u)
 275   {
 276     /* Loop unroll over blockSize2, by 4 */
 277     blkCnt = blockSize2 >> 2u;
 278
 279     while(blkCnt > 0u)
 280     {
 281       /* Set all accumulators to zero */
 282       acc0 = 0.0f;
 283       acc1 = 0.0f;
 284       acc2 = 0.0f;
 285       acc3 = 0.0f;
 286
 287       /* read x[0], x[1], x[2] samples */
 288       x0 = *(px++);
 289       x1 = *(px++);
 290       x2 = *(px++);
 291
 292       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 293       k = srcBLen >> 2u;
 294
 295       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 296        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 297       do
 298       {
 299         /* Read y[srcBLen - 1] sample */
 300         c0 = *(py--);
 301
 302         /* Read x[3] sample */
 303         x3 = *(px++);
 304
 305         /* Perform the multiply-accumulate */
 306         /* acc0 +=  x[0] * y[srcBLen - 1] */
 307         acc0 += x0 * c0;
 308
 309         /* acc1 +=  x[1] * y[srcBLen - 1] */
 310         acc1 += x1 * c0;
 311
 312         /* acc2 +=  x[2] * y[srcBLen - 1] */
 313         acc2 += x2 * c0;
 314
 315         /* acc3 +=  x[3] * y[srcBLen - 1] */
 316         acc3 += x3 * c0;
 317
 318         /* Read y[srcBLen - 2] sample */
 319         c0 = *(py--);
 320
 321         /* Read x[4] sample */
 322         x0 = *(px++);
 323
 324         /* Perform the multiply-accumulate */
 325         /* acc0 +=  x[1] * y[srcBLen - 2] */
 326         acc0 += x1 * c0;
 327         /* acc1 +=  x[2] * y[srcBLen - 2] */
 328         acc1 += x2 * c0;
 329         /* acc2 +=  x[3] * y[srcBLen - 2] */
 330         acc2 += x3 * c0;
 331         /* acc3 +=  x[4] * y[srcBLen - 2] */
 332         acc3 += x0 * c0;
 333
 334         /* Read y[srcBLen - 3] sample */
 335         c0 = *(py--);
 336
 337         /* Read x[5] sample */
 338         x1 = *(px++);
 339
 340         /* Perform the multiply-accumulates */
 341         /* acc0 +=  x[2] * y[srcBLen - 3] */
 342         acc0 += x2 * c0;
 343         /* acc1 +=  x[3] * y[srcBLen - 2] */
 344         acc1 += x3 * c0;
 345         /* acc2 +=  x[4] * y[srcBLen - 2] */
 346         acc2 += x0 * c0;
 347         /* acc3 +=  x[5] * y[srcBLen - 2] */
 348         acc3 += x1 * c0;
 349
 350         /* Read y[srcBLen - 4] sample */
 351         c0 = *(py--);
 352
 353         /* Read x[6] sample */
 354         x2 = *(px++);
 355
 356         /* Perform the multiply-accumulates */
 357         /* acc0 +=  x[3] * y[srcBLen - 4] */
 358         acc0 += x3 * c0;
 359         /* acc1 +=  x[4] * y[srcBLen - 4] */
 360         acc1 += x0 * c0;
 361         /* acc2 +=  x[5] * y[srcBLen - 4] */
 362         acc2 += x1 * c0;
 363         /* acc3 +=  x[6] * y[srcBLen - 4] */
 364         acc3 += x2 * c0;
 365
 366
 367       } while(--k);
 368
 369       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 370        ** No loop unrolling is used. */
 371       k = srcBLen % 0x4u;
 372
 373       while(k > 0u)
 374       {
 375         /* Read y[srcBLen - 5] sample */
 376         c0 = *(py--);
 377
 378         /* Read x[7] sample */
 379         x3 = *(px++);
 380
 381         /* Perform the multiply-accumulates */
 382         /* acc0 +=  x[4] * y[srcBLen - 5] */
 383         acc0 += x0 * c0;
 384         /* acc1 +=  x[5] * y[srcBLen - 5] */
 385         acc1 += x1 * c0;
 386         /* acc2 +=  x[6] * y[srcBLen - 5] */
 387         acc2 += x2 * c0;
 388         /* acc3 +=  x[7] * y[srcBLen - 5] */
 389         acc3 += x3 * c0;
 390
 391         /* Reuse the present samples for the next MAC */
 392         x0 = x1;
 393         x1 = x2;
 394         x2 = x3;
 395
 396         /* Decrement the loop counter */
 397         k--;
 398       }
 399
 400       /* Store the result in the accumulator in the destination buffer. */
 401       *pOut++ = acc0;
 402       *pOut++ = acc1;
 403       *pOut++ = acc2;
 404       *pOut++ = acc3;
 405
 406       /* Update the inputA and inputB pointers for next MAC calculation */
 407       px = pIn1 + (count * 4u);
 408       py = pSrc2;
 409
 410       /* Increment the pointer pIn1 index, count by 1 */
 411       count++;
 412
 413       /* Decrement the loop counter */
 414       blkCnt--;
 415     }
 416
 417     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 418      ** No loop unrolling is used. */
 419     blkCnt = blockSize2 % 0x4u;
 420
 421     while(blkCnt > 0u)
 422     {
 423       /* Accumulator is made zero for every iteration */
 424       sum = 0.0f;
 425
 426       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 427       k = srcBLen >> 2u;
 428
 429       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 430        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 431       while(k > 0u)
 432       {
 433         /* Perform the multiply-accumulates */
 434         sum += *px++ * *py--;
 435         sum += *px++ * *py--;
 436         sum += *px++ * *py--;
 437         sum += *px++ * *py--;
 438
 439         /* Decrement the loop counter */
 440         k--;
 441       }
 442
 443       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 444        ** No loop unrolling is used. */
 445       k = srcBLen % 0x4u;
 446
 447       while(k > 0u)
 448       {
 449         /* Perform the multiply-accumulate */
 450         sum += *px++ * *py--;
 451
 452         /* Decrement the loop counter */
 453         k--;
 454       }
 455
 456       /* Store the result in the accumulator in the destination buffer. */
 457       *pOut++ = sum;
 458
 459       /* Update the inputA and inputB pointers for next MAC calculation */
 460       px = pIn1 + count;
 461       py = pSrc2;
 462
 463       /* Increment the MAC count */
 464       count++;
 465
 466       /* Decrement the loop counter */
 467       blkCnt--;
 468     }
 469   }
 470   else
 471   {
 472     /* If the srcBLen is not a multiple of 4,
 473      * the blockSize2 loop cannot be unrolled by 4 */
 474     blkCnt = blockSize2;
 475
 476     while(blkCnt > 0u)
 477     {
 478       /* Accumulator is made zero for every iteration */
 479       sum = 0.0f;
 480
 481       /* srcBLen number of MACS should be performed */
 482       k = srcBLen;
 483
 484       while(k > 0u)
 485       {
 486         /* Perform the multiply-accumulate */
 487         sum += *px++ * *py--;
 488
 489         /* Decrement the loop counter */
 490         k--;
 491       }
 492
 493       /* Store the result in the accumulator in the destination buffer. */
 494       *pOut++ = sum;
 495
 496       /* Update the inputA and inputB pointers for next MAC calculation */
 497       px = pIn1 + count;
 498       py = pSrc2;
 499
 500       /* Increment the MAC count */
 501       count++;
 502
 503       /* Decrement the loop counter */
 504       blkCnt--;
 505     }
 506   }
 507
 508
 509   /* --------------------------
 510    * Initializations of stage3
 511    * -------------------------*/
 512
 513   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 514    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 515    * ....
 516    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 517    * sum +=  x[srcALen-1] * y[srcBLen-1]
 518    */
 519
 520   /* In this stage the MAC operations are decreased by 1 for every iteration.
 521      The blockSize3 variable holds the number of MAC operations performed */
 522
 523   /* Working pointer of inputA */
 524   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 525   px = pSrc1;
 526
 527   /* Working pointer of inputB */
 528   pSrc2 = pIn2 + (srcBLen - 1u);
 529   py = pSrc2;
 530
 531   /* -------------------
 532    * Stage3 process
 533    * ------------------*/
 534
 535   while(blockSize3 > 0u)
 536   {
 537     /* Accumulator is made zero for every iteration */
 538     sum = 0.0f;
 539
 540     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 541     k = blockSize3 >> 2u;
 542
 543     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 544      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 545     while(k > 0u)
 546     {
 547       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
 548       sum += *px++ * *py--;
 549
 550       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
 551       sum += *px++ * *py--;
 552
 553       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
 554       sum += *px++ * *py--;
 555
 556       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
 557       sum += *px++ * *py--;
 558
 559       /* Decrement the loop counter */
 560       k--;
 561     }
 562
 563     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
 564      ** No loop unrolling is used. */
 565     k = blockSize3 % 0x4u;
 566
 567     while(k > 0u)
 568     {
 569       /* Perform the multiply-accumulates */
 570       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
 571       sum += *px++ * *py--;
 572
 573       /* Decrement the loop counter */
 574       k--;
 575     }
 576
 577     /* Store the result in the accumulator in the destination buffer. */
 578     *pOut++ = sum;
 579
 580     /* Update the inputA and inputB pointers for next MAC calculation */
 581     px = ++pSrc1;
 582     py = pSrc2;
 583
 584     /* Decrement the loop counter */
 585     blockSize3--;
 586   }
 587
 588 #else
 589
 590   /* Run the below code for Cortex-M0 */
 591
 592   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
 593   float32_t *pIn2 = pSrcB;                       /* inputB pointer */
 594   float32_t sum;                                 /* Accumulator */
 595   uint32_t i, j;                                 /* loop counters */
 596
 597   /* Loop to calculate convolution for output length number of times */
 598   for (i = 0u; i < ((srcALen + srcBLen) - 1u); i++)
 599   {
 600     /* Initialize sum with zero to carry out MAC operations */
 601     sum = 0.0f;
 602
 603     /* Loop to perform MAC operations according to convolution equation */
 604     for (j = 0u; j <= i; j++)
 605     {
 606       /* Check the array limitations */
 607       if((((i - j) < srcBLen) && (j < srcALen)))
 608       {
 609         /* z[i] += x[i-j] * y[j] */
 610         sum += pIn1[j] * pIn2[i - j];
 611       }
 612     }
 613     /* Store the output in the destination buffer */
 614     pDst[i] = sum;
 615   }
 616
 617 #endif /*   #ifndef ARM_MATH_CM0        */
 618
 619 }
 620
 621 /**
 622  * @} end of Conv group
 623  */