git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_partial_q31.c
   9 *
  10 * Description:  Partial convolution of Q31 sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated
  28 *
  29 * Version 0.0.7  2010/06/10
  30 *    Misra-C changes done
  31 *
  32 * -------------------------------------------------------------------- */
  33
  34 #include "arm_math.h"
  35
  36 /**
  37  * @ingroup groupFilters
  38  */
  39
  40 /**
  41  * @addtogroup PartialConv
  42  * @{
  43  */
  44
  45 /**
  46  * @brief Partial convolution of Q31 sequences.
  47  * @param[in]       *pSrcA points to the first input sequence.
  48  * @param[in]       srcALen length of the first input sequence.
  49  * @param[in]       *pSrcB points to the second input sequence.
  50  * @param[in]       srcBLen length of the second input sequence.
  51  * @param[out]      *pDst points to the location where the output result is written.
  52  * @param[in]       firstIndex is the first output sample to start with.
  53  * @param[in]       numPoints is the number of output points to be computed.
  54  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
  55  *
  56  * See <code>arm_conv_partial_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
  57  */
  58
  59 arm_status arm_conv_partial_q31(
  60   q31_t * pSrcA,
  61   uint32_t srcALen,
  62   q31_t * pSrcB,
  63   uint32_t srcBLen,
  64   q31_t * pDst,
  65   uint32_t firstIndex,
  66   uint32_t numPoints)
  67 {
  68
  69
  70 #ifndef ARM_MATH_CM0
  71
  72   /* Run the below code for Cortex-M4 and Cortex-M3 */
  73
  74   q31_t *pIn1;                                   /* inputA pointer               */
  75   q31_t *pIn2;                                   /* inputB pointer               */
  76   q31_t *pOut = pDst;                            /* output pointer               */
  77   q31_t *px;                                     /* Intermediate inputA pointer  */
  78   q31_t *py;                                     /* Intermediate inputB pointer  */
  79   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
  80   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
  81   q31_t x0, x1, x2, x3, c0;
  82   uint32_t j, k, count, check, blkCnt;
  83   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter                 */
  84   arm_status status;                             /* status of Partial convolution */
  85
  86
  87   /* Check for range of output samples to be calculated */
  88   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
  89   {
  90     /* Set status as ARM_MATH_ARGUMENT_ERROR */
  91     status = ARM_MATH_ARGUMENT_ERROR;
  92   }
  93   else
  94   {
  95
  96     /* The algorithm implementation is based on the lengths of the inputs. */
  97     /* srcB is always made to slide across srcA. */
  98     /* So srcBLen is always considered as shorter or equal to srcALen */
  99     if(srcALen >= srcBLen)
 100     {
 101       /* Initialization of inputA pointer */
 102       pIn1 = pSrcA;
 103
 104       /* Initialization of inputB pointer */
 105       pIn2 = pSrcB;
 106     }
 107     else
 108     {
 109       /* Initialization of inputA pointer */
 110       pIn1 = pSrcB;
 111
 112       /* Initialization of inputB pointer */
 113       pIn2 = pSrcA;
 114
 115       /* srcBLen is always considered as shorter or equal to srcALen */
 116       j = srcBLen;
 117       srcBLen = srcALen;
 118       srcALen = j;
 119     }
 120
 121     /* Conditions to check which loopCounter holds
 122      * the first and last indices of the output samples to be calculated. */
 123     check = firstIndex + numPoints;
 124     blockSize3 = ((int32_t) check - (int32_t) srcALen);
 125     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
 126     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
 127     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
 128                                      (int32_t) numPoints) : 0;
 129     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
 130                                     (int32_t) firstIndex);
 131     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 132
 133     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 134     /* The function is internally
 135      * divided into three stages according to the number of multiplications that has to be
 136      * taken place between inputA samples and inputB samples. In the first stage of the
 137      * algorithm, the multiplications increase by one for every iteration.
 138      * In the second stage of the algorithm, srcBLen number of multiplications are done.
 139      * In the third stage of the algorithm, the multiplications decrease by one
 140      * for every iteration. */
 141
 142     /* Set the output pointer to point to the firstIndex
 143      * of the output sample to be calculated. */
 144     pOut = pDst + firstIndex;
 145
 146     /* --------------------------
 147      * Initializations of stage1
 148      * -------------------------*/
 149
 150     /* sum = x[0] * y[0]
 151      * sum = x[0] * y[1] + x[1] * y[0]
 152      * ....
 153      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 154      */
 155
 156     /* In this stage the MAC operations are increased by 1 for every iteration.
 157        The count variable holds the number of MAC operations performed.
 158        Since the partial convolution starts from firstIndex
 159        Number of Macs to be performed is firstIndex + 1 */
 160     count = 1u + firstIndex;
 161
 162     /* Working pointer of inputA */
 163     px = pIn1;
 164
 165     /* Working pointer of inputB */
 166     pSrc2 = pIn2 + firstIndex;
 167     py = pSrc2;
 168
 169     /* ------------------------
 170      * Stage1 process
 171      * ----------------------*/
 172
 173     /* The first loop starts here */
 174     while(blockSize1 > 0)
 175     {
 176       /* Accumulator is made zero for every iteration */
 177       sum = 0;
 178
 179       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 180       k = count >> 2u;
 181
 182       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 183        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 184       while(k > 0u)
 185       {
 186         /* x[0] * y[srcBLen - 1] */
 187         sum += (q63_t) * px++ * (*py--);
 188         /* x[1] * y[srcBLen - 2] */
 189         sum += (q63_t) * px++ * (*py--);
 190         /* x[2] * y[srcBLen - 3] */
 191         sum += (q63_t) * px++ * (*py--);
 192         /* x[3] * y[srcBLen - 4] */
 193         sum += (q63_t) * px++ * (*py--);
 194
 195         /* Decrement the loop counter */
 196         k--;
 197       }
 198
 199       /* If the count is not a multiple of 4, compute any remaining MACs here.
 200        ** No loop unrolling is used. */
 201       k = count % 0x4u;
 202
 203       while(k > 0u)
 204       {
 205         /* Perform the multiply-accumulate */
 206         sum += (q63_t) * px++ * (*py--);
 207
 208         /* Decrement the loop counter */
 209         k--;
 210       }
 211
 212       /* Store the result in the accumulator in the destination buffer. */
 213       *pOut++ = (q31_t) (sum >> 31);
 214
 215       /* Update the inputA and inputB pointers for next MAC calculation */
 216       py = ++pSrc2;
 217       px = pIn1;
 218
 219       /* Increment the MAC count */
 220       count++;
 221
 222       /* Decrement the loop counter */
 223       blockSize1--;
 224     }
 225
 226     /* --------------------------
 227      * Initializations of stage2
 228      * ------------------------*/
 229
 230     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 231      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 232      * ....
 233      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 234      */
 235
 236     /* Working pointer of inputA */
 237     px = pIn1;
 238
 239     /* Working pointer of inputB */
 240     pSrc2 = pIn2 + (srcBLen - 1u);
 241     py = pSrc2;
 242
 243     /* count is index by which the pointer pIn1 to be incremented */
 244     count = 1u;
 245
 246     /* -------------------
 247      * Stage2 process
 248      * ------------------*/
 249
 250     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 251      * So, to loop unroll over blockSize2,
 252      * srcBLen should be greater than or equal to 4 */
 253     if(srcBLen >= 4u)
 254     {
 255       /* Loop unroll over blockSize2 */
 256       blkCnt = ((uint32_t) blockSize2 >> 2u);
 257
 258       while(blkCnt > 0u)
 259       {
 260         /* Set all accumulators to zero */
 261         acc0 = 0;
 262         acc1 = 0;
 263         acc2 = 0;
 264         acc3 = 0;
 265
 266         /* read x[0], x[1], x[2] samples */
 267         x0 = *(px++);
 268         x1 = *(px++);
 269         x2 = *(px++);
 270
 271         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 272         k = srcBLen >> 2u;
 273
 274         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 275          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 276         do
 277         {
 278           /* Read y[srcBLen - 1] sample */
 279           c0 = *(py--);
 280
 281           /* Read x[3] sample */
 282           x3 = *(px++);
 283
 284           /* Perform the multiply-accumulates */
 285           /* acc0 +=  x[0] * y[srcBLen - 1] */
 286           acc0 += (q63_t) x0 *c0;
 287           /* acc1 +=  x[1] * y[srcBLen - 1] */
 288           acc1 += (q63_t) x1 *c0;
 289           /* acc2 +=  x[2] * y[srcBLen - 1] */
 290           acc2 += (q63_t) x2 *c0;
 291           /* acc3 +=  x[3] * y[srcBLen - 1] */
 292           acc3 += (q63_t) x3 *c0;
 293
 294           /* Read y[srcBLen - 2] sample */
 295           c0 = *(py--);
 296
 297           /* Read x[4] sample */
 298           x0 = *(px++);
 299
 300           /* Perform the multiply-accumulate */
 301           /* acc0 +=  x[1] * y[srcBLen - 2] */
 302           acc0 += (q63_t) x1 *c0;
 303           /* acc1 +=  x[2] * y[srcBLen - 2] */
 304           acc1 += (q63_t) x2 *c0;
 305           /* acc2 +=  x[3] * y[srcBLen - 2] */
 306           acc2 += (q63_t) x3 *c0;
 307           /* acc3 +=  x[4] * y[srcBLen - 2] */
 308           acc3 += (q63_t) x0 *c0;
 309
 310           /* Read y[srcBLen - 3] sample */
 311           c0 = *(py--);
 312
 313           /* Read x[5] sample */
 314           x1 = *(px++);
 315
 316           /* Perform the multiply-accumulates */
 317           /* acc0 +=  x[2] * y[srcBLen - 3] */
 318           acc0 += (q63_t) x2 *c0;
 319           /* acc1 +=  x[3] * y[srcBLen - 2] */
 320           acc1 += (q63_t) x3 *c0;
 321           /* acc2 +=  x[4] * y[srcBLen - 2] */
 322           acc2 += (q63_t) x0 *c0;
 323           /* acc3 +=  x[5] * y[srcBLen - 2] */
 324           acc3 += (q63_t) x1 *c0;
 325
 326           /* Read y[srcBLen - 4] sample */
 327           c0 = *(py--);
 328
 329           /* Read x[6] sample */
 330           x2 = *(px++);
 331
 332           /* Perform the multiply-accumulates */
 333           /* acc0 +=  x[3] * y[srcBLen - 4] */
 334           acc0 += (q63_t) x3 *c0;
 335           /* acc1 +=  x[4] * y[srcBLen - 4] */
 336           acc1 += (q63_t) x0 *c0;
 337           /* acc2 +=  x[5] * y[srcBLen - 4] */
 338           acc2 += (q63_t) x1 *c0;
 339           /* acc3 +=  x[6] * y[srcBLen - 4] */
 340           acc3 += (q63_t) x2 *c0;
 341
 342         } while(--k);
 343
 344         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 345          ** No loop unrolling is used. */
 346         k = srcBLen % 0x4u;
 347
 348         while(k > 0u)
 349         {
 350           /* Read y[srcBLen - 5] sample */
 351           c0 = *(py--);
 352
 353           /* Read x[7] sample */
 354           x3 = *(px++);
 355
 356           /* Perform the multiply-accumulates */
 357           /* acc0 +=  x[4] * y[srcBLen - 5] */
 358           acc0 += (q63_t) x0 *c0;
 359           /* acc1 +=  x[5] * y[srcBLen - 5] */
 360           acc1 += (q63_t) x1 *c0;
 361           /* acc2 +=  x[6] * y[srcBLen - 5] */
 362           acc2 += (q63_t) x2 *c0;
 363           /* acc3 +=  x[7] * y[srcBLen - 5] */
 364           acc3 += (q63_t) x3 *c0;
 365
 366           /* Reuse the present samples for the next MAC */
 367           x0 = x1;
 368           x1 = x2;
 369           x2 = x3;
 370
 371           /* Decrement the loop counter */
 372           k--;
 373         }
 374
 375         /* Store the result in the accumulator in the destination buffer. */
 376         *pOut++ = (q31_t) (acc0 >> 31);
 377         *pOut++ = (q31_t) (acc1 >> 31);
 378         *pOut++ = (q31_t) (acc2 >> 31);
 379         *pOut++ = (q31_t) (acc3 >> 31);
 380
 381         /* Update the inputA and inputB pointers for next MAC calculation */
 382         px = pIn1 + (count * 4u);
 383         py = pSrc2;
 384
 385         /* Increment the pointer pIn1 index, count by 1 */
 386         count++;
 387
 388         /* Decrement the loop counter */
 389         blkCnt--;
 390       }
 391
 392       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 393        ** No loop unrolling is used. */
 394       blkCnt = (uint32_t) blockSize2 % 0x4u;
 395
 396       while(blkCnt > 0u)
 397       {
 398         /* Accumulator is made zero for every iteration */
 399         sum = 0;
 400
 401         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 402         k = srcBLen >> 2u;
 403
 404         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 405          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 406         while(k > 0u)
 407         {
 408           /* Perform the multiply-accumulates */
 409           sum += (q63_t) * px++ * (*py--);
 410           sum += (q63_t) * px++ * (*py--);
 411           sum += (q63_t) * px++ * (*py--);
 412           sum += (q63_t) * px++ * (*py--);
 413
 414           /* Decrement the loop counter */
 415           k--;
 416         }
 417
 418         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 419          ** No loop unrolling is used. */
 420         k = srcBLen % 0x4u;
 421
 422         while(k > 0u)
 423         {
 424           /* Perform the multiply-accumulate */
 425           sum += (q63_t) * px++ * (*py--);
 426
 427           /* Decrement the loop counter */
 428           k--;
 429         }
 430
 431         /* Store the result in the accumulator in the destination buffer. */
 432         *pOut++ = (q31_t) (sum >> 31);
 433
 434         /* Update the inputA and inputB pointers for next MAC calculation */
 435         px = pIn1 + count;
 436         py = pSrc2;
 437
 438         /* Increment the MAC count */
 439         count++;
 440
 441         /* Decrement the loop counter */
 442         blkCnt--;
 443       }
 444     }
 445     else
 446     {
 447       /* If the srcBLen is not a multiple of 4,
 448        * the blockSize2 loop cannot be unrolled by 4 */
 449       blkCnt = (uint32_t) blockSize2;
 450
 451       while(blkCnt > 0u)
 452       {
 453         /* Accumulator is made zero for every iteration */
 454         sum = 0;
 455
 456         /* srcBLen number of MACS should be performed */
 457         k = srcBLen;
 458
 459         while(k > 0u)
 460         {
 461           /* Perform the multiply-accumulate */
 462           sum += (q63_t) * px++ * (*py--);
 463
 464           /* Decrement the loop counter */
 465           k--;
 466         }
 467
 468         /* Store the result in the accumulator in the destination buffer. */
 469         *pOut++ = (q31_t) (sum >> 31);
 470
 471         /* Update the inputA and inputB pointers for next MAC calculation */
 472         px = pIn1 + count;
 473         py = pSrc2;
 474
 475         /* Increment the MAC count */
 476         count++;
 477
 478         /* Decrement the loop counter */
 479         blkCnt--;
 480       }
 481     }
 482
 483
 484     /* --------------------------
 485      * Initializations of stage3
 486      * -------------------------*/
 487
 488     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 489      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 490      * ....
 491      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 492      * sum +=  x[srcALen-1] * y[srcBLen-1]
 493      */
 494
 495     /* In this stage the MAC operations are decreased by 1 for every iteration.
 496        The blockSize3 variable holds the number of MAC operations performed */
 497     count = srcBLen - 1u;
 498
 499     /* Working pointer of inputA */
 500     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 501     px = pSrc1;
 502
 503     /* Working pointer of inputB */
 504     pSrc2 = pIn2 + (srcBLen - 1u);
 505     py = pSrc2;
 506
 507     /* -------------------
 508      * Stage3 process
 509      * ------------------*/
 510
 511     while(blockSize3 > 0)
 512     {
 513       /* Accumulator is made zero for every iteration */
 514       sum = 0;
 515
 516       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 517       k = count >> 2u;
 518
 519       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 520        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 521       while(k > 0u)
 522       {
 523         sum += (q63_t) * px++ * (*py--);
 524         sum += (q63_t) * px++ * (*py--);
 525         sum += (q63_t) * px++ * (*py--);
 526         sum += (q63_t) * px++ * (*py--);
 527
 528         /* Decrement the loop counter */
 529         k--;
 530       }
 531
 532       /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
 533        ** No loop unrolling is used. */
 534       k = count % 0x4u;
 535
 536       while(k > 0u)
 537       {
 538         /* Perform the multiply-accumulate */
 539         sum += (q63_t) * px++ * (*py--);
 540
 541         /* Decrement the loop counter */
 542         k--;
 543       }
 544
 545       /* Store the result in the accumulator in the destination buffer. */
 546       *pOut++ = (q31_t) (sum >> 31);
 547
 548       /* Update the inputA and inputB pointers for next MAC calculation */
 549       px = ++pSrc1;
 550       py = pSrc2;
 551
 552       /* Decrement the MAC count */
 553       count--;
 554
 555       /* Decrement the loop counter */
 556       blockSize3--;
 557
 558     }
 559
 560     /* set status as ARM_MATH_SUCCESS */
 561     status = ARM_MATH_SUCCESS;
 562   }
 563
 564   /* Return to application */
 565   return (status);
 566
 567 #else
 568
 569   /* Run the below code for Cortex-M0 */
 570
 571   q31_t *pIn1 = pSrcA;                           /* inputA pointer */
 572   q31_t *pIn2 = pSrcB;                           /* inputB pointer */
 573   q63_t sum;                                     /* Accumulator */
 574   uint32_t i, j;                                 /* loop counters */
 575   arm_status status;                             /* status of Partial convolution */
 576
 577   /* Check for range of output samples to be calculated */
 578   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 579   {
 580     /* Set status as ARM_ARGUMENT_ERROR */
 581     status = ARM_MATH_ARGUMENT_ERROR;
 582   }
 583   else
 584   {
 585     /* Loop to calculate convolution for output length number of values */
 586     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
 587     {
 588       /* Initialize sum with zero to carry on MAC operations */
 589       sum = 0;
 590
 591       /* Loop to perform MAC operations according to convolution equation */
 592       for (j = 0; j <= i; j++)
 593       {
 594         /* Check the array limitations */
 595         if(((i - j) < srcBLen) && (j < srcALen))
 596         {
 597           /* z[i] += x[i-j] * y[j] */
 598           sum += ((q63_t) pIn1[j] * (pIn2[i - j]));
 599         }
 600       }
 601
 602       /* Store the output in the destination buffer */
 603       pDst[i] = (q31_t) (sum >> 31u);
 604     }
 605     /* set status as ARM_SUCCESS as there are no argument errors */
 606     status = ARM_MATH_SUCCESS;
 607   }
 608   return (status);
 609
 610 #endif /*    #ifndef ARM_MATH_CM0      */
 611
 612 }
 613
 614 /**
 615  * @} end of PartialConv group
 616  */