CMSIS DSP Software Library: arm_conv_partial_fast_q15.c

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        15. July 2011  
00005 * $Revision:    V1.0.10  
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_conv_partial_fast_q15.c   
00009 *   
00010 * Description:  Fast Q15 Partial convolution.   
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.10 2011/7/15 
00015 *    Big Endian support added and Merged M0 and M3/M4 Source code.  
00016 *   
00017 * Version 1.0.3 2010/11/29  
00018 *    Re-organized the CMSIS folders and updated documentation.   
00019 *    
00020 * Version 1.0.2 2010/11/11   
00021 *    Documentation updated.    
00022 *   
00023 * Version 1.0.1 2010/10/05    
00024 *    Production release and review comments incorporated.   
00025 *   
00026 * Version 1.0.0 2010/09/20    
00027 *    Production release and review comments incorporated.   
00028 * -------------------------------------------------------------------- */
00029 
00030 #include "arm_math.h"
00031 
00056 arm_status arm_conv_partial_fast_q15(
00057   q15_t * pSrcA,
00058   uint32_t srcALen,
00059   q15_t * pSrcB,
00060   uint32_t srcBLen,
00061   q15_t * pDst,
00062   uint32_t firstIndex,
00063   uint32_t numPoints)
00064 {
00065   q15_t *pIn1;                                   /* inputA pointer               */
00066   q15_t *pIn2;                                   /* inputB pointer               */
00067   q15_t *pOut = pDst;                            /* output pointer               */
00068   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
00069   q15_t *px;                                     /* Intermediate inputA pointer  */
00070   q15_t *py;                                     /* Intermediate inputB pointer  */
00071   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00072   q31_t x0, x1, x2, x3, c0;
00073   uint32_t j, k, count, check, blkCnt;
00074   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */
00075   arm_status status;                             /* status of Partial convolution */
00076   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */
00077 
00078   /* Check for range of output samples to be calculated */
00079   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00080   {
00081     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00082     status = ARM_MATH_ARGUMENT_ERROR;
00083   }
00084   else
00085   {
00086 
00087     /* The algorithm implementation is based on the lengths of the inputs. */
00088     /* srcB is always made to slide across srcA. */
00089     /* So srcBLen is always considered as shorter or equal to srcALen */
00090     if(srcALen >= srcBLen)
00091     {
00092       /* Initialization of inputA pointer */
00093       pIn1 = pSrcA;
00094 
00095       /* Initialization of inputB pointer */
00096       pIn2 = pSrcB;
00097     }
00098     else
00099     {
00100       /* Initialization of inputA pointer */
00101       pIn1 = pSrcB;
00102 
00103       /* Initialization of inputB pointer */
00104       pIn2 = pSrcA;
00105 
00106       /* srcBLen is always considered as shorter or equal to srcALen */
00107       j = srcBLen;
00108       srcBLen = srcALen;
00109       srcALen = j;
00110     }
00111 
00112     /* Conditions to check which loopCounter holds   
00113      * the first and last indices of the output samples to be calculated. */
00114     check = firstIndex + numPoints;
00115     blockSize3 = ((int32_t) check - (int32_t) srcALen);
00116     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
00117     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00118     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00119                                      (int32_t) numPoints) : 0;
00120     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00121                                     (int32_t) firstIndex);
00122     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00123 
00124     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00125     /* The function is internally   
00126      * divided into three stages according to the number of multiplications that has to be   
00127      * taken place between inputA samples and inputB samples. In the first stage of the   
00128      * algorithm, the multiplications increase by one for every iteration.   
00129      * In the second stage of the algorithm, srcBLen number of multiplications are done.   
00130      * In the third stage of the algorithm, the multiplications decrease by one   
00131      * for every iteration. */
00132 
00133     /* Set the output pointer to point to the firstIndex   
00134      * of the output sample to be calculated. */
00135     pOut = pDst + firstIndex;
00136 
00137     /* --------------------------   
00138      * Initializations of stage1   
00139      * -------------------------*/
00140 
00141     /* sum = x[0] * y[0]   
00142      * sum = x[0] * y[1] + x[1] * y[0]   
00143      * ....   
00144      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]   
00145      */
00146 
00147     /* In this stage the MAC operations are increased by 1 for every iteration.   
00148        The count variable holds the number of MAC operations performed.   
00149        Since the partial convolution starts from firstIndex   
00150        Number of Macs to be performed is firstIndex + 1 */
00151     count = 1u + firstIndex;
00152 
00153     /* Working pointer of inputA */
00154     px = pIn1;
00155 
00156     /* Working pointer of inputB */
00157     pSrc2 = pIn2 + firstIndex;
00158     py = pSrc2;
00159 
00160     /* ------------------------   
00161      * Stage1 process   
00162      * ----------------------*/
00163 
00164     /* For loop unrolling by 4, this stage is divided into two. */
00165     /* First part of this stage computes the MAC operations less than 4 */
00166     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00167 
00168     /* The first part of the stage starts here */
00169     while((count < 4u) && (blockSize1 > 0))
00170     {
00171       /* Accumulator is made zero for every iteration */
00172       sum = 0;
00173 
00174       /* Loop over number of MAC operations between   
00175        * inputA samples and inputB samples */
00176       k = count;
00177 
00178       while(k > 0u)
00179       {
00180         /* Perform the multiply-accumulates */
00181         sum = __SMLAD(*px++, *py--, sum);
00182 
00183         /* Decrement the loop counter */
00184         k--;
00185       }
00186 
00187       /* Store the result in the accumulator in the destination buffer. */
00188       *pOut++ = (q15_t) (sum >> 15);
00189 
00190       /* Update the inputA and inputB pointers for next MAC calculation */
00191       py = ++pSrc2;
00192       px = pIn1;
00193 
00194       /* Increment the MAC count */
00195       count++;
00196 
00197       /* Decrement the loop counter */
00198       blockSize1--;
00199     }
00200 
00201     /* The second part of the stage starts here */
00202     /* The internal loop, over count, is unrolled by 4 */
00203     /* To, read the last two inputB samples using SIMD:   
00204      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
00205     py = py - 1;
00206 
00207     while(blockSize1 > 0)
00208     {
00209       /* Accumulator is made zero for every iteration */
00210       sum = 0;
00211 
00212       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00213       k = count >> 2u;
00214 
00215       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00216        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00217       while(k > 0u)
00218       {
00219         /* Perform the multiply-accumulates */
00220         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
00221         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00222         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
00223         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00224 
00225         /* Decrement the loop counter */
00226         k--;
00227       }
00228 
00229       /* For the next MAC operations, the pointer py is used without SIMD   
00230        * So, py is incremented by 1 */
00231       py = py + 1u;
00232 
00233       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00234        ** No loop unrolling is used. */
00235       k = count % 0x4u;
00236 
00237       while(k > 0u)
00238       {
00239         /* Perform the multiply-accumulates */
00240         sum = __SMLAD(*px++, *py--, sum);
00241 
00242         /* Decrement the loop counter */
00243         k--;
00244       }
00245 
00246       /* Store the result in the accumulator in the destination buffer. */
00247       *pOut++ = (q15_t) (sum >> 15);
00248 
00249       /* Update the inputA and inputB pointers for next MAC calculation */
00250       py = ++pSrc2 - 1u;
00251       px = pIn1;
00252 
00253       /* Increment the MAC count */
00254       count++;
00255 
00256       /* Decrement the loop counter */
00257       blockSize1--;
00258     }
00259 
00260     /* --------------------------   
00261      * Initializations of stage2   
00262      * ------------------------*/
00263 
00264     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]   
00265      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]   
00266      * ....   
00267      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]   
00268      */
00269 
00270     /* Working pointer of inputA */
00271     px = pIn1;
00272 
00273     /* Working pointer of inputB */
00274     pSrc2 = pIn2 + (srcBLen - 1u);
00275     py = pSrc2;
00276 
00277     /* Initialize inputB pointer of type q31 */
00278     pb = (q31_t *) (py - 1u);
00279 
00280     /* count is the index by which the pointer pIn1 to be incremented */
00281     count = 1u;
00282 
00283 
00284     /* --------------------   
00285      * Stage2 process   
00286      * -------------------*/
00287 
00288     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00289      * So, to loop unroll over blockSize2,   
00290      * srcBLen should be greater than or equal to 4 */
00291     if(srcBLen >= 4u)
00292     {
00293       /* Loop unroll over blockSize2, by 4 */
00294       blkCnt = ((uint32_t) blockSize2 >> 2u);
00295 
00296       while(blkCnt > 0u)
00297       {
00298         /* Set all accumulators to zero */
00299         acc0 = 0;
00300         acc1 = 0;
00301         acc2 = 0;
00302         acc3 = 0;
00303 
00304 
00305         /* read x[0], x[1] samples */
00306         x0 = *(q31_t *) (px++);
00307         /* read x[1], x[2] samples */
00308         x1 = *(q31_t *) (px++);
00309 
00310 
00311         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00312         k = srcBLen >> 2u;
00313 
00314         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00315          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00316         do
00317         {
00318           /* Read the last two inputB samples using SIMD:   
00319            * y[srcBLen - 1] and y[srcBLen - 2] */
00320           c0 = *(pb--);
00321 
00322           /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00323           acc0 = __SMLADX(x0, c0, acc0);
00324 
00325           /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00326           acc1 = __SMLADX(x1, c0, acc1);
00327 
00328           /* Read x[2], x[3] */
00329           x2 = *(q31_t *) (px++);
00330 
00331           /* Read x[3], x[4] */
00332           x3 = *(q31_t *) (px++);
00333 
00334           /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00335           acc2 = __SMLADX(x2, c0, acc2);
00336 
00337           /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00338           acc3 = __SMLADX(x3, c0, acc3);
00339 
00340           /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00341           c0 = *(pb--);
00342 
00343           /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00344           acc0 = __SMLADX(x2, c0, acc0);
00345 
00346           /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00347           acc1 = __SMLADX(x3, c0, acc1);
00348 
00349           /* Read x[4], x[5] */
00350           x0 = *(q31_t *) (px++);
00351 
00352           /* Read x[5], x[6] */
00353           x1 = *(q31_t *) (px++);
00354 
00355           /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
00356           acc2 = __SMLADX(x0, c0, acc2);
00357 
00358           /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
00359           acc3 = __SMLADX(x1, c0, acc3);
00360 
00361         } while(--k);
00362 
00363         /* For the next MAC operations, SIMD is not used   
00364          * So, the 16 bit pointer if inputB, py is updated */
00365         py = (q15_t *) pb;
00366         py = py + 1;
00367 
00368         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00369          ** No loop unrolling is used. */
00370         k = srcBLen % 0x4u;
00371 
00372         if(k == 1u)
00373         {
00374           /* Read y[srcBLen - 5] */
00375           c0 = *(py);
00376 #ifdef  ARM_MATH_BIG_ENDIAN
00377 
00378           c0 = c0 << 16;
00379 
00380 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00381 
00382           /* Read x[7] */
00383           x3 = *(q31_t *) px++;
00384 
00385           /* Perform the multiply-accumulates */
00386           acc0 = __SMLAD(x0, c0, acc0);
00387           acc1 = __SMLAD(x1, c0, acc1);
00388           acc2 = __SMLADX(x1, c0, acc2);
00389           acc3 = __SMLADX(x3, c0, acc3);
00390         }
00391 
00392         if(k == 2u)
00393         {
00394           /* Read y[srcBLen - 5], y[srcBLen - 6] */
00395           c0 = *(pb);
00396 
00397           /* Read x[7], x[8] */
00398           x3 = *(q31_t *) px++;
00399 
00400           /* Read x[9] */
00401           x2 = *(q31_t *) px++;
00402 
00403           /* Perform the multiply-accumulates */
00404           acc0 = __SMLADX(x0, c0, acc0);
00405           acc1 = __SMLADX(x1, c0, acc1);
00406           acc2 = __SMLADX(x3, c0, acc2);
00407           acc3 = __SMLADX(x2, c0, acc3);
00408         }
00409 
00410         if(k == 3u)
00411         {
00412           /* Read y[srcBLen - 5], y[srcBLen - 6] */
00413           c0 = *pb--;
00414 
00415           /* Read x[7], x[8] */
00416           x3 = *(q31_t *) px++;
00417 
00418           /* Read x[9] */
00419           x2 = *(q31_t *) px++;
00420 
00421           /* Perform the multiply-accumulates */
00422           acc0 = __SMLADX(x0, c0, acc0);
00423           acc1 = __SMLADX(x1, c0, acc1);
00424           acc2 = __SMLADX(x3, c0, acc2);
00425           acc3 = __SMLADX(x2, c0, acc3);
00426 
00427           /* Read y[srcBLen - 7] */
00428 #ifdef  ARM_MATH_BIG_ENDIAN
00429 
00430           c0 = (*pb);
00431           c0 = (c0) << 16;
00432 
00433 #else
00434 
00435           c0 = (q15_t) (*pb >> 16);
00436 
00437 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00438 
00439           /* Read x[10] */
00440           x3 = *(q31_t *) px++;
00441 
00442           /* Perform the multiply-accumulates */
00443           acc0 = __SMLADX(x1, c0, acc0);
00444           acc1 = __SMLAD(x2, c0, acc1);
00445           acc2 = __SMLADX(x2, c0, acc2);
00446           acc3 = __SMLADX(x3, c0, acc3);
00447         }
00448 
00449         /* Store the results in the accumulators in the destination buffer. */
00450 #ifndef ARM_MATH_BIG_ENDIAN
00451 
00452         *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16);
00453         *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16);
00454 
00455 #else
00456 
00457         *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16);
00458         *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16);
00459 
00460 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00461 
00462         /* Update the inputA and inputB pointers for next MAC calculation */
00463         px = pIn1 + (count * 4u);
00464         py = pSrc2;
00465         pb = (q31_t *) (py - 1);
00466 
00467         /* Increment the pointer pIn1 index, count by 1 */
00468         count++;
00469 
00470         /* Decrement the loop counter */
00471         blkCnt--;
00472       }
00473 
00474       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
00475        ** No loop unrolling is used. */
00476       blkCnt = (uint32_t) blockSize2 % 0x4u;
00477 
00478       while(blkCnt > 0u)
00479       {
00480         /* Accumulator is made zero for every iteration */
00481         sum = 0;
00482 
00483         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00484         k = srcBLen >> 2u;
00485 
00486         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00487          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00488         while(k > 0u)
00489         {
00490           /* Perform the multiply-accumulates */
00491           sum += ((q31_t) * px++ * *py--);
00492           sum += ((q31_t) * px++ * *py--);
00493           sum += ((q31_t) * px++ * *py--);
00494           sum += ((q31_t) * px++ * *py--);
00495 
00496           /* Decrement the loop counter */
00497           k--;
00498         }
00499 
00500         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00501          ** No loop unrolling is used. */
00502         k = srcBLen % 0x4u;
00503 
00504         while(k > 0u)
00505         {
00506           /* Perform the multiply-accumulates */
00507           sum += ((q31_t) * px++ * *py--);
00508 
00509           /* Decrement the loop counter */
00510           k--;
00511         }
00512 
00513         /* Store the result in the accumulator in the destination buffer. */
00514         *pOut++ = (q15_t) (sum >> 15);
00515 
00516         /* Update the inputA and inputB pointers for next MAC calculation */
00517         px = pIn1 + count;
00518         py = pSrc2;
00519 
00520         /* Increment the pointer pIn1 index, count by 1 */
00521         count++;
00522 
00523         /* Decrement the loop counter */
00524         blkCnt--;
00525       }
00526     }
00527     else
00528     {
00529       /* If the srcBLen is not a multiple of 4,   
00530        * the blockSize2 loop cannot be unrolled by 4 */
00531       blkCnt = (uint32_t) blockSize2;
00532 
00533       while(blkCnt > 0u)
00534       {
00535         /* Accumulator is made zero for every iteration */
00536         sum = 0;
00537 
00538         /* srcBLen number of MACS should be performed */
00539         k = srcBLen;
00540 
00541         while(k > 0u)
00542         {
00543           /* Perform the multiply-accumulate */
00544           sum += ((q31_t) * px++ * *py--);
00545 
00546           /* Decrement the loop counter */
00547           k--;
00548         }
00549 
00550         /* Store the result in the accumulator in the destination buffer. */
00551         *pOut++ = (q15_t) (sum >> 15);
00552 
00553         /* Update the inputA and inputB pointers for next MAC calculation */
00554         px = pIn1 + count;
00555         py = pSrc2;
00556 
00557         /* Increment the MAC count */
00558         count++;
00559 
00560         /* Decrement the loop counter */
00561         blkCnt--;
00562       }
00563     }
00564 
00565 
00566     /* --------------------------   
00567      * Initializations of stage3   
00568      * -------------------------*/
00569 
00570     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]   
00571      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]   
00572      * ....   
00573      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]   
00574      * sum +=  x[srcALen-1] * y[srcBLen-1]   
00575      */
00576 
00577     /* In this stage the MAC operations are decreased by 1 for every iteration.   
00578        The count variable holds the number of MAC operations performed */
00579     count = srcBLen - 1u;
00580 
00581     /* Working pointer of inputA */
00582     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00583     px = pSrc1;
00584 
00585     /* Working pointer of inputB */
00586     pSrc2 = pIn2 + (srcBLen - 1u);
00587     pIn2 = pSrc2 - 1u;
00588     py = pIn2;
00589 
00590     /* -------------------   
00591      * Stage3 process   
00592      * ------------------*/
00593 
00594     /* For loop unrolling by 4, this stage is divided into two. */
00595     /* First part of this stage computes the MAC operations greater than 4 */
00596     /* Second part of this stage computes the MAC operations less than or equal to 4 */
00597 
00598     /* The first part of the stage starts here */
00599     j = count >> 2u;
00600 
00601     while((j > 0u) && (blockSize3 > 0))
00602     {
00603       /* Accumulator is made zero for every iteration */
00604       sum = 0;
00605 
00606       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00607       k = count >> 2u;
00608 
00609       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00610        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00611       while(k > 0u)
00612       {
00613         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied   
00614          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
00615         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00616         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied   
00617          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
00618         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00619 
00620         /* Decrement the loop counter */
00621         k--;
00622       }
00623 
00624       /* For the next MAC operations, the pointer py is used without SIMD   
00625        * So, py is incremented by 1 */
00626       py = py + 1u;
00627 
00628       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00629        ** No loop unrolling is used. */
00630       k = count % 0x4u;
00631 
00632       while(k > 0u)
00633       {
00634         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
00635         sum = __SMLAD(*px++, *py--, sum);
00636 
00637         /* Decrement the loop counter */
00638         k--;
00639       }
00640 
00641       /* Store the result in the accumulator in the destination buffer. */
00642       *pOut++ = (q15_t) (sum >> 15);
00643 
00644       /* Update the inputA and inputB pointers for next MAC calculation */
00645       px = ++pSrc1;
00646       py = pIn2;
00647 
00648       /* Decrement the MAC count */
00649       count--;
00650 
00651       /* Decrement the loop counter */
00652       blockSize3--;
00653 
00654       j--;
00655     }
00656 
00657     /* The second part of the stage starts here */
00658     /* SIMD is not used for the next MAC operations,   
00659      * so pointer py is updated to read only one sample at a time */
00660     py = py + 1u;
00661 
00662     while(blockSize3 > 0)
00663     {
00664       /* Accumulator is made zero for every iteration */
00665       sum = 0;
00666 
00667       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00668       k = count;
00669 
00670       while(k > 0u)
00671       {
00672         /* Perform the multiply-accumulates */
00673         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00674         sum = __SMLAD(*px++, *py--, sum);
00675 
00676         /* Decrement the loop counter */
00677         k--;
00678       }
00679 
00680       /* Store the result in the accumulator in the destination buffer. */
00681       *pOut++ = (q15_t) (sum >> 15);
00682 
00683       /* Update the inputA and inputB pointers for next MAC calculation */
00684       px = ++pSrc1;
00685       py = pSrc2;
00686 
00687       /* Decrement the MAC count */
00688       count--;
00689 
00690       /* Decrement the loop counter */
00691       blockSize3--;
00692     }
00693 
00694     /* set status as ARM_MATH_SUCCESS */
00695     status = ARM_MATH_SUCCESS;
00696   }
00697 
00698   /* Return to application */
00699   return (status);
00700 
00701 }
00702