CMSIS DSP Software Library: arm_conv_fast

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        15. July 2011  
00005 * $Revision:    V1.0.10  
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_conv_fast_q15.c   
00009 *   
00010 * Description:  Fast Q15 Convolution.   
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.10 2011/7/15 
00015 *    Big Endian support added and Merged M0 and M3/M4 Source code.  
00016 *   
00017 * Version 1.0.3 2010/11/29  
00018 *    Re-organized the CMSIS folders and updated documentation.   
00019 *    
00020 * Version 1.0.2 2010/11/11   
00021 *    Documentation updated.    
00022 *   
00023 * Version 1.0.1 2010/10/05    
00024 *    Production release and review comments incorporated.   
00025 *   
00026 * Version 1.0.0 2010/09/20    
00027 *    Production release and review comments incorporated.   
00028 * -------------------------------------------------------------------- */
00029 
00030 #include "arm_math.h"
00031 
00066 void arm_conv_fast_q15(
00067   q15_t * pSrcA,
00068   uint32_t srcALen,
00069   q15_t * pSrcB,
00070   uint32_t srcBLen,
00071   q15_t * pDst)
00072 {
00073   q15_t *pIn1;                                   /* inputA pointer */
00074   q15_t *pIn2;                                   /* inputB pointer */
00075   q15_t *pOut = pDst;                            /* output pointer */
00076   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
00077   q15_t *px;                                     /* Intermediate inputA pointer  */
00078   q15_t *py;                                     /* Intermediate inputB pointer  */
00079   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
00080   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
00081   uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* loop counter */
00082   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */
00083 
00084 
00085   /* The algorithm implementation is based on the lengths of the inputs. */
00086   /* srcB is always made to slide across srcA. */
00087   /* So srcBLen is always considered as shorter or equal to srcALen */
00088   if(srcALen >= srcBLen)
00089   {
00090     /* Initialization of inputA pointer */
00091     pIn1 = pSrcA;
00092 
00093     /* Initialization of inputB pointer */
00094     pIn2 = pSrcB;
00095   }
00096   else
00097   {
00098     /* Initialization of inputA pointer */
00099     pIn1 = pSrcB;
00100 
00101     /* Initialization of inputB pointer */
00102     pIn2 = pSrcA;
00103 
00104     /* srcBLen is always considered as shorter or equal to srcALen */
00105     j = srcBLen;
00106     srcBLen = srcALen;
00107     srcALen = j;
00108   }
00109 
00110   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00111   /* The function is internally   
00112    * divided into three stages according to the number of multiplications that has to be   
00113    * taken place between inputA samples and inputB samples. In the first stage of the   
00114    * algorithm, the multiplications increase by one for every iteration.   
00115    * In the second stage of the algorithm, srcBLen number of multiplications are done.   
00116    * In the third stage of the algorithm, the multiplications decrease by one   
00117    * for every iteration. */
00118 
00119   /* The algorithm is implemented in three stages.   
00120      The loop counters of each stage is initiated here. */
00121   blockSize1 = srcBLen - 1u;
00122   blockSize2 = srcALen - (srcBLen - 1u);
00123   blockSize3 = blockSize1;
00124 
00125   /* --------------------------   
00126    * Initializations of stage1   
00127    * -------------------------*/
00128 
00129   /* sum = x[0] * y[0]   
00130    * sum = x[0] * y[1] + x[1] * y[0]   
00131    * ....   
00132    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]   
00133    */
00134 
00135   /* In this stage the MAC operations are increased by 1 for every iteration.   
00136      The count variable holds the number of MAC operations performed */
00137   count = 1u;
00138 
00139   /* Working pointer of inputA */
00140   px = pIn1;
00141 
00142   /* Working pointer of inputB */
00143   py = pIn2;
00144 
00145 
00146   /* ------------------------   
00147    * Stage1 process   
00148    * ----------------------*/
00149 
00150   /* For loop unrolling by 4, this stage is divided into two. */
00151   /* First part of this stage computes the MAC operations less than 4 */
00152   /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00153 
00154   /* The first part of the stage starts here */
00155   while((count < 4u) && (blockSize1 > 0u))
00156   {
00157     /* Accumulator is made zero for every iteration */
00158     sum = 0;
00159 
00160     /* Loop over number of MAC operations between   
00161      * inputA samples and inputB samples */
00162     k = count;
00163 
00164     while(k > 0u)
00165     {
00166       /* Perform the multiply-accumulates */
00167       sum = __SMLAD(*px++, *py--, sum);
00168 
00169       /* Decrement the loop counter */
00170       k--;
00171     }
00172 
00173     /* Store the result in the accumulator in the destination buffer. */
00174     *pOut++ = (q15_t) (sum >> 15);
00175 
00176     /* Update the inputA and inputB pointers for next MAC calculation */
00177     py = pIn2 + count;
00178     px = pIn1;
00179 
00180     /* Increment the MAC count */
00181     count++;
00182 
00183     /* Decrement the loop counter */
00184     blockSize1--;
00185   }
00186 
00187   /* The second part of the stage starts here */
00188   /* The internal loop, over count, is unrolled by 4 */
00189   /* To, read the last two inputB samples using SIMD:   
00190    * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
00191   py = py - 1;
00192 
00193   while(blockSize1 > 0u)
00194   {
00195     /* Accumulator is made zero for every iteration */
00196     sum = 0;
00197 
00198     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00199     k = count >> 2u;
00200 
00201     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00202      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00203     while(k > 0u)
00204     {
00205       /* Perform the multiply-accumulates */
00206       /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
00207       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00208       /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
00209       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00210 
00211       /* Decrement the loop counter */
00212       k--;
00213     }
00214 
00215     /* For the next MAC operations, the pointer py is used without SIMD   
00216      * So, py is incremented by 1 */
00217     py = py + 1u;
00218 
00219     /* If the count is not a multiple of 4, compute any remaining MACs here.   
00220      ** No loop unrolling is used. */
00221     k = count % 0x4u;
00222 
00223     while(k > 0u)
00224     {
00225       /* Perform the multiply-accumulates */
00226       sum = __SMLAD(*px++, *py--, sum);
00227 
00228       /* Decrement the loop counter */
00229       k--;
00230     }
00231 
00232     /* Store the result in the accumulator in the destination buffer. */
00233     *pOut++ = (q15_t) (sum >> 15);
00234 
00235     /* Update the inputA and inputB pointers for next MAC calculation */
00236     py = pIn2 + (count - 1u);
00237     px = pIn1;
00238 
00239     /* Increment the MAC count */
00240     count++;
00241 
00242     /* Decrement the loop counter */
00243     blockSize1--;
00244   }
00245 
00246   /* --------------------------   
00247    * Initializations of stage2   
00248    * ------------------------*/
00249 
00250   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]   
00251    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]   
00252    * ....   
00253    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]   
00254    */
00255 
00256   /* Working pointer of inputA */
00257   px = pIn1;
00258 
00259   /* Working pointer of inputB */
00260   pSrc2 = pIn2 + (srcBLen - 1u);
00261   py = pSrc2;
00262 
00263   /* Initialize inputB pointer of type q31 */
00264   pb = (q31_t *) (py - 1u);
00265 
00266   /* count is the index by which the pointer pIn1 to be incremented */
00267   count = 1u;
00268 
00269 
00270   /* --------------------   
00271    * Stage2 process   
00272    * -------------------*/
00273 
00274   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00275    * So, to loop unroll over blockSize2,   
00276    * srcBLen should be greater than or equal to 4 */
00277   if(srcBLen >= 4u)
00278   {
00279     /* Loop unroll over blockSize2, by 4 */
00280     blkCnt = blockSize2 >> 2u;
00281 
00282     while(blkCnt > 0u)
00283     {
00284       /* Set all accumulators to zero */
00285       acc0 = 0;
00286       acc1 = 0;
00287       acc2 = 0;
00288       acc3 = 0;
00289 
00290 
00291       /* read x[0], x[1] samples */
00292       x0 = *(q31_t *) (px++);
00293       /* read x[1], x[2] samples */
00294       x1 = *(q31_t *) (px++);
00295 
00296 
00297       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00298       k = srcBLen >> 2u;
00299 
00300       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00301        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00302       do
00303       {
00304         /* Read the last two inputB samples using SIMD:   
00305          * y[srcBLen - 1] and y[srcBLen - 2] */
00306         c0 = *(pb--);
00307 
00308         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00309         acc0 = __SMLADX(x0, c0, acc0);
00310 
00311         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00312         acc1 = __SMLADX(x1, c0, acc1);
00313 
00314         /* Read x[2], x[3] */
00315         x2 = *(q31_t *) (px++);
00316 
00317         /* Read x[3], x[4] */
00318         x3 = *(q31_t *) (px++);
00319 
00320         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00321         acc2 = __SMLADX(x2, c0, acc2);
00322 
00323         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00324         acc3 = __SMLADX(x3, c0, acc3);
00325 
00326         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00327         c0 = *(pb--);
00328 
00329         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00330         acc0 = __SMLADX(x2, c0, acc0);
00331 
00332         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00333         acc1 = __SMLADX(x3, c0, acc1);
00334 
00335         /* Read x[4], x[5] */
00336         x0 = *(q31_t *) (px++);
00337 
00338         /* Read x[5], x[6] */
00339         x1 = *(q31_t *) (px++);
00340 
00341         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
00342         acc2 = __SMLADX(x0, c0, acc2);
00343 
00344         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
00345         acc3 = __SMLADX(x1, c0, acc3);
00346 
00347       } while(--k);
00348 
00349       /* For the next MAC operations, SIMD is not used   
00350        * So, the 16 bit pointer if inputB, py is updated */
00351       py = (q15_t *) pb;
00352       py = py + 1;
00353 
00354       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00355        ** No loop unrolling is used. */
00356       k = srcBLen % 0x4u;
00357 
00358       if(k == 1u)
00359       {
00360         /* Read y[srcBLen - 5] */
00361         c0 = *(py);
00362 #ifdef  ARM_MATH_BIG_ENDIAN
00363 
00364 //          c0 = unallign_rev(p, c0); 
00365         c0 = c0 << 16;
00366 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00367 
00368         /* Read x[7] */
00369         x3 = *(q31_t *) px++;
00370 
00371         /* Perform the multiply-accumulates */
00372         acc0 = __SMLAD(x0, c0, acc0);
00373         acc1 = __SMLAD(x1, c0, acc1);
00374         acc2 = __SMLADX(x1, c0, acc2);
00375         acc3 = __SMLADX(x3, c0, acc3);
00376       }
00377 
00378       if(k == 2u)
00379       {
00380         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00381         c0 = *(pb);
00382 
00383         /* Read x[7], x[8] */
00384         x3 = *(q31_t *) px++;
00385 
00386         /* Read x[9] */
00387         x2 = *(q31_t *) px++;
00388 
00389         /* Perform the multiply-accumulates */
00390         acc0 = __SMLADX(x0, c0, acc0);
00391         acc1 = __SMLADX(x1, c0, acc1);
00392         acc2 = __SMLADX(x3, c0, acc2);
00393         acc3 = __SMLADX(x2, c0, acc3);
00394       }
00395 
00396       if(k == 3u)
00397       {
00398         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00399         c0 = *pb--;
00400 
00401         /* Read x[7], x[8] */
00402         x3 = *(q31_t *) px++;
00403 
00404         /* Read x[9] */
00405         x2 = *(q31_t *) px++;
00406 
00407         /* Perform the multiply-accumulates */
00408         acc0 = __SMLADX(x0, c0, acc0);
00409         acc1 = __SMLADX(x1, c0, acc1);
00410         acc2 = __SMLADX(x3, c0, acc2);
00411         acc3 = __SMLADX(x2, c0, acc3);
00412 
00413         /* Read y[srcBLen - 7] */
00414 #ifdef  ARM_MATH_BIG_ENDIAN
00415 
00416         c0 = (*pb);
00417 //              c0 = (c0 & 0x0000FFFF)<<16;     
00418         c0 = (c0) << 16;
00419 
00420 #else
00421 
00422         c0 = (q15_t) (*pb >> 16);
00423 
00424 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00425 
00426         /* Read x[10] */
00427         x3 = *(q31_t *) px++;
00428 
00429         /* Perform the multiply-accumulates */
00430         acc0 = __SMLADX(x1, c0, acc0);
00431         acc1 = __SMLAD(x2, c0, acc1);
00432         acc2 = __SMLADX(x2, c0, acc2);
00433         acc3 = __SMLADX(x3, c0, acc3);
00434       }
00435 
00436       /* Store the results in the accumulators in the destination buffer. */
00437 #ifndef ARM_MATH_BIG_ENDIAN
00438 
00439       *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16);
00440       *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16);
00441 
00442 #else
00443 
00444       *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16);
00445       *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16);
00446 
00447 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00448       /* Update the inputA and inputB pointers for next MAC calculation */
00449       px = pIn1 + (count * 4u);
00450       py = pSrc2;
00451       pb = (q31_t *) (py - 1);
00452 
00453       /* Increment the pointer pIn1 index, count by 1 */
00454       count++;
00455 
00456       /* Decrement the loop counter */
00457       blkCnt--;
00458     }
00459 
00460     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
00461      ** No loop unrolling is used. */
00462     blkCnt = blockSize2 % 0x4u;
00463 
00464     while(blkCnt > 0u)
00465     {
00466       /* Accumulator is made zero for every iteration */
00467       sum = 0;
00468 
00469       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00470       k = srcBLen >> 2u;
00471 
00472       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00473        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00474       while(k > 0u)
00475       {
00476         /* Perform the multiply-accumulates */
00477         sum += ((q31_t) * px++ * *py--);
00478         sum += ((q31_t) * px++ * *py--);
00479         sum += ((q31_t) * px++ * *py--);
00480         sum += ((q31_t) * px++ * *py--);
00481 
00482         /* Decrement the loop counter */
00483         k--;
00484       }
00485 
00486       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00487        ** No loop unrolling is used. */
00488       k = srcBLen % 0x4u;
00489 
00490       while(k > 0u)
00491       {
00492         /* Perform the multiply-accumulates */
00493         sum += ((q31_t) * px++ * *py--);
00494 
00495         /* Decrement the loop counter */
00496         k--;
00497       }
00498 
00499       /* Store the result in the accumulator in the destination buffer. */
00500       *pOut++ = (q15_t) (sum >> 15);
00501 
00502       /* Update the inputA and inputB pointers for next MAC calculation */
00503       px = pIn1 + count;
00504       py = pSrc2;
00505 
00506       /* Increment the pointer pIn1 index, count by 1 */
00507       count++;
00508 
00509       /* Decrement the loop counter */
00510       blkCnt--;
00511     }
00512   }
00513   else
00514   {
00515     /* If the srcBLen is not a multiple of 4,   
00516      * the blockSize2 loop cannot be unrolled by 4 */
00517     blkCnt = blockSize2;
00518 
00519     while(blkCnt > 0u)
00520     {
00521       /* Accumulator is made zero for every iteration */
00522       sum = 0;
00523 
00524       /* srcBLen number of MACS should be performed */
00525       k = srcBLen;
00526 
00527       while(k > 0u)
00528       {
00529         /* Perform the multiply-accumulate */
00530         sum += ((q31_t) * px++ * *py--);
00531 
00532         /* Decrement the loop counter */
00533         k--;
00534       }
00535 
00536       /* Store the result in the accumulator in the destination buffer. */
00537       *pOut++ = (q15_t) (sum >> 15);
00538 
00539       /* Update the inputA and inputB pointers for next MAC calculation */
00540       px = pIn1 + count;
00541       py = pSrc2;
00542 
00543       /* Increment the MAC count */
00544       count++;
00545 
00546       /* Decrement the loop counter */
00547       blkCnt--;
00548     }
00549   }
00550 
00551 
00552   /* --------------------------   
00553    * Initializations of stage3   
00554    * -------------------------*/
00555 
00556   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]   
00557    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]   
00558    * ....   
00559    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]   
00560    * sum +=  x[srcALen-1] * y[srcBLen-1]   
00561    */
00562 
00563   /* In this stage the MAC operations are decreased by 1 for every iteration.   
00564      The blockSize3 variable holds the number of MAC operations performed */
00565 
00566   /* Working pointer of inputA */
00567   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00568   px = pSrc1;
00569 
00570   /* Working pointer of inputB */
00571   pSrc2 = pIn2 + (srcBLen - 1u);
00572   pIn2 = pSrc2 - 1u;
00573   py = pIn2;
00574 
00575   /* -------------------   
00576    * Stage3 process   
00577    * ------------------*/
00578 
00579   /* For loop unrolling by 4, this stage is divided into two. */
00580   /* First part of this stage computes the MAC operations greater than 4 */
00581   /* Second part of this stage computes the MAC operations less than or equal to 4 */
00582 
00583   /* The first part of the stage starts here */
00584   j = blockSize3 >> 2u;
00585 
00586   while((j > 0u) && (blockSize3 > 0u))
00587   {
00588     /* Accumulator is made zero for every iteration */
00589     sum = 0;
00590 
00591     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00592     k = blockSize3 >> 2u;
00593 
00594     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00595      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00596     while(k > 0u)
00597     {
00598       /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied   
00599        * with y[srcBLen - 1], y[srcBLen - 2] respectively */
00600       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00601       /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied   
00602        * with y[srcBLen - 3], y[srcBLen - 4] respectively */
00603       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00604 
00605       /* Decrement the loop counter */
00606       k--;
00607     }
00608 
00609     /* For the next MAC operations, the pointer py is used without SIMD   
00610      * So, py is incremented by 1 */
00611     py = py + 1u;
00612 
00613     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.   
00614      ** No loop unrolling is used. */
00615     k = blockSize3 % 0x4u;
00616 
00617     while(k > 0u)
00618     {
00619       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
00620       sum = __SMLAD(*px++, *py--, sum);
00621 
00622       /* Decrement the loop counter */
00623       k--;
00624     }
00625 
00626     /* Store the result in the accumulator in the destination buffer. */
00627     *pOut++ = (q15_t) (sum >> 15);
00628 
00629     /* Update the inputA and inputB pointers for next MAC calculation */
00630     px = ++pSrc1;
00631     py = pIn2;
00632 
00633     /* Decrement the loop counter */
00634     blockSize3--;
00635 
00636     j--;
00637   }
00638 
00639   /* The second part of the stage starts here */
00640   /* SIMD is not used for the next MAC operations,   
00641    * so pointer py is updated to read only one sample at a time */
00642   py = py + 1u;
00643 
00644   while(blockSize3 > 0u)
00645   {
00646     /* Accumulator is made zero for every iteration */
00647     sum = 0;
00648 
00649     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00650     k = blockSize3;
00651 
00652     while(k > 0u)
00653     {
00654       /* Perform the multiply-accumulates */
00655       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00656       sum = __SMLAD(*px++, *py--, sum);
00657 
00658       /* Decrement the loop counter */
00659       k--;
00660     }
00661 
00662     /* Store the result in the accumulator in the destination buffer. */
00663     *pOut++ = (q15_t) (sum >> 15);
00664 
00665     /* Update the inputA and inputB pointers for next MAC calculation */
00666     px = ++pSrc1;
00667     py = pSrc2;
00668 
00669     /* Decrement the loop counter */
00670     blockSize3--;
00671   }
00672 
00673 }
00674