CMSIS DSP Software Library: arm_conv_partial_q15.c source listing

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        15. July 2011  
00005 * $Revision:    V1.0.10  
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_conv_partial_q15.c   
00009 *   
00010 * Description:  Partial convolution of Q15 sequences.  
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Version 1.0.10 2011/7/15 
00015 *    Big Endian support added and Merged M0 and M3/M4 Source code.  
00016 *   
00017 * Version 1.0.3 2010/11/29  
00018 *    Re-organized the CMSIS folders and updated documentation.   
00019 *    
00020 * Version 1.0.2 2010/11/11   
00021 *    Documentation updated.    
00022 *   
00023 * Version 1.0.1 2010/10/05    
00024 *    Production release and review comments incorporated.   
00025 *   
00026 * Version 1.0.0 2010/09/20    
00027 *    Production release and review comments incorporated   
00028 *   
00029 * Version 0.0.7  2010/06/10    
00030 *    Misra-C changes done   
00031 *   
00032 * -------------------------------------------------------------------- */
00033 
00034 #include "arm_math.h"
00035 
00060 arm_status arm_conv_partial_q15(
00061   q15_t * pSrcA,
00062   uint32_t srcALen,
00063   q15_t * pSrcB,
00064   uint32_t srcBLen,
00065   q15_t * pDst,
00066   uint32_t firstIndex,
00067   uint32_t numPoints)
00068 {
00069 
00070 
00071 #ifndef ARM_MATH_CM0
00072 
00073   /* Run the below code for Cortex-M4 and Cortex-M3 */
00074 
00075   q15_t *pIn1;                                   /* inputA pointer               */
00076   q15_t *pIn2;                                   /* inputB pointer               */
00077   q15_t *pOut = pDst;                            /* output pointer               */
00078   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
00079   q15_t *px;                                     /* Intermediate inputA pointer  */
00080   q15_t *py;                                     /* Intermediate inputB pointer  */
00081   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00082   q31_t x0, x1, x2, x3, c0;                      /* Temporary input variables */
00083   uint32_t j, k, count, check, blkCnt;
00084   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter                 */
00085   arm_status status;                             /* status of Partial convolution */
00086   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */
00087 
00088   /* Check for range of output samples to be calculated */
00089   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00090   {
00091     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00092     status = ARM_MATH_ARGUMENT_ERROR;
00093   }
00094   else
00095   {
00096 
00097     /* The algorithm implementation is based on the lengths of the inputs. */
00098     /* srcB is always made to slide across srcA. */
00099     /* So srcBLen is always considered as shorter or equal to srcALen */
00100     if(srcALen >= srcBLen)
00101     {
00102       /* Initialization of inputA pointer */
00103       pIn1 = pSrcA;
00104 
00105       /* Initialization of inputB pointer */
00106       pIn2 = pSrcB;
00107     }
00108     else
00109     {
00110       /* Initialization of inputA pointer */
00111       pIn1 = pSrcB;
00112 
00113       /* Initialization of inputB pointer */
00114       pIn2 = pSrcA;
00115 
00116       /* srcBLen is always considered as shorter or equal to srcALen */
00117       j = srcBLen;
00118       srcBLen = srcALen;
00119       srcALen = j;
00120     }
00121 
00122     /* Conditions to check which loopCounter holds   
00123      * the first and last indices of the output samples to be calculated. */
00124     check = firstIndex + numPoints;
00125     blockSize3 = ((int32_t) check - (int32_t) srcALen);
00126     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
00127     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00128     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00129                                      (int32_t) numPoints) : 0;
00130     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00131                                     (int32_t) firstIndex);
00132     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00133 
00134     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00135     /* The function is internally   
00136      * divided into three stages according to the number of multiplications that has to be   
00137      * taken place between inputA samples and inputB samples. In the first stage of the   
00138      * algorithm, the multiplications increase by one for every iteration.   
00139      * In the second stage of the algorithm, srcBLen number of multiplications are done.   
00140      * In the third stage of the algorithm, the multiplications decrease by one   
00141      * for every iteration. */
00142 
00143     /* Set the output pointer to point to the firstIndex   
00144      * of the output sample to be calculated. */
00145     pOut = pDst + firstIndex;
00146 
00147     /* --------------------------   
00148      * Initializations of stage1   
00149      * -------------------------*/
00150 
00151     /* sum = x[0] * y[0]   
00152      * sum = x[0] * y[1] + x[1] * y[0]   
00153      * ....   
00154      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]   
00155      */
00156 
00157     /* In this stage the MAC operations are increased by 1 for every iteration.   
00158        The count variable holds the number of MAC operations performed.   
00159        Since the partial convolution starts from firstIndex   
00160        Number of Macs to be performed is firstIndex + 1 */
00161     count = 1u + firstIndex;
00162 
00163     /* Working pointer of inputA */
00164     px = pIn1;
00165 
00166     /* Working pointer of inputB */
00167     pSrc2 = pIn2 + firstIndex;
00168     py = pSrc2;
00169 
00170     /* ------------------------   
00171      * Stage1 process   
00172      * ----------------------*/
00173 
00174     /* For loop unrolling by 4, this stage is divided into two. */
00175     /* First part of this stage computes the MAC operations less than 4 */
00176     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00177 
00178     /* The first part of the stage starts here */
00179     while((count < 4u) && (blockSize1 > 0))
00180     {
00181       /* Accumulator is made zero for every iteration */
00182       sum = 0;
00183 
00184       /* Loop over number of MAC operations between   
00185        * inputA samples and inputB samples */
00186       k = count;
00187 
00188       while(k > 0u)
00189       {
00190         /* Perform the multiply-accumulates */
00191         sum = __SMLALD(*px++, *py--, sum);
00192 
00193         /* Decrement the loop counter */
00194         k--;
00195       }
00196 
00197       /* Store the result in the accumulator in the destination buffer. */
00198       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00199 
00200       /* Update the inputA and inputB pointers for next MAC calculation */
00201       py = ++pSrc2;
00202       px = pIn1;
00203 
00204       /* Increment the MAC count */
00205       count++;
00206 
00207       /* Decrement the loop counter */
00208       blockSize1--;
00209     }
00210 
00211     /* The second part of the stage starts here */
00212     /* The internal loop, over count, is unrolled by 4 */
00213     /* To, read the last two inputB samples using SIMD:   
00214      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
00215     py = py - 1;
00216 
00217     while(blockSize1 > 0)
00218     {
00219       /* Accumulator is made zero for every iteration */
00220       sum = 0;
00221 
00222       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00223       k = count >> 2u;
00224 
00225       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00226        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00227       while(k > 0u)
00228       {
00229         /* Perform the multiply-accumulates */
00230         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
00231         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00232         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
00233         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00234 
00235         /* Decrement the loop counter */
00236         k--;
00237       }
00238 
00239       /* For the next MAC operations, the pointer py is used without SIMD   
00240        * So, py is incremented by 1 */
00241       py = py + 1u;
00242 
00243       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00244        ** No loop unrolling is used. */
00245       k = count % 0x4u;
00246 
00247       while(k > 0u)
00248       {
00249         /* Perform the multiply-accumulates */
00250         sum = __SMLALD(*px++, *py--, sum);
00251 
00252         /* Decrement the loop counter */
00253         k--;
00254       }
00255 
00256       /* Store the result in the accumulator in the destination buffer. */
00257       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00258 
00259       /* Update the inputA and inputB pointers for next MAC calculation */
00260       py = ++pSrc2 - 1u;
00261       px = pIn1;
00262 
00263       /* Increment the MAC count */
00264       count++;
00265 
00266       /* Decrement the loop counter */
00267       blockSize1--;
00268     }
00269 
00270     /* --------------------------   
00271      * Initializations of stage2   
00272      * ------------------------*/
00273 
00274     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]   
00275      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]   
00276      * ....   
00277      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]   
00278      */
00279 
00280     /* Working pointer of inputA */
00281     px = pIn1;
00282 
00283     /* Working pointer of inputB */
00284     pSrc2 = pIn2 + (srcBLen - 1u);
00285     py = pSrc2;
00286 
00287     /* Initialize inputB pointer of type q31 */
00288     pb = (q31_t *) (py - 1u);
00289 
00290     /* count is the index by which the pointer pIn1 to be incremented */
00291     count = 1u;
00292 
00293 
00294     /* --------------------   
00295      * Stage2 process   
00296      * -------------------*/
00297 
00298     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00299      * So, to loop unroll over blockSize2,   
00300      * srcBLen should be greater than or equal to 4 */
00301     if(srcBLen >= 4u)
00302     {
00303       /* Loop unroll over blockSize2, by 4 */
00304       blkCnt = ((uint32_t) blockSize2 >> 2u);
00305 
00306       while(blkCnt > 0u)
00307       {
00308         /* Set all accumulators to zero */
00309         acc0 = 0;
00310         acc1 = 0;
00311         acc2 = 0;
00312         acc3 = 0;
00313 
00314 
00315         /* read x[0], x[1] samples */
00316         x0 = *(q31_t *) (px++);
00317         /* read x[1], x[2] samples */
00318         x1 = *(q31_t *) (px++);
00319 
00320 
00321         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00322         k = srcBLen >> 2u;
00323 
00324         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00325          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00326         do
00327         {
00328           /* Read the last two inputB samples using SIMD:   
00329            * y[srcBLen - 1] and y[srcBLen - 2] */
00330           c0 = *(pb--);
00331 
00332           /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00333           acc0 = __SMLALDX(x0, c0, acc0);
00334 
00335           /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00336           acc1 = __SMLALDX(x1, c0, acc1);
00337 
00338           /* Read x[2], x[3] */
00339           x2 = *(q31_t *) (px++);
00340 
00341           /* Read x[3], x[4] */
00342           x3 = *(q31_t *) (px++);
00343 
00344           /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00345           acc2 = __SMLALDX(x2, c0, acc2);
00346 
00347           /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00348           acc3 = __SMLALDX(x3, c0, acc3);
00349 
00350           /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00351           c0 = *(pb--);
00352 
00353           /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00354           acc0 = __SMLALDX(x2, c0, acc0);
00355 
00356           /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00357           acc1 = __SMLALDX(x3, c0, acc1);
00358 
00359           /* Read x[4], x[5] */
00360           x0 = *(q31_t *) (px++);
00361 
00362           /* Read x[5], x[6] */
00363           x1 = *(q31_t *) (px++);
00364 
00365           /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
00366           acc2 = __SMLALDX(x0, c0, acc2);
00367 
00368           /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
00369           acc3 = __SMLALDX(x1, c0, acc3);
00370 
00371         } while(--k);
00372 
00373         /* For the next MAC operations, SIMD is not used   
00374          * So, the 16 bit pointer if inputB, py is updated */
00375         py = (q15_t *) pb;
00376         py = py + 1;
00377 
00378         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00379          ** No loop unrolling is used. */
00380         k = srcBLen % 0x4u;
00381 
00382         if(k == 1u)
00383         {
00384           /* Read y[srcBLen - 5] */
00385           c0 = *(py);
00386 
00387 #ifdef  ARM_MATH_BIG_ENDIAN
00388 
00389           c0 = c0 << 16u;
00390 
00391 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00392           /* Read x[7] */
00393           x3 = *(q31_t *) px++;
00394 
00395           /* Perform the multiply-accumulates */
00396           acc0 = __SMLALD(x0, c0, acc0);
00397           acc1 = __SMLALD(x1, c0, acc1);
00398           acc2 = __SMLALDX(x1, c0, acc2);
00399           acc3 = __SMLALDX(x3, c0, acc3);
00400         }
00401 
00402         if(k == 2u)
00403         {
00404           /* Read y[srcBLen - 5], y[srcBLen - 6] */
00405           c0 = *(pb);
00406 
00407           /* Read x[7], x[8] */
00408           x3 = *(q31_t *) px++;
00409 
00410           /* Read x[9] */
00411           x2 = *(q31_t *) px++;
00412 
00413           /* Perform the multiply-accumulates */
00414           acc0 = __SMLALDX(x0, c0, acc0);
00415           acc1 = __SMLALDX(x1, c0, acc1);
00416           acc2 = __SMLALDX(x3, c0, acc2);
00417           acc3 = __SMLALDX(x2, c0, acc3);
00418         }
00419 
00420         if(k == 3u)
00421         {
00422           /* Read y[srcBLen - 5], y[srcBLen - 6] */
00423           c0 = *pb--;
00424 
00425           /* Read x[7], x[8] */
00426           x3 = *(q31_t *) px++;
00427 
00428           /* Read x[9] */
00429           x2 = *(q31_t *) px++;
00430 
00431           /* Perform the multiply-accumulates */
00432           acc0 = __SMLALDX(x0, c0, acc0);
00433           acc1 = __SMLALDX(x1, c0, acc1);
00434           acc2 = __SMLALDX(x3, c0, acc2);
00435           acc3 = __SMLALDX(x2, c0, acc3);
00436 
00437 #ifdef  ARM_MATH_BIG_ENDIAN
00438 
00439           /* Read y[srcBLen - 7] */
00440           c0 = (*pb);
00441           c0 = (c0) << 16;
00442 
00443 #else
00444 
00445           /* Read y[srcBLen - 7] */
00446           c0 = (q15_t) (*pb >> 16);
00447 
00448 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00449 
00450           /* Read x[10] */
00451           x3 = *(q31_t *) px++;
00452 
00453           /* Perform the multiply-accumulates */
00454           acc0 = __SMLALDX(x1, c0, acc0);
00455           acc1 = __SMLALD(x2, c0, acc1);
00456           acc2 = __SMLALDX(x2, c0, acc2);
00457           acc3 = __SMLALDX(x3, c0, acc3);
00458         }
00459 
00460         /* Store the results in the accumulators in the destination buffer. */
00461 #ifndef  ARM_MATH_BIG_ENDIAN
00462 
00463         *__SIMD32(pOut)++ =
00464           __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00465         *__SIMD32(pOut)++ =
00466           __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00467 
00468 #else
00469 
00470         *__SIMD32(pOut)++ =
00471           __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00472         *__SIMD32(pOut)++ =
00473           __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00474 
00475 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00476 
00477         /* Update the inputA and inputB pointers for next MAC calculation */
00478         px = pIn1 + (count * 4u);
00479         py = pSrc2;
00480         pb = (q31_t *) (py - 1);
00481 
00482         /* Increment the pointer pIn1 index, count by 1 */
00483         count++;
00484 
00485         /* Decrement the loop counter */
00486         blkCnt--;
00487       }
00488 
00489       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
00490        ** No loop unrolling is used. */
00491       blkCnt = (uint32_t) blockSize2 % 0x4u;
00492 
00493       while(blkCnt > 0u)
00494       {
00495         /* Accumulator is made zero for every iteration */
00496         sum = 0;
00497 
00498         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00499         k = srcBLen >> 2u;
00500 
00501         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00502          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00503         while(k > 0u)
00504         {
00505           /* Perform the multiply-accumulates */
00506           sum += (q63_t) ((q31_t) * px++ * *py--);
00507           sum += (q63_t) ((q31_t) * px++ * *py--);
00508           sum += (q63_t) ((q31_t) * px++ * *py--);
00509           sum += (q63_t) ((q31_t) * px++ * *py--);
00510 
00511           /* Decrement the loop counter */
00512           k--;
00513         }
00514 
00515         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00516          ** No loop unrolling is used. */
00517         k = srcBLen % 0x4u;
00518 
00519         while(k > 0u)
00520         {
00521           /* Perform the multiply-accumulates */
00522           sum += (q63_t) ((q31_t) * px++ * *py--);
00523 
00524           /* Decrement the loop counter */
00525           k--;
00526         }
00527 
00528         /* Store the result in the accumulator in the destination buffer. */
00529         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
00530 
00531         /* Update the inputA and inputB pointers for next MAC calculation */
00532         px = pIn1 + count;
00533         py = pSrc2;
00534 
00535         /* Increment the pointer pIn1 index, count by 1 */
00536         count++;
00537 
00538         /* Decrement the loop counter */
00539         blkCnt--;
00540       }
00541     }
00542     else
00543     {
00544       /* If the srcBLen is not a multiple of 4,   
00545        * the blockSize2 loop cannot be unrolled by 4 */
00546       blkCnt = (uint32_t) blockSize2;
00547 
00548       while(blkCnt > 0u)
00549       {
00550         /* Accumulator is made zero for every iteration */
00551         sum = 0;
00552 
00553         /* srcBLen number of MACS should be performed */
00554         k = srcBLen;
00555 
00556         while(k > 0u)
00557         {
00558           /* Perform the multiply-accumulate */
00559           sum += (q63_t) ((q31_t) * px++ * *py--);
00560 
00561           /* Decrement the loop counter */
00562           k--;
00563         }
00564 
00565         /* Store the result in the accumulator in the destination buffer. */
00566         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
00567 
00568         /* Update the inputA and inputB pointers for next MAC calculation */
00569         px = pIn1 + count;
00570         py = pSrc2;
00571 
00572         /* Increment the MAC count */
00573         count++;
00574 
00575         /* Decrement the loop counter */
00576         blkCnt--;
00577       }
00578     }
00579 
00580 
00581     /* --------------------------   
00582      * Initializations of stage3   
00583      * -------------------------*/
00584 
00585     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]   
00586      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]   
00587      * ....   
00588      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]   
00589      * sum +=  x[srcALen-1] * y[srcBLen-1]   
00590      */
00591 
00592     /* In this stage the MAC operations are decreased by 1 for every iteration.   
00593        The count variable holds the number of MAC operations performed */
00594     count = srcBLen - 1u;
00595 
00596     /* Working pointer of inputA */
00597     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00598     px = pSrc1;
00599 
00600     /* Working pointer of inputB */
00601     pSrc2 = pIn2 + (srcBLen - 1u);
00602     pIn2 = pSrc2 - 1u;
00603     py = pIn2;
00604 
00605     /* -------------------   
00606      * Stage3 process   
00607      * ------------------*/
00608 
00609     /* For loop unrolling by 4, this stage is divided into two. */
00610     /* First part of this stage computes the MAC operations greater than 4 */
00611     /* Second part of this stage computes the MAC operations less than or equal to 4 */
00612 
00613     /* The first part of the stage starts here */
00614     j = count >> 2u;
00615 
00616     while((j > 0u) && (blockSize3 > 0))
00617     {
00618       /* Accumulator is made zero for every iteration */
00619       sum = 0;
00620 
00621       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00622       k = count >> 2u;
00623 
00624       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00625        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00626       while(k > 0u)
00627       {
00628         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied   
00629          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
00630         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00631         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied   
00632          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
00633         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00634 
00635         /* Decrement the loop counter */
00636         k--;
00637       }
00638 
00639       /* For the next MAC operations, the pointer py is used without SIMD   
00640        * So, py is incremented by 1 */
00641       py = py + 1u;
00642 
00643       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00644        ** No loop unrolling is used. */
00645       k = count % 0x4u;
00646 
00647       while(k > 0u)
00648       {
00649         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
00650         sum = __SMLALD(*px++, *py--, sum);
00651 
00652         /* Decrement the loop counter */
00653         k--;
00654       }
00655 
00656       /* Store the result in the accumulator in the destination buffer. */
00657       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00658 
00659       /* Update the inputA and inputB pointers for next MAC calculation */
00660       px = ++pSrc1;
00661       py = pIn2;
00662 
00663       /* Decrement the MAC count */
00664       count--;
00665 
00666       /* Decrement the loop counter */
00667       blockSize3--;
00668 
00669       j--;
00670     }
00671 
00672     /* The second part of the stage starts here */
00673     /* SIMD is not used for the next MAC operations,   
00674      * so pointer py is updated to read only one sample at a time */
00675     py = py + 1u;
00676 
00677     while(blockSize3 > 0)
00678     {
00679       /* Accumulator is made zero for every iteration */
00680       sum = 0;
00681 
00682       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00683       k = count;
00684 
00685       while(k > 0u)
00686       {
00687         /* Perform the multiply-accumulates */
00688         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00689         sum = __SMLALD(*px++, *py--, sum);
00690 
00691         /* Decrement the loop counter */
00692         k--;
00693       }
00694 
00695       /* Store the result in the accumulator in the destination buffer. */
00696       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00697 
00698       /* Update the inputA and inputB pointers for next MAC calculation */
00699       px = ++pSrc1;
00700       py = pSrc2;
00701 
00702       /* Decrement the MAC count */
00703       count--;
00704 
00705       /* Decrement the loop counter */
00706       blockSize3--;
00707     }
00708 
00709     /* set status as ARM_MATH_SUCCESS */
00710     status = ARM_MATH_SUCCESS;
00711   }
00712 
00713   /* Return to application */
00714   return (status);
00715 
00716 #else
00717 
00718   /* Run the below code for Cortex-M0 */
00719 
00720   q15_t *pIn1 = pSrcA;                           /* inputA pointer */
00721   q15_t *pIn2 = pSrcB;                           /* inputB pointer */
00722   q63_t sum;                                     /* Accumulator */
00723   uint32_t i, j;                                 /* loop counters */
00724   arm_status status;                             /* status of Partial convolution */
00725 
00726   /* Check for range of output samples to be calculated */
00727   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00728   {
00729     /* Set status as ARM_ARGUMENT_ERROR */
00730     status = ARM_MATH_ARGUMENT_ERROR;
00731   }
00732   else
00733   {
00734     /* Loop to calculate convolution for output length number of values */
00735     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
00736     {
00737       /* Initialize sum with zero to carry on MAC operations */
00738       sum = 0;
00739 
00740       /* Loop to perform MAC operations according to convolution equation */
00741       for (j = 0; j <= i; j++)
00742       {
00743         /* Check the array limitations */
00744         if(((i - j) < srcBLen) && (j < srcALen))
00745         {
00746           /* z[i] += x[i-j] * y[j] */
00747           sum += ((q31_t) pIn1[j] * (pIn2[i - j]));
00748         }
00749       }
00750 
00751       /* Store the output in the destination buffer */
00752       pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u);
00753     }
00754     /* set status as ARM_SUCCESS as there are no argument errors */
00755     status = ARM_MATH_SUCCESS;
00756   }
00757   return (status);
00758 
00759 #endif /*     #ifndef ARM_MATH_CM0      */
00760 
00761 }
00762