00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_q15.c 00009 * 00010 * Description: Partial convolution of Q15 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated 00028 * 00029 * Version 0.0.7 2010/06/10 00030 * Misra-C changes done 00031 * 00032 * -------------------------------------------------------------------- */ 00033 00034 #include "arm_math.h" 00035 00060 arm_status arm_conv_partial_q15( 00061 q15_t * pSrcA, 00062 uint32_t srcALen, 00063 q15_t * pSrcB, 00064 uint32_t srcBLen, 00065 q15_t * pDst, 00066 uint32_t firstIndex, 00067 uint32_t numPoints) 00068 { 00069 00070 00071 #ifndef ARM_MATH_CM0 00072 00073 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00074 00075 q15_t *pIn1; /* inputA pointer */ 00076 q15_t *pIn2; /* inputB pointer */ 00077 q15_t *pOut = pDst; /* output pointer */ 00078 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00079 q15_t *px; /* Intermediate inputA pointer */ 00080 q15_t *py; /* Intermediate inputB pointer */ 00081 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00082 q31_t x0, x1, x2, x3, c0; /* Temporary input variables */ 00083 uint32_t j, k, count, check, blkCnt; 00084 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00085 arm_status status; /* status of Partial convolution */ 00086 q31_t *pb; /* 32 bit pointer for inputB buffer */ 00087 00088 /* Check for range of output samples to be calculated */ 00089 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00090 { 00091 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00092 status = ARM_MATH_ARGUMENT_ERROR; 00093 } 00094 else 00095 { 00096 00097 /* The algorithm implementation is based on the lengths of the inputs. */ 00098 /* srcB is always made to slide across srcA. */ 00099 /* So srcBLen is always considered as shorter or equal to srcALen */ 00100 if(srcALen >= srcBLen) 00101 { 00102 /* Initialization of inputA pointer */ 00103 pIn1 = pSrcA; 00104 00105 /* Initialization of inputB pointer */ 00106 pIn2 = pSrcB; 00107 } 00108 else 00109 { 00110 /* Initialization of inputA pointer */ 00111 pIn1 = pSrcB; 00112 00113 /* Initialization of inputB pointer */ 00114 pIn2 = pSrcA; 00115 00116 /* srcBLen is always considered as shorter or equal to srcALen */ 00117 j = srcBLen; 00118 srcBLen = srcALen; 00119 srcALen = j; 00120 } 00121 00122 /* Conditions to check which loopCounter holds 00123 * the first and last indices of the output samples to be calculated. */ 00124 check = firstIndex + numPoints; 00125 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00126 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00127 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00128 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00129 (int32_t) numPoints) : 0; 00130 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00131 (int32_t) firstIndex); 00132 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00133 00134 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00135 /* The function is internally 00136 * divided into three stages according to the number of multiplications that has to be 00137 * taken place between inputA samples and inputB samples. In the first stage of the 00138 * algorithm, the multiplications increase by one for every iteration. 00139 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00140 * In the third stage of the algorithm, the multiplications decrease by one 00141 * for every iteration. */ 00142 00143 /* Set the output pointer to point to the firstIndex 00144 * of the output sample to be calculated. */ 00145 pOut = pDst + firstIndex; 00146 00147 /* -------------------------- 00148 * Initializations of stage1 00149 * -------------------------*/ 00150 00151 /* sum = x[0] * y[0] 00152 * sum = x[0] * y[1] + x[1] * y[0] 00153 * .... 00154 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00155 */ 00156 00157 /* In this stage the MAC operations are increased by 1 for every iteration. 00158 The count variable holds the number of MAC operations performed. 00159 Since the partial convolution starts from firstIndex 00160 Number of Macs to be performed is firstIndex + 1 */ 00161 count = 1u + firstIndex; 00162 00163 /* Working pointer of inputA */ 00164 px = pIn1; 00165 00166 /* Working pointer of inputB */ 00167 pSrc2 = pIn2 + firstIndex; 00168 py = pSrc2; 00169 00170 /* ------------------------ 00171 * Stage1 process 00172 * ----------------------*/ 00173 00174 /* For loop unrolling by 4, this stage is divided into two. */ 00175 /* First part of this stage computes the MAC operations less than 4 */ 00176 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00177 00178 /* The first part of the stage starts here */ 00179 while((count < 4u) && (blockSize1 > 0)) 00180 { 00181 /* Accumulator is made zero for every iteration */ 00182 sum = 0; 00183 00184 /* Loop over number of MAC operations between 00185 * inputA samples and inputB samples */ 00186 k = count; 00187 00188 while(k > 0u) 00189 { 00190 /* Perform the multiply-accumulates */ 00191 sum = __SMLALD(*px++, *py--, sum); 00192 00193 /* Decrement the loop counter */ 00194 k--; 00195 } 00196 00197 /* Store the result in the accumulator in the destination buffer. */ 00198 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00199 00200 /* Update the inputA and inputB pointers for next MAC calculation */ 00201 py = ++pSrc2; 00202 px = pIn1; 00203 00204 /* Increment the MAC count */ 00205 count++; 00206 00207 /* Decrement the loop counter */ 00208 blockSize1--; 00209 } 00210 00211 /* The second part of the stage starts here */ 00212 /* The internal loop, over count, is unrolled by 4 */ 00213 /* To, read the last two inputB samples using SIMD: 00214 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00215 py = py - 1; 00216 00217 while(blockSize1 > 0) 00218 { 00219 /* Accumulator is made zero for every iteration */ 00220 sum = 0; 00221 00222 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00223 k = count >> 2u; 00224 00225 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00226 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00227 while(k > 0u) 00228 { 00229 /* Perform the multiply-accumulates */ 00230 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00231 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00232 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00233 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00234 00235 /* Decrement the loop counter */ 00236 k--; 00237 } 00238 00239 /* For the next MAC operations, the pointer py is used without SIMD 00240 * So, py is incremented by 1 */ 00241 py = py + 1u; 00242 00243 /* If the count is not a multiple of 4, compute any remaining MACs here. 00244 ** No loop unrolling is used. */ 00245 k = count % 0x4u; 00246 00247 while(k > 0u) 00248 { 00249 /* Perform the multiply-accumulates */ 00250 sum = __SMLALD(*px++, *py--, sum); 00251 00252 /* Decrement the loop counter */ 00253 k--; 00254 } 00255 00256 /* Store the result in the accumulator in the destination buffer. */ 00257 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00258 00259 /* Update the inputA and inputB pointers for next MAC calculation */ 00260 py = ++pSrc2 - 1u; 00261 px = pIn1; 00262 00263 /* Increment the MAC count */ 00264 count++; 00265 00266 /* Decrement the loop counter */ 00267 blockSize1--; 00268 } 00269 00270 /* -------------------------- 00271 * Initializations of stage2 00272 * ------------------------*/ 00273 00274 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00275 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00276 * .... 00277 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00278 */ 00279 00280 /* Working pointer of inputA */ 00281 px = pIn1; 00282 00283 /* Working pointer of inputB */ 00284 pSrc2 = pIn2 + (srcBLen - 1u); 00285 py = pSrc2; 00286 00287 /* Initialize inputB pointer of type q31 */ 00288 pb = (q31_t *) (py - 1u); 00289 00290 /* count is the index by which the pointer pIn1 to be incremented */ 00291 count = 1u; 00292 00293 00294 /* -------------------- 00295 * Stage2 process 00296 * -------------------*/ 00297 00298 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00299 * So, to loop unroll over blockSize2, 00300 * srcBLen should be greater than or equal to 4 */ 00301 if(srcBLen >= 4u) 00302 { 00303 /* Loop unroll over blockSize2, by 4 */ 00304 blkCnt = ((uint32_t) blockSize2 >> 2u); 00305 00306 while(blkCnt > 0u) 00307 { 00308 /* Set all accumulators to zero */ 00309 acc0 = 0; 00310 acc1 = 0; 00311 acc2 = 0; 00312 acc3 = 0; 00313 00314 00315 /* read x[0], x[1] samples */ 00316 x0 = *(q31_t *) (px++); 00317 /* read x[1], x[2] samples */ 00318 x1 = *(q31_t *) (px++); 00319 00320 00321 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00322 k = srcBLen >> 2u; 00323 00324 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00325 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00326 do 00327 { 00328 /* Read the last two inputB samples using SIMD: 00329 * y[srcBLen - 1] and y[srcBLen - 2] */ 00330 c0 = *(pb--); 00331 00332 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00333 acc0 = __SMLALDX(x0, c0, acc0); 00334 00335 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00336 acc1 = __SMLALDX(x1, c0, acc1); 00337 00338 /* Read x[2], x[3] */ 00339 x2 = *(q31_t *) (px++); 00340 00341 /* Read x[3], x[4] */ 00342 x3 = *(q31_t *) (px++); 00343 00344 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00345 acc2 = __SMLALDX(x2, c0, acc2); 00346 00347 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00348 acc3 = __SMLALDX(x3, c0, acc3); 00349 00350 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00351 c0 = *(pb--); 00352 00353 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00354 acc0 = __SMLALDX(x2, c0, acc0); 00355 00356 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00357 acc1 = __SMLALDX(x3, c0, acc1); 00358 00359 /* Read x[4], x[5] */ 00360 x0 = *(q31_t *) (px++); 00361 00362 /* Read x[5], x[6] */ 00363 x1 = *(q31_t *) (px++); 00364 00365 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00366 acc2 = __SMLALDX(x0, c0, acc2); 00367 00368 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00369 acc3 = __SMLALDX(x1, c0, acc3); 00370 00371 } while(--k); 00372 00373 /* For the next MAC operations, SIMD is not used 00374 * So, the 16 bit pointer if inputB, py is updated */ 00375 py = (q15_t *) pb; 00376 py = py + 1; 00377 00378 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00379 ** No loop unrolling is used. */ 00380 k = srcBLen % 0x4u; 00381 00382 if(k == 1u) 00383 { 00384 /* Read y[srcBLen - 5] */ 00385 c0 = *(py); 00386 00387 #ifdef ARM_MATH_BIG_ENDIAN 00388 00389 c0 = c0 << 16u; 00390 00391 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00392 /* Read x[7] */ 00393 x3 = *(q31_t *) px++; 00394 00395 /* Perform the multiply-accumulates */ 00396 acc0 = __SMLALD(x0, c0, acc0); 00397 acc1 = __SMLALD(x1, c0, acc1); 00398 acc2 = __SMLALDX(x1, c0, acc2); 00399 acc3 = __SMLALDX(x3, c0, acc3); 00400 } 00401 00402 if(k == 2u) 00403 { 00404 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00405 c0 = *(pb); 00406 00407 /* Read x[7], x[8] */ 00408 x3 = *(q31_t *) px++; 00409 00410 /* Read x[9] */ 00411 x2 = *(q31_t *) px++; 00412 00413 /* Perform the multiply-accumulates */ 00414 acc0 = __SMLALDX(x0, c0, acc0); 00415 acc1 = __SMLALDX(x1, c0, acc1); 00416 acc2 = __SMLALDX(x3, c0, acc2); 00417 acc3 = __SMLALDX(x2, c0, acc3); 00418 } 00419 00420 if(k == 3u) 00421 { 00422 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00423 c0 = *pb--; 00424 00425 /* Read x[7], x[8] */ 00426 x3 = *(q31_t *) px++; 00427 00428 /* Read x[9] */ 00429 x2 = *(q31_t *) px++; 00430 00431 /* Perform the multiply-accumulates */ 00432 acc0 = __SMLALDX(x0, c0, acc0); 00433 acc1 = __SMLALDX(x1, c0, acc1); 00434 acc2 = __SMLALDX(x3, c0, acc2); 00435 acc3 = __SMLALDX(x2, c0, acc3); 00436 00437 #ifdef ARM_MATH_BIG_ENDIAN 00438 00439 /* Read y[srcBLen - 7] */ 00440 c0 = (*pb); 00441 c0 = (c0) << 16; 00442 00443 #else 00444 00445 /* Read y[srcBLen - 7] */ 00446 c0 = (q15_t) (*pb >> 16); 00447 00448 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00449 00450 /* Read x[10] */ 00451 x3 = *(q31_t *) px++; 00452 00453 /* Perform the multiply-accumulates */ 00454 acc0 = __SMLALDX(x1, c0, acc0); 00455 acc1 = __SMLALD(x2, c0, acc1); 00456 acc2 = __SMLALDX(x2, c0, acc2); 00457 acc3 = __SMLALDX(x3, c0, acc3); 00458 } 00459 00460 /* Store the results in the accumulators in the destination buffer. */ 00461 #ifndef ARM_MATH_BIG_ENDIAN 00462 00463 *__SIMD32(pOut)++ = 00464 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00465 *__SIMD32(pOut)++ = 00466 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00467 00468 #else 00469 00470 *__SIMD32(pOut)++ = 00471 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00472 *__SIMD32(pOut)++ = 00473 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00474 00475 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00476 00477 /* Update the inputA and inputB pointers for next MAC calculation */ 00478 px = pIn1 + (count * 4u); 00479 py = pSrc2; 00480 pb = (q31_t *) (py - 1); 00481 00482 /* Increment the pointer pIn1 index, count by 1 */ 00483 count++; 00484 00485 /* Decrement the loop counter */ 00486 blkCnt--; 00487 } 00488 00489 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00490 ** No loop unrolling is used. */ 00491 blkCnt = (uint32_t) blockSize2 % 0x4u; 00492 00493 while(blkCnt > 0u) 00494 { 00495 /* Accumulator is made zero for every iteration */ 00496 sum = 0; 00497 00498 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00499 k = srcBLen >> 2u; 00500 00501 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00502 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00503 while(k > 0u) 00504 { 00505 /* Perform the multiply-accumulates */ 00506 sum += (q63_t) ((q31_t) * px++ * *py--); 00507 sum += (q63_t) ((q31_t) * px++ * *py--); 00508 sum += (q63_t) ((q31_t) * px++ * *py--); 00509 sum += (q63_t) ((q31_t) * px++ * *py--); 00510 00511 /* Decrement the loop counter */ 00512 k--; 00513 } 00514 00515 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00516 ** No loop unrolling is used. */ 00517 k = srcBLen % 0x4u; 00518 00519 while(k > 0u) 00520 { 00521 /* Perform the multiply-accumulates */ 00522 sum += (q63_t) ((q31_t) * px++ * *py--); 00523 00524 /* Decrement the loop counter */ 00525 k--; 00526 } 00527 00528 /* Store the result in the accumulator in the destination buffer. */ 00529 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00530 00531 /* Update the inputA and inputB pointers for next MAC calculation */ 00532 px = pIn1 + count; 00533 py = pSrc2; 00534 00535 /* Increment the pointer pIn1 index, count by 1 */ 00536 count++; 00537 00538 /* Decrement the loop counter */ 00539 blkCnt--; 00540 } 00541 } 00542 else 00543 { 00544 /* If the srcBLen is not a multiple of 4, 00545 * the blockSize2 loop cannot be unrolled by 4 */ 00546 blkCnt = (uint32_t) blockSize2; 00547 00548 while(blkCnt > 0u) 00549 { 00550 /* Accumulator is made zero for every iteration */ 00551 sum = 0; 00552 00553 /* srcBLen number of MACS should be performed */ 00554 k = srcBLen; 00555 00556 while(k > 0u) 00557 { 00558 /* Perform the multiply-accumulate */ 00559 sum += (q63_t) ((q31_t) * px++ * *py--); 00560 00561 /* Decrement the loop counter */ 00562 k--; 00563 } 00564 00565 /* Store the result in the accumulator in the destination buffer. */ 00566 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00567 00568 /* Update the inputA and inputB pointers for next MAC calculation */ 00569 px = pIn1 + count; 00570 py = pSrc2; 00571 00572 /* Increment the MAC count */ 00573 count++; 00574 00575 /* Decrement the loop counter */ 00576 blkCnt--; 00577 } 00578 } 00579 00580 00581 /* -------------------------- 00582 * Initializations of stage3 00583 * -------------------------*/ 00584 00585 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00586 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00587 * .... 00588 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00589 * sum += x[srcALen-1] * y[srcBLen-1] 00590 */ 00591 00592 /* In this stage the MAC operations are decreased by 1 for every iteration. 00593 The count variable holds the number of MAC operations performed */ 00594 count = srcBLen - 1u; 00595 00596 /* Working pointer of inputA */ 00597 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00598 px = pSrc1; 00599 00600 /* Working pointer of inputB */ 00601 pSrc2 = pIn2 + (srcBLen - 1u); 00602 pIn2 = pSrc2 - 1u; 00603 py = pIn2; 00604 00605 /* ------------------- 00606 * Stage3 process 00607 * ------------------*/ 00608 00609 /* For loop unrolling by 4, this stage is divided into two. */ 00610 /* First part of this stage computes the MAC operations greater than 4 */ 00611 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00612 00613 /* The first part of the stage starts here */ 00614 j = count >> 2u; 00615 00616 while((j > 0u) && (blockSize3 > 0)) 00617 { 00618 /* Accumulator is made zero for every iteration */ 00619 sum = 0; 00620 00621 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00622 k = count >> 2u; 00623 00624 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00625 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00626 while(k > 0u) 00627 { 00628 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00629 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00630 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00631 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00632 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00633 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00634 00635 /* Decrement the loop counter */ 00636 k--; 00637 } 00638 00639 /* For the next MAC operations, the pointer py is used without SIMD 00640 * So, py is incremented by 1 */ 00641 py = py + 1u; 00642 00643 /* If the count is not a multiple of 4, compute any remaining MACs here. 00644 ** No loop unrolling is used. */ 00645 k = count % 0x4u; 00646 00647 while(k > 0u) 00648 { 00649 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00650 sum = __SMLALD(*px++, *py--, sum); 00651 00652 /* Decrement the loop counter */ 00653 k--; 00654 } 00655 00656 /* Store the result in the accumulator in the destination buffer. */ 00657 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00658 00659 /* Update the inputA and inputB pointers for next MAC calculation */ 00660 px = ++pSrc1; 00661 py = pIn2; 00662 00663 /* Decrement the MAC count */ 00664 count--; 00665 00666 /* Decrement the loop counter */ 00667 blockSize3--; 00668 00669 j--; 00670 } 00671 00672 /* The second part of the stage starts here */ 00673 /* SIMD is not used for the next MAC operations, 00674 * so pointer py is updated to read only one sample at a time */ 00675 py = py + 1u; 00676 00677 while(blockSize3 > 0) 00678 { 00679 /* Accumulator is made zero for every iteration */ 00680 sum = 0; 00681 00682 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00683 k = count; 00684 00685 while(k > 0u) 00686 { 00687 /* Perform the multiply-accumulates */ 00688 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00689 sum = __SMLALD(*px++, *py--, sum); 00690 00691 /* Decrement the loop counter */ 00692 k--; 00693 } 00694 00695 /* Store the result in the accumulator in the destination buffer. */ 00696 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00697 00698 /* Update the inputA and inputB pointers for next MAC calculation */ 00699 px = ++pSrc1; 00700 py = pSrc2; 00701 00702 /* Decrement the MAC count */ 00703 count--; 00704 00705 /* Decrement the loop counter */ 00706 blockSize3--; 00707 } 00708 00709 /* set status as ARM_MATH_SUCCESS */ 00710 status = ARM_MATH_SUCCESS; 00711 } 00712 00713 /* Return to application */ 00714 return (status); 00715 00716 #else 00717 00718 /* Run the below code for Cortex-M0 */ 00719 00720 q15_t *pIn1 = pSrcA; /* inputA pointer */ 00721 q15_t *pIn2 = pSrcB; /* inputB pointer */ 00722 q63_t sum; /* Accumulator */ 00723 uint32_t i, j; /* loop counters */ 00724 arm_status status; /* status of Partial convolution */ 00725 00726 /* Check for range of output samples to be calculated */ 00727 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00728 { 00729 /* Set status as ARM_ARGUMENT_ERROR */ 00730 status = ARM_MATH_ARGUMENT_ERROR; 00731 } 00732 else 00733 { 00734 /* Loop to calculate convolution for output length number of values */ 00735 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00736 { 00737 /* Initialize sum with zero to carry on MAC operations */ 00738 sum = 0; 00739 00740 /* Loop to perform MAC operations according to convolution equation */ 00741 for (j = 0; j <= i; j++) 00742 { 00743 /* Check the array limitations */ 00744 if(((i - j) < srcBLen) && (j < srcALen)) 00745 { 00746 /* z[i] += x[i-j] * y[j] */ 00747 sum += ((q31_t) pIn1[j] * (pIn2[i - j])); 00748 } 00749 } 00750 00751 /* Store the output in the destination buffer */ 00752 pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u); 00753 } 00754 /* set status as ARM_SUCCESS as there are no argument errors */ 00755 status = ARM_MATH_SUCCESS; 00756 } 00757 return (status); 00758 00759 #endif /* #ifndef ARM_MATH_CM0 */ 00760 00761 } 00762