00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_fast_q15.c 00009 * 00010 * Description: Fast Q15 Partial convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated. 00028 * -------------------------------------------------------------------- */ 00029 00030 #include "arm_math.h" 00031 00056 arm_status arm_conv_partial_fast_q15( 00057 q15_t * pSrcA, 00058 uint32_t srcALen, 00059 q15_t * pSrcB, 00060 uint32_t srcBLen, 00061 q15_t * pDst, 00062 uint32_t firstIndex, 00063 uint32_t numPoints) 00064 { 00065 q15_t *pIn1; /* inputA pointer */ 00066 q15_t *pIn2; /* inputB pointer */ 00067 q15_t *pOut = pDst; /* output pointer */ 00068 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00069 q15_t *px; /* Intermediate inputA pointer */ 00070 q15_t *py; /* Intermediate inputB pointer */ 00071 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00072 q31_t x0, x1, x2, x3, c0; 00073 uint32_t j, k, count, check, blkCnt; 00074 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00075 arm_status status; /* status of Partial convolution */ 00076 q31_t *pb; /* 32 bit pointer for inputB buffer */ 00077 00078 /* Check for range of output samples to be calculated */ 00079 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00080 { 00081 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00082 status = ARM_MATH_ARGUMENT_ERROR; 00083 } 00084 else 00085 { 00086 00087 /* The algorithm implementation is based on the lengths of the inputs. */ 00088 /* srcB is always made to slide across srcA. */ 00089 /* So srcBLen is always considered as shorter or equal to srcALen */ 00090 if(srcALen >= srcBLen) 00091 { 00092 /* Initialization of inputA pointer */ 00093 pIn1 = pSrcA; 00094 00095 /* Initialization of inputB pointer */ 00096 pIn2 = pSrcB; 00097 } 00098 else 00099 { 00100 /* Initialization of inputA pointer */ 00101 pIn1 = pSrcB; 00102 00103 /* Initialization of inputB pointer */ 00104 pIn2 = pSrcA; 00105 00106 /* srcBLen is always considered as shorter or equal to srcALen */ 00107 j = srcBLen; 00108 srcBLen = srcALen; 00109 srcALen = j; 00110 } 00111 00112 /* Conditions to check which loopCounter holds 00113 * the first and last indices of the output samples to be calculated. */ 00114 check = firstIndex + numPoints; 00115 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00116 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00117 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00118 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00119 (int32_t) numPoints) : 0; 00120 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00121 (int32_t) firstIndex); 00122 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00123 00124 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00125 /* The function is internally 00126 * divided into three stages according to the number of multiplications that has to be 00127 * taken place between inputA samples and inputB samples. In the first stage of the 00128 * algorithm, the multiplications increase by one for every iteration. 00129 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00130 * In the third stage of the algorithm, the multiplications decrease by one 00131 * for every iteration. */ 00132 00133 /* Set the output pointer to point to the firstIndex 00134 * of the output sample to be calculated. */ 00135 pOut = pDst + firstIndex; 00136 00137 /* -------------------------- 00138 * Initializations of stage1 00139 * -------------------------*/ 00140 00141 /* sum = x[0] * y[0] 00142 * sum = x[0] * y[1] + x[1] * y[0] 00143 * .... 00144 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00145 */ 00146 00147 /* In this stage the MAC operations are increased by 1 for every iteration. 00148 The count variable holds the number of MAC operations performed. 00149 Since the partial convolution starts from firstIndex 00150 Number of Macs to be performed is firstIndex + 1 */ 00151 count = 1u + firstIndex; 00152 00153 /* Working pointer of inputA */ 00154 px = pIn1; 00155 00156 /* Working pointer of inputB */ 00157 pSrc2 = pIn2 + firstIndex; 00158 py = pSrc2; 00159 00160 /* ------------------------ 00161 * Stage1 process 00162 * ----------------------*/ 00163 00164 /* For loop unrolling by 4, this stage is divided into two. */ 00165 /* First part of this stage computes the MAC operations less than 4 */ 00166 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00167 00168 /* The first part of the stage starts here */ 00169 while((count < 4u) && (blockSize1 > 0)) 00170 { 00171 /* Accumulator is made zero for every iteration */ 00172 sum = 0; 00173 00174 /* Loop over number of MAC operations between 00175 * inputA samples and inputB samples */ 00176 k = count; 00177 00178 while(k > 0u) 00179 { 00180 /* Perform the multiply-accumulates */ 00181 sum = __SMLAD(*px++, *py--, sum); 00182 00183 /* Decrement the loop counter */ 00184 k--; 00185 } 00186 00187 /* Store the result in the accumulator in the destination buffer. */ 00188 *pOut++ = (q15_t) (sum >> 15); 00189 00190 /* Update the inputA and inputB pointers for next MAC calculation */ 00191 py = ++pSrc2; 00192 px = pIn1; 00193 00194 /* Increment the MAC count */ 00195 count++; 00196 00197 /* Decrement the loop counter */ 00198 blockSize1--; 00199 } 00200 00201 /* The second part of the stage starts here */ 00202 /* The internal loop, over count, is unrolled by 4 */ 00203 /* To, read the last two inputB samples using SIMD: 00204 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00205 py = py - 1; 00206 00207 while(blockSize1 > 0) 00208 { 00209 /* Accumulator is made zero for every iteration */ 00210 sum = 0; 00211 00212 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00213 k = count >> 2u; 00214 00215 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00216 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00217 while(k > 0u) 00218 { 00219 /* Perform the multiply-accumulates */ 00220 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00221 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00222 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00223 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00224 00225 /* Decrement the loop counter */ 00226 k--; 00227 } 00228 00229 /* For the next MAC operations, the pointer py is used without SIMD 00230 * So, py is incremented by 1 */ 00231 py = py + 1u; 00232 00233 /* If the count is not a multiple of 4, compute any remaining MACs here. 00234 ** No loop unrolling is used. */ 00235 k = count % 0x4u; 00236 00237 while(k > 0u) 00238 { 00239 /* Perform the multiply-accumulates */ 00240 sum = __SMLAD(*px++, *py--, sum); 00241 00242 /* Decrement the loop counter */ 00243 k--; 00244 } 00245 00246 /* Store the result in the accumulator in the destination buffer. */ 00247 *pOut++ = (q15_t) (sum >> 15); 00248 00249 /* Update the inputA and inputB pointers for next MAC calculation */ 00250 py = ++pSrc2 - 1u; 00251 px = pIn1; 00252 00253 /* Increment the MAC count */ 00254 count++; 00255 00256 /* Decrement the loop counter */ 00257 blockSize1--; 00258 } 00259 00260 /* -------------------------- 00261 * Initializations of stage2 00262 * ------------------------*/ 00263 00264 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00265 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00266 * .... 00267 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00268 */ 00269 00270 /* Working pointer of inputA */ 00271 px = pIn1; 00272 00273 /* Working pointer of inputB */ 00274 pSrc2 = pIn2 + (srcBLen - 1u); 00275 py = pSrc2; 00276 00277 /* Initialize inputB pointer of type q31 */ 00278 pb = (q31_t *) (py - 1u); 00279 00280 /* count is the index by which the pointer pIn1 to be incremented */ 00281 count = 1u; 00282 00283 00284 /* -------------------- 00285 * Stage2 process 00286 * -------------------*/ 00287 00288 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00289 * So, to loop unroll over blockSize2, 00290 * srcBLen should be greater than or equal to 4 */ 00291 if(srcBLen >= 4u) 00292 { 00293 /* Loop unroll over blockSize2, by 4 */ 00294 blkCnt = ((uint32_t) blockSize2 >> 2u); 00295 00296 while(blkCnt > 0u) 00297 { 00298 /* Set all accumulators to zero */ 00299 acc0 = 0; 00300 acc1 = 0; 00301 acc2 = 0; 00302 acc3 = 0; 00303 00304 00305 /* read x[0], x[1] samples */ 00306 x0 = *(q31_t *) (px++); 00307 /* read x[1], x[2] samples */ 00308 x1 = *(q31_t *) (px++); 00309 00310 00311 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00312 k = srcBLen >> 2u; 00313 00314 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00315 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00316 do 00317 { 00318 /* Read the last two inputB samples using SIMD: 00319 * y[srcBLen - 1] and y[srcBLen - 2] */ 00320 c0 = *(pb--); 00321 00322 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00323 acc0 = __SMLADX(x0, c0, acc0); 00324 00325 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00326 acc1 = __SMLADX(x1, c0, acc1); 00327 00328 /* Read x[2], x[3] */ 00329 x2 = *(q31_t *) (px++); 00330 00331 /* Read x[3], x[4] */ 00332 x3 = *(q31_t *) (px++); 00333 00334 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00335 acc2 = __SMLADX(x2, c0, acc2); 00336 00337 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00338 acc3 = __SMLADX(x3, c0, acc3); 00339 00340 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00341 c0 = *(pb--); 00342 00343 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00344 acc0 = __SMLADX(x2, c0, acc0); 00345 00346 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00347 acc1 = __SMLADX(x3, c0, acc1); 00348 00349 /* Read x[4], x[5] */ 00350 x0 = *(q31_t *) (px++); 00351 00352 /* Read x[5], x[6] */ 00353 x1 = *(q31_t *) (px++); 00354 00355 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00356 acc2 = __SMLADX(x0, c0, acc2); 00357 00358 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00359 acc3 = __SMLADX(x1, c0, acc3); 00360 00361 } while(--k); 00362 00363 /* For the next MAC operations, SIMD is not used 00364 * So, the 16 bit pointer if inputB, py is updated */ 00365 py = (q15_t *) pb; 00366 py = py + 1; 00367 00368 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00369 ** No loop unrolling is used. */ 00370 k = srcBLen % 0x4u; 00371 00372 if(k == 1u) 00373 { 00374 /* Read y[srcBLen - 5] */ 00375 c0 = *(py); 00376 #ifdef ARM_MATH_BIG_ENDIAN 00377 00378 c0 = c0 << 16; 00379 00380 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00381 00382 /* Read x[7] */ 00383 x3 = *(q31_t *) px++; 00384 00385 /* Perform the multiply-accumulates */ 00386 acc0 = __SMLAD(x0, c0, acc0); 00387 acc1 = __SMLAD(x1, c0, acc1); 00388 acc2 = __SMLADX(x1, c0, acc2); 00389 acc3 = __SMLADX(x3, c0, acc3); 00390 } 00391 00392 if(k == 2u) 00393 { 00394 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00395 c0 = *(pb); 00396 00397 /* Read x[7], x[8] */ 00398 x3 = *(q31_t *) px++; 00399 00400 /* Read x[9] */ 00401 x2 = *(q31_t *) px++; 00402 00403 /* Perform the multiply-accumulates */ 00404 acc0 = __SMLADX(x0, c0, acc0); 00405 acc1 = __SMLADX(x1, c0, acc1); 00406 acc2 = __SMLADX(x3, c0, acc2); 00407 acc3 = __SMLADX(x2, c0, acc3); 00408 } 00409 00410 if(k == 3u) 00411 { 00412 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00413 c0 = *pb--; 00414 00415 /* Read x[7], x[8] */ 00416 x3 = *(q31_t *) px++; 00417 00418 /* Read x[9] */ 00419 x2 = *(q31_t *) px++; 00420 00421 /* Perform the multiply-accumulates */ 00422 acc0 = __SMLADX(x0, c0, acc0); 00423 acc1 = __SMLADX(x1, c0, acc1); 00424 acc2 = __SMLADX(x3, c0, acc2); 00425 acc3 = __SMLADX(x2, c0, acc3); 00426 00427 /* Read y[srcBLen - 7] */ 00428 #ifdef ARM_MATH_BIG_ENDIAN 00429 00430 c0 = (*pb); 00431 c0 = (c0) << 16; 00432 00433 #else 00434 00435 c0 = (q15_t) (*pb >> 16); 00436 00437 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00438 00439 /* Read x[10] */ 00440 x3 = *(q31_t *) px++; 00441 00442 /* Perform the multiply-accumulates */ 00443 acc0 = __SMLADX(x1, c0, acc0); 00444 acc1 = __SMLAD(x2, c0, acc1); 00445 acc2 = __SMLADX(x2, c0, acc2); 00446 acc3 = __SMLADX(x3, c0, acc3); 00447 } 00448 00449 /* Store the results in the accumulators in the destination buffer. */ 00450 #ifndef ARM_MATH_BIG_ENDIAN 00451 00452 *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16); 00453 *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16); 00454 00455 #else 00456 00457 *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16); 00458 *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16); 00459 00460 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00461 00462 /* Update the inputA and inputB pointers for next MAC calculation */ 00463 px = pIn1 + (count * 4u); 00464 py = pSrc2; 00465 pb = (q31_t *) (py - 1); 00466 00467 /* Increment the pointer pIn1 index, count by 1 */ 00468 count++; 00469 00470 /* Decrement the loop counter */ 00471 blkCnt--; 00472 } 00473 00474 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00475 ** No loop unrolling is used. */ 00476 blkCnt = (uint32_t) blockSize2 % 0x4u; 00477 00478 while(blkCnt > 0u) 00479 { 00480 /* Accumulator is made zero for every iteration */ 00481 sum = 0; 00482 00483 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00484 k = srcBLen >> 2u; 00485 00486 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00487 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00488 while(k > 0u) 00489 { 00490 /* Perform the multiply-accumulates */ 00491 sum += ((q31_t) * px++ * *py--); 00492 sum += ((q31_t) * px++ * *py--); 00493 sum += ((q31_t) * px++ * *py--); 00494 sum += ((q31_t) * px++ * *py--); 00495 00496 /* Decrement the loop counter */ 00497 k--; 00498 } 00499 00500 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00501 ** No loop unrolling is used. */ 00502 k = srcBLen % 0x4u; 00503 00504 while(k > 0u) 00505 { 00506 /* Perform the multiply-accumulates */ 00507 sum += ((q31_t) * px++ * *py--); 00508 00509 /* Decrement the loop counter */ 00510 k--; 00511 } 00512 00513 /* Store the result in the accumulator in the destination buffer. */ 00514 *pOut++ = (q15_t) (sum >> 15); 00515 00516 /* Update the inputA and inputB pointers for next MAC calculation */ 00517 px = pIn1 + count; 00518 py = pSrc2; 00519 00520 /* Increment the pointer pIn1 index, count by 1 */ 00521 count++; 00522 00523 /* Decrement the loop counter */ 00524 blkCnt--; 00525 } 00526 } 00527 else 00528 { 00529 /* If the srcBLen is not a multiple of 4, 00530 * the blockSize2 loop cannot be unrolled by 4 */ 00531 blkCnt = (uint32_t) blockSize2; 00532 00533 while(blkCnt > 0u) 00534 { 00535 /* Accumulator is made zero for every iteration */ 00536 sum = 0; 00537 00538 /* srcBLen number of MACS should be performed */ 00539 k = srcBLen; 00540 00541 while(k > 0u) 00542 { 00543 /* Perform the multiply-accumulate */ 00544 sum += ((q31_t) * px++ * *py--); 00545 00546 /* Decrement the loop counter */ 00547 k--; 00548 } 00549 00550 /* Store the result in the accumulator in the destination buffer. */ 00551 *pOut++ = (q15_t) (sum >> 15); 00552 00553 /* Update the inputA and inputB pointers for next MAC calculation */ 00554 px = pIn1 + count; 00555 py = pSrc2; 00556 00557 /* Increment the MAC count */ 00558 count++; 00559 00560 /* Decrement the loop counter */ 00561 blkCnt--; 00562 } 00563 } 00564 00565 00566 /* -------------------------- 00567 * Initializations of stage3 00568 * -------------------------*/ 00569 00570 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00571 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00572 * .... 00573 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00574 * sum += x[srcALen-1] * y[srcBLen-1] 00575 */ 00576 00577 /* In this stage the MAC operations are decreased by 1 for every iteration. 00578 The count variable holds the number of MAC operations performed */ 00579 count = srcBLen - 1u; 00580 00581 /* Working pointer of inputA */ 00582 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00583 px = pSrc1; 00584 00585 /* Working pointer of inputB */ 00586 pSrc2 = pIn2 + (srcBLen - 1u); 00587 pIn2 = pSrc2 - 1u; 00588 py = pIn2; 00589 00590 /* ------------------- 00591 * Stage3 process 00592 * ------------------*/ 00593 00594 /* For loop unrolling by 4, this stage is divided into two. */ 00595 /* First part of this stage computes the MAC operations greater than 4 */ 00596 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00597 00598 /* The first part of the stage starts here */ 00599 j = count >> 2u; 00600 00601 while((j > 0u) && (blockSize3 > 0)) 00602 { 00603 /* Accumulator is made zero for every iteration */ 00604 sum = 0; 00605 00606 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00607 k = count >> 2u; 00608 00609 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00610 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00611 while(k > 0u) 00612 { 00613 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00614 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00615 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00616 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00617 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00618 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00619 00620 /* Decrement the loop counter */ 00621 k--; 00622 } 00623 00624 /* For the next MAC operations, the pointer py is used without SIMD 00625 * So, py is incremented by 1 */ 00626 py = py + 1u; 00627 00628 /* If the count is not a multiple of 4, compute any remaining MACs here. 00629 ** No loop unrolling is used. */ 00630 k = count % 0x4u; 00631 00632 while(k > 0u) 00633 { 00634 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00635 sum = __SMLAD(*px++, *py--, sum); 00636 00637 /* Decrement the loop counter */ 00638 k--; 00639 } 00640 00641 /* Store the result in the accumulator in the destination buffer. */ 00642 *pOut++ = (q15_t) (sum >> 15); 00643 00644 /* Update the inputA and inputB pointers for next MAC calculation */ 00645 px = ++pSrc1; 00646 py = pIn2; 00647 00648 /* Decrement the MAC count */ 00649 count--; 00650 00651 /* Decrement the loop counter */ 00652 blockSize3--; 00653 00654 j--; 00655 } 00656 00657 /* The second part of the stage starts here */ 00658 /* SIMD is not used for the next MAC operations, 00659 * so pointer py is updated to read only one sample at a time */ 00660 py = py + 1u; 00661 00662 while(blockSize3 > 0) 00663 { 00664 /* Accumulator is made zero for every iteration */ 00665 sum = 0; 00666 00667 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00668 k = count; 00669 00670 while(k > 0u) 00671 { 00672 /* Perform the multiply-accumulates */ 00673 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00674 sum = __SMLAD(*px++, *py--, sum); 00675 00676 /* Decrement the loop counter */ 00677 k--; 00678 } 00679 00680 /* Store the result in the accumulator in the destination buffer. */ 00681 *pOut++ = (q15_t) (sum >> 15); 00682 00683 /* Update the inputA and inputB pointers for next MAC calculation */ 00684 px = ++pSrc1; 00685 py = pSrc2; 00686 00687 /* Decrement the MAC count */ 00688 count--; 00689 00690 /* Decrement the loop counter */ 00691 blockSize3--; 00692 } 00693 00694 /* set status as ARM_MATH_SUCCESS */ 00695 status = ARM_MATH_SUCCESS; 00696 } 00697 00698 /* Return to application */ 00699 return (status); 00700 00701 } 00702