00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_fast_q15.c 00009 * 00010 * Description: Fast Q15 Convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated. 00028 * -------------------------------------------------------------------- */ 00029 00030 #include "arm_math.h" 00031 00066 void arm_conv_fast_q15( 00067 q15_t * pSrcA, 00068 uint32_t srcALen, 00069 q15_t * pSrcB, 00070 uint32_t srcBLen, 00071 q15_t * pDst) 00072 { 00073 q15_t *pIn1; /* inputA pointer */ 00074 q15_t *pIn2; /* inputB pointer */ 00075 q15_t *pOut = pDst; /* output pointer */ 00076 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00077 q15_t *px; /* Intermediate inputA pointer */ 00078 q15_t *py; /* Intermediate inputB pointer */ 00079 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00080 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00081 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ 00082 q31_t *pb; /* 32 bit pointer for inputB buffer */ 00083 00084 00085 /* The algorithm implementation is based on the lengths of the inputs. */ 00086 /* srcB is always made to slide across srcA. */ 00087 /* So srcBLen is always considered as shorter or equal to srcALen */ 00088 if(srcALen >= srcBLen) 00089 { 00090 /* Initialization of inputA pointer */ 00091 pIn1 = pSrcA; 00092 00093 /* Initialization of inputB pointer */ 00094 pIn2 = pSrcB; 00095 } 00096 else 00097 { 00098 /* Initialization of inputA pointer */ 00099 pIn1 = pSrcB; 00100 00101 /* Initialization of inputB pointer */ 00102 pIn2 = pSrcA; 00103 00104 /* srcBLen is always considered as shorter or equal to srcALen */ 00105 j = srcBLen; 00106 srcBLen = srcALen; 00107 srcALen = j; 00108 } 00109 00110 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00111 /* The function is internally 00112 * divided into three stages according to the number of multiplications that has to be 00113 * taken place between inputA samples and inputB samples. In the first stage of the 00114 * algorithm, the multiplications increase by one for every iteration. 00115 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00116 * In the third stage of the algorithm, the multiplications decrease by one 00117 * for every iteration. */ 00118 00119 /* The algorithm is implemented in three stages. 00120 The loop counters of each stage is initiated here. */ 00121 blockSize1 = srcBLen - 1u; 00122 blockSize2 = srcALen - (srcBLen - 1u); 00123 blockSize3 = blockSize1; 00124 00125 /* -------------------------- 00126 * Initializations of stage1 00127 * -------------------------*/ 00128 00129 /* sum = x[0] * y[0] 00130 * sum = x[0] * y[1] + x[1] * y[0] 00131 * .... 00132 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00133 */ 00134 00135 /* In this stage the MAC operations are increased by 1 for every iteration. 00136 The count variable holds the number of MAC operations performed */ 00137 count = 1u; 00138 00139 /* Working pointer of inputA */ 00140 px = pIn1; 00141 00142 /* Working pointer of inputB */ 00143 py = pIn2; 00144 00145 00146 /* ------------------------ 00147 * Stage1 process 00148 * ----------------------*/ 00149 00150 /* For loop unrolling by 4, this stage is divided into two. */ 00151 /* First part of this stage computes the MAC operations less than 4 */ 00152 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00153 00154 /* The first part of the stage starts here */ 00155 while((count < 4u) && (blockSize1 > 0u)) 00156 { 00157 /* Accumulator is made zero for every iteration */ 00158 sum = 0; 00159 00160 /* Loop over number of MAC operations between 00161 * inputA samples and inputB samples */ 00162 k = count; 00163 00164 while(k > 0u) 00165 { 00166 /* Perform the multiply-accumulates */ 00167 sum = __SMLAD(*px++, *py--, sum); 00168 00169 /* Decrement the loop counter */ 00170 k--; 00171 } 00172 00173 /* Store the result in the accumulator in the destination buffer. */ 00174 *pOut++ = (q15_t) (sum >> 15); 00175 00176 /* Update the inputA and inputB pointers for next MAC calculation */ 00177 py = pIn2 + count; 00178 px = pIn1; 00179 00180 /* Increment the MAC count */ 00181 count++; 00182 00183 /* Decrement the loop counter */ 00184 blockSize1--; 00185 } 00186 00187 /* The second part of the stage starts here */ 00188 /* The internal loop, over count, is unrolled by 4 */ 00189 /* To, read the last two inputB samples using SIMD: 00190 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00191 py = py - 1; 00192 00193 while(blockSize1 > 0u) 00194 { 00195 /* Accumulator is made zero for every iteration */ 00196 sum = 0; 00197 00198 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00199 k = count >> 2u; 00200 00201 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00202 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00203 while(k > 0u) 00204 { 00205 /* Perform the multiply-accumulates */ 00206 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00207 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00208 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00209 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00210 00211 /* Decrement the loop counter */ 00212 k--; 00213 } 00214 00215 /* For the next MAC operations, the pointer py is used without SIMD 00216 * So, py is incremented by 1 */ 00217 py = py + 1u; 00218 00219 /* If the count is not a multiple of 4, compute any remaining MACs here. 00220 ** No loop unrolling is used. */ 00221 k = count % 0x4u; 00222 00223 while(k > 0u) 00224 { 00225 /* Perform the multiply-accumulates */ 00226 sum = __SMLAD(*px++, *py--, sum); 00227 00228 /* Decrement the loop counter */ 00229 k--; 00230 } 00231 00232 /* Store the result in the accumulator in the destination buffer. */ 00233 *pOut++ = (q15_t) (sum >> 15); 00234 00235 /* Update the inputA and inputB pointers for next MAC calculation */ 00236 py = pIn2 + (count - 1u); 00237 px = pIn1; 00238 00239 /* Increment the MAC count */ 00240 count++; 00241 00242 /* Decrement the loop counter */ 00243 blockSize1--; 00244 } 00245 00246 /* -------------------------- 00247 * Initializations of stage2 00248 * ------------------------*/ 00249 00250 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00251 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00252 * .... 00253 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00254 */ 00255 00256 /* Working pointer of inputA */ 00257 px = pIn1; 00258 00259 /* Working pointer of inputB */ 00260 pSrc2 = pIn2 + (srcBLen - 1u); 00261 py = pSrc2; 00262 00263 /* Initialize inputB pointer of type q31 */ 00264 pb = (q31_t *) (py - 1u); 00265 00266 /* count is the index by which the pointer pIn1 to be incremented */ 00267 count = 1u; 00268 00269 00270 /* -------------------- 00271 * Stage2 process 00272 * -------------------*/ 00273 00274 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00275 * So, to loop unroll over blockSize2, 00276 * srcBLen should be greater than or equal to 4 */ 00277 if(srcBLen >= 4u) 00278 { 00279 /* Loop unroll over blockSize2, by 4 */ 00280 blkCnt = blockSize2 >> 2u; 00281 00282 while(blkCnt > 0u) 00283 { 00284 /* Set all accumulators to zero */ 00285 acc0 = 0; 00286 acc1 = 0; 00287 acc2 = 0; 00288 acc3 = 0; 00289 00290 00291 /* read x[0], x[1] samples */ 00292 x0 = *(q31_t *) (px++); 00293 /* read x[1], x[2] samples */ 00294 x1 = *(q31_t *) (px++); 00295 00296 00297 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00298 k = srcBLen >> 2u; 00299 00300 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00301 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00302 do 00303 { 00304 /* Read the last two inputB samples using SIMD: 00305 * y[srcBLen - 1] and y[srcBLen - 2] */ 00306 c0 = *(pb--); 00307 00308 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00309 acc0 = __SMLADX(x0, c0, acc0); 00310 00311 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00312 acc1 = __SMLADX(x1, c0, acc1); 00313 00314 /* Read x[2], x[3] */ 00315 x2 = *(q31_t *) (px++); 00316 00317 /* Read x[3], x[4] */ 00318 x3 = *(q31_t *) (px++); 00319 00320 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00321 acc2 = __SMLADX(x2, c0, acc2); 00322 00323 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00324 acc3 = __SMLADX(x3, c0, acc3); 00325 00326 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00327 c0 = *(pb--); 00328 00329 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00330 acc0 = __SMLADX(x2, c0, acc0); 00331 00332 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00333 acc1 = __SMLADX(x3, c0, acc1); 00334 00335 /* Read x[4], x[5] */ 00336 x0 = *(q31_t *) (px++); 00337 00338 /* Read x[5], x[6] */ 00339 x1 = *(q31_t *) (px++); 00340 00341 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00342 acc2 = __SMLADX(x0, c0, acc2); 00343 00344 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00345 acc3 = __SMLADX(x1, c0, acc3); 00346 00347 } while(--k); 00348 00349 /* For the next MAC operations, SIMD is not used 00350 * So, the 16 bit pointer if inputB, py is updated */ 00351 py = (q15_t *) pb; 00352 py = py + 1; 00353 00354 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00355 ** No loop unrolling is used. */ 00356 k = srcBLen % 0x4u; 00357 00358 if(k == 1u) 00359 { 00360 /* Read y[srcBLen - 5] */ 00361 c0 = *(py); 00362 #ifdef ARM_MATH_BIG_ENDIAN 00363 00364 // c0 = unallign_rev(p, c0); 00365 c0 = c0 << 16; 00366 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00367 00368 /* Read x[7] */ 00369 x3 = *(q31_t *) px++; 00370 00371 /* Perform the multiply-accumulates */ 00372 acc0 = __SMLAD(x0, c0, acc0); 00373 acc1 = __SMLAD(x1, c0, acc1); 00374 acc2 = __SMLADX(x1, c0, acc2); 00375 acc3 = __SMLADX(x3, c0, acc3); 00376 } 00377 00378 if(k == 2u) 00379 { 00380 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00381 c0 = *(pb); 00382 00383 /* Read x[7], x[8] */ 00384 x3 = *(q31_t *) px++; 00385 00386 /* Read x[9] */ 00387 x2 = *(q31_t *) px++; 00388 00389 /* Perform the multiply-accumulates */ 00390 acc0 = __SMLADX(x0, c0, acc0); 00391 acc1 = __SMLADX(x1, c0, acc1); 00392 acc2 = __SMLADX(x3, c0, acc2); 00393 acc3 = __SMLADX(x2, c0, acc3); 00394 } 00395 00396 if(k == 3u) 00397 { 00398 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00399 c0 = *pb--; 00400 00401 /* Read x[7], x[8] */ 00402 x3 = *(q31_t *) px++; 00403 00404 /* Read x[9] */ 00405 x2 = *(q31_t *) px++; 00406 00407 /* Perform the multiply-accumulates */ 00408 acc0 = __SMLADX(x0, c0, acc0); 00409 acc1 = __SMLADX(x1, c0, acc1); 00410 acc2 = __SMLADX(x3, c0, acc2); 00411 acc3 = __SMLADX(x2, c0, acc3); 00412 00413 /* Read y[srcBLen - 7] */ 00414 #ifdef ARM_MATH_BIG_ENDIAN 00415 00416 c0 = (*pb); 00417 // c0 = (c0 & 0x0000FFFF)<<16; 00418 c0 = (c0) << 16; 00419 00420 #else 00421 00422 c0 = (q15_t) (*pb >> 16); 00423 00424 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00425 00426 /* Read x[10] */ 00427 x3 = *(q31_t *) px++; 00428 00429 /* Perform the multiply-accumulates */ 00430 acc0 = __SMLADX(x1, c0, acc0); 00431 acc1 = __SMLAD(x2, c0, acc1); 00432 acc2 = __SMLADX(x2, c0, acc2); 00433 acc3 = __SMLADX(x3, c0, acc3); 00434 } 00435 00436 /* Store the results in the accumulators in the destination buffer. */ 00437 #ifndef ARM_MATH_BIG_ENDIAN 00438 00439 *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16); 00440 *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16); 00441 00442 #else 00443 00444 *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16); 00445 *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16); 00446 00447 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00448 /* Update the inputA and inputB pointers for next MAC calculation */ 00449 px = pIn1 + (count * 4u); 00450 py = pSrc2; 00451 pb = (q31_t *) (py - 1); 00452 00453 /* Increment the pointer pIn1 index, count by 1 */ 00454 count++; 00455 00456 /* Decrement the loop counter */ 00457 blkCnt--; 00458 } 00459 00460 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00461 ** No loop unrolling is used. */ 00462 blkCnt = blockSize2 % 0x4u; 00463 00464 while(blkCnt > 0u) 00465 { 00466 /* Accumulator is made zero for every iteration */ 00467 sum = 0; 00468 00469 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00470 k = srcBLen >> 2u; 00471 00472 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00473 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00474 while(k > 0u) 00475 { 00476 /* Perform the multiply-accumulates */ 00477 sum += ((q31_t) * px++ * *py--); 00478 sum += ((q31_t) * px++ * *py--); 00479 sum += ((q31_t) * px++ * *py--); 00480 sum += ((q31_t) * px++ * *py--); 00481 00482 /* Decrement the loop counter */ 00483 k--; 00484 } 00485 00486 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00487 ** No loop unrolling is used. */ 00488 k = srcBLen % 0x4u; 00489 00490 while(k > 0u) 00491 { 00492 /* Perform the multiply-accumulates */ 00493 sum += ((q31_t) * px++ * *py--); 00494 00495 /* Decrement the loop counter */ 00496 k--; 00497 } 00498 00499 /* Store the result in the accumulator in the destination buffer. */ 00500 *pOut++ = (q15_t) (sum >> 15); 00501 00502 /* Update the inputA and inputB pointers for next MAC calculation */ 00503 px = pIn1 + count; 00504 py = pSrc2; 00505 00506 /* Increment the pointer pIn1 index, count by 1 */ 00507 count++; 00508 00509 /* Decrement the loop counter */ 00510 blkCnt--; 00511 } 00512 } 00513 else 00514 { 00515 /* If the srcBLen is not a multiple of 4, 00516 * the blockSize2 loop cannot be unrolled by 4 */ 00517 blkCnt = blockSize2; 00518 00519 while(blkCnt > 0u) 00520 { 00521 /* Accumulator is made zero for every iteration */ 00522 sum = 0; 00523 00524 /* srcBLen number of MACS should be performed */ 00525 k = srcBLen; 00526 00527 while(k > 0u) 00528 { 00529 /* Perform the multiply-accumulate */ 00530 sum += ((q31_t) * px++ * *py--); 00531 00532 /* Decrement the loop counter */ 00533 k--; 00534 } 00535 00536 /* Store the result in the accumulator in the destination buffer. */ 00537 *pOut++ = (q15_t) (sum >> 15); 00538 00539 /* Update the inputA and inputB pointers for next MAC calculation */ 00540 px = pIn1 + count; 00541 py = pSrc2; 00542 00543 /* Increment the MAC count */ 00544 count++; 00545 00546 /* Decrement the loop counter */ 00547 blkCnt--; 00548 } 00549 } 00550 00551 00552 /* -------------------------- 00553 * Initializations of stage3 00554 * -------------------------*/ 00555 00556 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00557 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00558 * .... 00559 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00560 * sum += x[srcALen-1] * y[srcBLen-1] 00561 */ 00562 00563 /* In this stage the MAC operations are decreased by 1 for every iteration. 00564 The blockSize3 variable holds the number of MAC operations performed */ 00565 00566 /* Working pointer of inputA */ 00567 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00568 px = pSrc1; 00569 00570 /* Working pointer of inputB */ 00571 pSrc2 = pIn2 + (srcBLen - 1u); 00572 pIn2 = pSrc2 - 1u; 00573 py = pIn2; 00574 00575 /* ------------------- 00576 * Stage3 process 00577 * ------------------*/ 00578 00579 /* For loop unrolling by 4, this stage is divided into two. */ 00580 /* First part of this stage computes the MAC operations greater than 4 */ 00581 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00582 00583 /* The first part of the stage starts here */ 00584 j = blockSize3 >> 2u; 00585 00586 while((j > 0u) && (blockSize3 > 0u)) 00587 { 00588 /* Accumulator is made zero for every iteration */ 00589 sum = 0; 00590 00591 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00592 k = blockSize3 >> 2u; 00593 00594 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00595 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00596 while(k > 0u) 00597 { 00598 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00599 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00600 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00601 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00602 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00603 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00604 00605 /* Decrement the loop counter */ 00606 k--; 00607 } 00608 00609 /* For the next MAC operations, the pointer py is used without SIMD 00610 * So, py is incremented by 1 */ 00611 py = py + 1u; 00612 00613 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00614 ** No loop unrolling is used. */ 00615 k = blockSize3 % 0x4u; 00616 00617 while(k > 0u) 00618 { 00619 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00620 sum = __SMLAD(*px++, *py--, sum); 00621 00622 /* Decrement the loop counter */ 00623 k--; 00624 } 00625 00626 /* Store the result in the accumulator in the destination buffer. */ 00627 *pOut++ = (q15_t) (sum >> 15); 00628 00629 /* Update the inputA and inputB pointers for next MAC calculation */ 00630 px = ++pSrc1; 00631 py = pIn2; 00632 00633 /* Decrement the loop counter */ 00634 blockSize3--; 00635 00636 j--; 00637 } 00638 00639 /* The second part of the stage starts here */ 00640 /* SIMD is not used for the next MAC operations, 00641 * so pointer py is updated to read only one sample at a time */ 00642 py = py + 1u; 00643 00644 while(blockSize3 > 0u) 00645 { 00646 /* Accumulator is made zero for every iteration */ 00647 sum = 0; 00648 00649 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00650 k = blockSize3; 00651 00652 while(k > 0u) 00653 { 00654 /* Perform the multiply-accumulates */ 00655 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00656 sum = __SMLAD(*px++, *py--, sum); 00657 00658 /* Decrement the loop counter */ 00659 k--; 00660 } 00661 00662 /* Store the result in the accumulator in the destination buffer. */ 00663 *pOut++ = (q15_t) (sum >> 15); 00664 00665 /* Update the inputA and inputB pointers for next MAC calculation */ 00666 px = ++pSrc1; 00667 py = pSrc2; 00668 00669 /* Decrement the loop counter */ 00670 blockSize3--; 00671 } 00672 00673 } 00674