1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010 ARM Limited. All rights reserved.
7 * Project: CMSIS DSP Library
8 * Title: arm_mat_mult_q31.c
10 * Description: Q31 matrix multiplication.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Version 1.0.10 2011/7/15
15 * Big Endian support added and Merged M0 and M3/M4 Source code.
17 * Version 1.0.3 2010/11/29
18 * Re-organized the CMSIS folders and updated documentation.
20 * Version 1.0.2 2010/11/11
21 * Documentation updated.
23 * Version 1.0.1 2010/10/05
24 * Production release and review comments incorporated.
26 * Version 1.0.0 2010/09/20
27 * Production release and review comments incorporated.
29 * Version 0.0.5 2010/04/26
30 * incorporated review comments and updated with latest CMSIS layer
32 * Version 0.0.3 2010/03/10
34 * -------------------------------------------------------------------- */
39 * @ingroup groupMatrix
43 * @addtogroup MatrixMult
48 * @brief Q31 matrix multiplication
49 * @param[in] *pSrcA points to the first input matrix structure
50 * @param[in] *pSrcB points to the second input matrix structure
51 * @param[out] *pDst points to output matrix structure
52 * @return The function returns either
53 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
56 * <b>Scaling and Overflow Behavior:</b>
59 * The function is implemented using an internal 64-bit accumulator.
60 * The accumulator has a 2.62 format and maintains full precision of the intermediate
61 * multiplication results but provides only a single guard bit. There is no saturation
62 * on intermediate additions. Thus, if the accumulator overflows it wraps around and
63 * distorts the result. The input signals should be scaled down to avoid intermediate
64 * overflows. The function does not scale its inputs itself: the caller should scale them down
65 * by log2(numColsA) bits, because a total of numColsA additions are performed internally.
66 * The 2.62 accumulator is right shifted by 31 bits and narrowed to 1.31 format with a plain cast (the code shown applies no saturation) to yield the final result.
69 * See <code>arm_mat_mult_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
/* Multiplies the Q31 matrices *pSrcA and *pSrcB into *pDst. Each output element is a dot
 * product accumulated in the q63_t variable `sum` and stored as (q31_t)(sum >> 31).
 * When ARM_MATH_MATRIX_CHECK is defined, the operand dimensions are compared first and
 * status is set to ARM_MATH_SIZE_MISMATCH if they do not agree.
 *
 * NOTE(review): the line numbers embedded in this listing skip values, so statements
 * (braces, loop keywords, some assignments, the return) appear to be missing from this
 * copy. Confirm against the upstream CMSIS source before compiling. */
73 arm_status arm_mat_mult_q31(
74 const arm_matrix_instance_q31 * pSrcA,
75 const arm_matrix_instance_q31 * pSrcB,
76 arm_matrix_instance_q31 * pDst)
78 q31_t *pIn1 = pSrcA->pData; /* running read pointer into matrix A data (post-incremented by each MAC) */
79 q31_t *pIn2 = pSrcB->pData; /* running read pointer into matrix B data */
80 q31_t *pInA = pSrcA->pData; /* start of the matrix A row being processed (advanced by numColsA per row) */
81 q31_t *pOut = pDst->pData; /* start of the output matrix data */
82 q31_t *px; /* write pointer into the output matrix */
83 q63_t sum; /* accumulator for one dot product */
84 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
85 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
86 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
90 /* The code below is the Cortex-M4 / Cortex-M3 variant (dot product unrolled by 4) */
92 uint16_t col, i = 0u, j, row = numRowsA, colCnt; /* loop counters */
93 arm_status status; /* status of matrix multiplication */
96 #ifdef ARM_MATH_MATRIX_CHECK
99 /* Dimension check: columns of A must equal rows of B, and pDst must be numRows(A) x numCols(B) */
100 if((pSrcA->numCols != pSrcB->numRows) ||
101 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
103 /* Set status as ARM_MATH_SIZE_MISMATCH */
104 status = ARM_MATH_SIZE_MISMATCH;
107 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
110 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
114 /* Output pointer is set to starting address of the row being processed */
117 /* For each row processed, the column loop counter is initialized */
120 /* For each row processed, the pIn2 pointer is set
121 ** to the starting address of the pSrcB data */
129 /* Set the variable sum, which acts as the accumulator, to zero */
132 /* Initialize the pointer pIn1 to point to the starting address of pInA */
135 /* Apply loop unrolling: colCnt is the number of groups of 4 MACs (numColsA / 4). */
136 colCnt = numColsA >> 2;
139 /* matrix multiplication */
142 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
143 /* Perform the multiply-accumulates; the first operand is widened to q63_t so the product is formed in 64 bits */
144 sum += (q63_t) * pIn1++ * *pIn2;
147 sum += (q63_t) * pIn1++ * *pIn2;
150 sum += (q63_t) * pIn1++ * *pIn2;
153 sum += (q63_t) * pIn1++ * *pIn2;
156 /* Decrement the loop counter */
160 /* If the number of columns of pSrcA is not a multiple of 4, process the remaining terms here.
161 ** No loop unrolling is used. */
162 colCnt = numColsA % 0x4u;
166 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
167 /* Perform the multiply-accumulates */
168 sum += (q63_t) * pIn1++ * *pIn2;
171 /* Decrement the loop counter */
175 /* Convert the result from 2.62 to 1.31 format and store in destination buffer (plain narrowing cast; no saturation is applied) */
176 *px++ = (q31_t) (sum >> 31);
178 /* Update the pointer pIn2 to point to the starting address of the next column (offset j from the start of the pSrcB data) */
180 pIn2 = (pSrcB->pData) + j;
182 /* Decrement the column loop counter */
189 /* The code below is the Cortex-M0 variant (no loop unrolling) */
191 q31_t *pInB = pSrcB->pData; /* start of the matrix B data */
192 uint16_t col, i = 0u, row = numRowsA, colCnt; /* loop counters */
193 arm_status status; /* status of matrix multiplication */
196 #ifdef ARM_MATH_MATRIX_CHECK
198 /* Dimension check: columns of A must equal rows of B, and pDst must be numRows(A) x numCols(B) */
199 if((pSrcA->numCols != pSrcB->numRows) ||
200 (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
202 /* Set status as ARM_MATH_SIZE_MISMATCH */
203 status = ARM_MATH_SIZE_MISMATCH;
206 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
209 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
213 /* Output pointer is set to starting address of the row being processed */
216 /* For each row processed, the column loop counter is initialized */
219 /* For each row processed, the pIn2 pointer is set
220 ** to the starting address of the pSrcB data */
226 /* Set the variable sum, which acts as the accumulator, to zero */
229 /* Initialize the pointer pIn1 to point to the starting address of pInA */
232 /* One MAC operation is performed per column of matrix A */
235 /* matrix multiplication */
238 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
239 /* Perform the multiply-accumulates; the first operand is widened to q63_t so the product is formed in 64 bits */
240 sum += (q63_t) * pIn1++ * *pIn2;
243 /* Decrement the loop counter */
247 /* Convert the result from 2.62 to 1.31 format and store in destination buffer (plain narrowing cast; no saturation is applied) */
248 *px++ = (q31_t) (sum >> 31);
250 /* Decrement the column loop counter */
253 /* Update the pointer pIn2 to point to the starting address of the next column, at offset (numColsB - col) from the start of the pSrcB data */
254 pIn2 = pInB + (numColsB - col);
260 /* Update the pointer pInA to point to the starting address of the next row */
262 pInA = pInA + numColsA;
264 /* Decrement the row loop counter */
269 /* set status as ARM_MATH_SUCCESS */
270 status = ARM_MATH_SUCCESS;
272 /* Return to application */
277 * @} end of MatrixMult group