git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_mat_mult_q31.c
   9 *
  10 * Description:   Q31 matrix multiplication.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated.
  28 *
  29 * Version 0.0.5  2010/04/26
  30 *    incorporated review comments and updated with latest CMSIS layer
  31 *
  32 * Version 0.0.3  2010/03/10
  33 *    Initial version
  34 * -------------------------------------------------------------------- */
  35
  36 #include "arm_math.h"
  37
  38 /**
  39  * @ingroup groupMatrix
  40  */
  41
  42 /**
  43  * @addtogroup MatrixMult
  44  * @{
  45  */
  46
  47 /**
  48  * @brief Q31 matrix multiplication
  49  * @param[in]       *pSrcA points to the first input matrix structure
  50  * @param[in]       *pSrcB points to the second input matrix structure
  51  * @param[out]      *pDst points to output matrix structure
  52  * @return              The function returns either
  53  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  54  *
  55  * @details
  56  * <b>Scaling and Overflow Behavior:</b>
  57  *
  58  * \par
  59  * The function is implemented using an internal 64-bit accumulator.
  60  * The accumulator has a 2.62 format and maintains full precision of the intermediate
  61  * multiplication results but provides only a single guard bit. There is no saturation
  62  * on intermediate additions. Thus, if the accumulator overflows it wraps around and
  63  * distorts the result. The input signals should be scaled down to avoid intermediate
  64  * overflows. The input is thus scaled down by log2(numColsA) bits
  65  * to avoid overflows, as a total of numColsA additions are performed internally.
  66  * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
  67  *
  68  * \par
  69  * See <code>arm_mat_mult_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
  70  *
  71  */
  72
  73 arm_status arm_mat_mult_q31(
  74   const arm_matrix_instance_q31 * pSrcA,
  75   const arm_matrix_instance_q31 * pSrcB,
  76   arm_matrix_instance_q31 * pDst)
  77 {
  78   q31_t *pIn1 = pSrcA->pData;                    /* input data matrix pointer A */
  79   q31_t *pIn2 = pSrcB->pData;                    /* input data matrix pointer B */
  80   q31_t *pInA = pSrcA->pData;                    /* input data matrix pointer A */
  81   q31_t *pOut = pDst->pData;                     /* output data matrix pointer */
  82   q31_t *px;                                     /* Temporary output data matrix pointer */
  83   q63_t sum;                                     /* Accumulator */
  84   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  85   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  86   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  87
  88 #ifndef ARM_MATH_CM0
  89
  90   /* Run the below code for Cortex-M4 and Cortex-M3 */
  91
  92   uint16_t col, i = 0u, j, row = numRowsA, colCnt;      /* loop counters */
  93   arm_status status;                             /* status of matrix multiplication */
  94
  95
  96 #ifdef ARM_MATH_MATRIX_CHECK
  97
  98
  99   /* Check for matrix mismatch condition */
 100   if((pSrcA->numCols != pSrcB->numRows) ||
 101      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
 102   {
 103     /* Set status as ARM_MATH_SIZE_MISMATCH */
 104     status = ARM_MATH_SIZE_MISMATCH;
 105   }
 106   else
 107 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
 108
 109   {
 110     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
 111     /* row loop */
 112     do
 113     {
 114       /* Output pointer is set to starting address of the row being processed */
 115       px = pOut + i;
 116
 117       /* For every row wise process, the column loop counter is to be initiated */
 118       col = numColsB;
 119
 120       /* For every row wise process, the pIn2 pointer is set
 121        ** to the starting address of the pSrcB data */
 122       pIn2 = pSrcB->pData;
 123
 124       j = 0u;
 125
 126       /* column loop */
 127       do
 128       {
 129         /* Set the variable sum, that acts as accumulator, to zero */
 130         sum = 0;
 131
 132         /* Initiate the pointer pIn1 to point to the starting address of pInA */
 133         pIn1 = pInA;
 134
 135         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 136         colCnt = numColsA >> 2;
 137
 138
 139         /* matrix multiplication */
 140         while(colCnt > 0u)
 141         {
 142           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 143           /* Perform the multiply-accumulates */
 144           sum += (q63_t) * pIn1++ * *pIn2;
 145           pIn2 += numColsB;
 146
 147           sum += (q63_t) * pIn1++ * *pIn2;
 148           pIn2 += numColsB;
 149
 150           sum += (q63_t) * pIn1++ * *pIn2;
 151           pIn2 += numColsB;
 152
 153           sum += (q63_t) * pIn1++ * *pIn2;
 154           pIn2 += numColsB;
 155
 156           /* Decrement the loop counter */
 157           colCnt--;
 158         }
 159
 160         /* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here.
 161          ** No loop unrolling is used. */
 162         colCnt = numColsA % 0x4u;
 163
 164         while(colCnt > 0u)
 165         {
 166           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 167           /* Perform the multiply-accumulates */
 168           sum += (q63_t) * pIn1++ * *pIn2;
 169           pIn2 += numColsB;
 170
 171           /* Decrement the loop counter */
 172           colCnt--;
 173         }
 174
 175         /* Convert the result from 2.62 to 1.31 format and store in destination buffer */
 176         *px++ = (q31_t) (sum >> 31);
 177
 178         /* Update the pointer pIn2 to point to the  starting address of the next column */
 179         j++;
 180         pIn2 = (pSrcB->pData) + j;
 181
 182         /* Decrement the column loop counter */
 183         col--;
 184
 185       } while(col > 0u);
 186
 187 #else
 188
 189   /* Run the below code for Cortex-M0 */
 190
 191   q31_t *pInB = pSrcB->pData;                    /* input data matrix pointer B */
 192   uint16_t col, i = 0u, row = numRowsA, colCnt;  /* loop counters */
 193   arm_status status;                             /* status of matrix multiplication */
 194
 195
 196 #ifdef ARM_MATH_MATRIX_CHECK
 197
 198   /* Check for matrix mismatch condition */
 199   if((pSrcA->numCols != pSrcB->numRows) ||
 200      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
 201   {
 202     /* Set status as ARM_MATH_SIZE_MISMATCH */
 203     status = ARM_MATH_SIZE_MISMATCH;
 204   }
 205   else
 206 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
 207
 208   {
 209     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
 210     /* row loop */
 211     do
 212     {
 213       /* Output pointer is set to starting address of the row being processed */
 214       px = pOut + i;
 215
 216       /* For every row wise process, the column loop counter is to be initiated */
 217       col = numColsB;
 218
 219       /* For every row wise process, the pIn2 pointer is set
 220        ** to the starting address of the pSrcB data */
 221       pIn2 = pSrcB->pData;
 222
 223       /* column loop */
 224       do
 225       {
 226         /* Set the variable sum, that acts as accumulator, to zero */
 227         sum = 0;
 228
 229         /* Initiate the pointer pIn1 to point to the starting address of pInA */
 230         pIn1 = pInA;
 231
 232         /* Matrix A columns number of MAC operations are to be performed */
 233         colCnt = numColsA;
 234
 235         /* matrix multiplication */
 236         while(colCnt > 0u)
 237         {
 238           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 239           /* Perform the multiply-accumulates */
 240           sum += (q63_t) * pIn1++ * *pIn2;
 241           pIn2 += numColsB;
 242
 243           /* Decrement the loop counter */
 244           colCnt--;
 245         }
 246
 247         /* Convert the result from 2.62 to 1.31 format and store in destination buffer */
 248         *px++ = (q31_t) (sum >> 31);
 249
 250         /* Decrement the column loop counter */
 251         col--;
 252
 253         /* Update the pointer pIn2 to point to the  starting address of the next column */
 254         pIn2 = pInB + (numColsB - col);
 255
 256       } while(col > 0u);
 257
 258 #endif
 259
 260       /* Update the pointer pInA to point to the  starting address of the next row */
 261       i = i + numColsB;
 262       pInA = pInA + numColsA;
 263
 264       /* Decrement the row loop counter */
 265       row--;
 266
 267     } while(row > 0u);
 268
 269     /* set status as ARM_MATH_SUCCESS */
 270     status = ARM_MATH_SUCCESS;
 271   }
 272   /* Return to application */
 273   return (status);
 274 }
 275
 276 /**
 277  * @} end of MatrixMult group
 278  */