git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_fast_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_mat_mult_fast_q15.c
   9 *
  10 * Description:   Q15 matrix multiplication (fast variant)
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated.
  28 * -------------------------------------------------------------------- */
  29
  30 #include "arm_math.h"
  31
  32 /**
  33  * @ingroup groupMatrix
  34  */
  35
  36 /**
  37  * @addtogroup MatrixMult
  38  * @{
  39  */
  40
  41
  42 /**
  43  * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
  44  * @param[in]       *pSrcA points to the first input matrix structure
  45  * @param[in]       *pSrcB points to the second input matrix structure
  46  * @param[out]      *pDst points to output matrix structure
  47  * @param[in]           *pState points to the array for storing intermediate results
  48  * @return              The function returns either
  49  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  50  *
  51  * @details
  52  * <b>Scaling and Overflow Behavior:</b>
  53  *
  54  * \par
  55  * The difference between the function arm_mat_mult_q15() and this fast variant is that
  56  * the fast variant use a 32-bit rather than a 64-bit accumulator.
  57  * The result of each 1.15 x 1.15 multiplication is truncated to
  58  * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
  59  * format. Finally, the accumulator is saturated and converted to a 1.15 result.
  60  *
  61  * \par
  62  * The fast version has the same overflow behavior as the standard version but provides
  63  * less precision since it discards the low 16 bits of each multiplication result.
  64  * In order to avoid overflows completely the input signals must be scaled down.
  65  * Scale down one of the input matrices by log2(numColsA) bits to
  66  * avoid overflows, as a total of numColsA additions are computed internally for each
  67  * output element.
  68  *
  69  * \par
  70  * See <code>arm_mat_mult_q15()</code> for a slower implementation of this function
  71  * which uses 64-bit accumulation to provide higher precision.
  72  */
  73
  74 arm_status arm_mat_mult_fast_q15(
  75   const arm_matrix_instance_q15 * pSrcA,
  76   const arm_matrix_instance_q15 * pSrcB,
  77   arm_matrix_instance_q15 * pDst,
  78   q15_t * pState)
  79 {
  80   q31_t sum;                                     /* accumulator */
  81   q31_t in;                                      /* Temporary variable to hold the input value */
  82   q15_t *pSrcBT = pState;                        /* input data matrix pointer for transpose */
  83   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
  84   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
  85 //  q15_t *pDst = pDst->pData;                   /* output data matrix pointer */
  86   q15_t *px;                                     /* Temporary output data matrix pointer */
  87   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  88   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  89   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  90   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix A    */
  91   uint16_t col, i = 0u, row = numRowsB, colCnt;  /* loop counters */
  92   arm_status status;                             /* status of matrix multiplication */
  93
  94 #ifdef ARM_MATH_MATRIX_CHECK
  95
  96
  97   /* Check for matrix mismatch condition */
  98
  99   if((pSrcA->numCols != pSrcB->numRows) ||
 100      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
 101   {
 102     /* Set status as ARM_MATH_SIZE_MISMATCH */
 103     status = ARM_MATH_SIZE_MISMATCH;
 104   }
 105   else
 106 #endif /*      #ifdef ARM_MATH_MATRIX_CHECK    */
 107
 108   {
 109     /* Matrix transpose */
 110     do
 111     {
 112       /* Apply loop unrolling and exchange the columns with row elements */
 113       col = numColsB >> 2;
 114
 115       /* The pointer px is set to starting address of the column being processed */
 116       px = pSrcBT + i;
 117
 118       /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 119        ** a second loop below computes the remaining 1 to 3 samples. */
 120       while(col > 0u)
 121       {
 122         /* Read two elements from the row */
 123         in = *__SIMD32(pInB)++;
 124
 125         /* Unpack and store one element in the destination */
 126 #ifndef ARM_MATH_BIG_ENDIAN
 127
 128         *px = (q15_t) in;
 129
 130 #else
 131
 132         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 133
 134 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
 135
 136         /* Update the pointer px to point to the next row of the transposed matrix */
 137         px += numRowsB;
 138
 139         /* Unpack and store the second element in the destination */
 140 #ifndef ARM_MATH_BIG_ENDIAN
 141
 142         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 143
 144 #else
 145
 146         *px = (q15_t) in;
 147
 148 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
 149
 150
 151         /* Update the pointer px to point to the next row of the transposed matrix */
 152         px += numRowsB;
 153
 154         /* Read two elements from the row */
 155         in = *__SIMD32(pInB)++;
 156
 157         /* Unpack and store one element in the destination */
 158 #ifndef ARM_MATH_BIG_ENDIAN
 159
 160         *px = (q15_t) in;
 161
 162 #else
 163
 164         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 165
 166 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
 167
 168         /* Update the pointer px to point to the next row of the transposed matrix */
 169         px += numRowsB;
 170
 171         /* Unpack and store the second element in the destination */
 172
 173 #ifndef ARM_MATH_BIG_ENDIAN
 174
 175         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 176
 177 #else
 178
 179         *px = (q15_t) in;
 180
 181 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
 182
 183         /* Update the pointer px to point to the next row of the transposed matrix */
 184         px += numRowsB;
 185
 186         /* Decrement the column loop counter */
 187         col--;
 188       }
 189
 190       /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
 191        ** No loop unrolling is used. */
 192       col = numColsB % 0x4u;
 193
 194       while(col > 0u)
 195       {
 196         /* Read and store the input element in the destination */
 197         *px = *pInB++;
 198
 199         /* Update the pointer px to point to the next row of the transposed matrix */
 200         px += numRowsB;
 201
 202         /* Decrement the column loop counter */
 203         col--;
 204       }
 205
 206       i++;
 207
 208       /* Decrement the row loop counter */
 209       row--;
 210
 211     } while(row > 0u);
 212
 213     /* Reset the variables for the usage in the following multiplication process */
 214     row = numRowsA;
 215     i = 0u;
 216     px = pDst->pData;
 217
 218     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
 219     /* row loop */
 220     do
 221     {
 222       /* For every row wise process, the column loop counter is to be initiated */
 223       col = numColsB;
 224
 225       /* For every row wise process, the pIn2 pointer is set
 226        ** to the starting address of the transposed pSrcB data */
 227       pInB = pSrcBT;
 228
 229       /* column loop */
 230       do
 231       {
 232         /* Set the variable sum, that acts as accumulator, to zero */
 233         sum = 0;
 234
 235         /* Apply loop unrolling and compute 2 MACs simultaneously. */
 236         colCnt = numColsA >> 1;
 237
 238         /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
 239         pInA = pSrcA->pData + i;
 240
 241         /* matrix multiplication */
 242         while(colCnt > 0u)
 243         {
 244           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 245           sum = __SMLAD(*__SIMD32(pInA)++, *__SIMD32(pInB)++, sum);
 246
 247           /* Decrement the loop counter */
 248           colCnt--;
 249         }
 250
 251         /* process odd column samples */
 252         if((numColsA & 0x1u) > 0u)
 253         {
 254           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 255           sum += ((q31_t) * pInA * (*pInB++));
 256         }
 257
 258         /* Saturate and store the result in the destination buffer */
 259         *px = (q15_t) (sum >> 15);
 260         px++;
 261
 262         /* Decrement the column loop counter */
 263         col--;
 264
 265       } while(col > 0u);
 266
 267       i = i + numColsA;
 268
 269       /* Decrement the row loop counter */
 270       row--;
 271
 272     } while(row > 0u);
 273
 274     /* set status as ARM_MATH_SUCCESS */
 275     status = ARM_MATH_SUCCESS;
 276   }
 277
 278   /* Return to application */
 279   return (status);
 280 }
 281
 282 /**
 283  * @} end of MatrixMult group
 284  */