git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_mat_mult_q15.c
   9 *
  10 * Description:   Q15 matrix multiplication.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated.
  28 *
  29 * Version 0.0.5  2010/04/26
  30 *    incorporated review comments and updated with latest CMSIS layer
  31 *
  32 * Version 0.0.3  2010/03/10
  33 *    Initial version
  34 * -------------------------------------------------------------------- */
  35
  36 #include "arm_math.h"
  37
  38 /**
  39  * @ingroup groupMatrix
  40  */
  41
  42 /**
  43  * @addtogroup MatrixMult
  44  * @{
  45  */
  46
  47
  48 /**
  49  * @brief Q15 matrix multiplication
  50  * @param[in]       *pSrcA points to the first input matrix structure
  51  * @param[in]       *pSrcB points to the second input matrix structure
  52  * @param[out]      *pDst points to output matrix structure
  53  * @param[in]           *pState points to the array for storing intermediate results
  54  * @return              The function returns either
  55  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  56  *
  57  * @details
  58  * <b>Scaling and Overflow Behavior:</b>
  59  *
  60  * \par
  61  * The function is implemented using a 64-bit internal accumulator. The inputs to the
  62  * multiplications are in 1.15 format and multiplications yield a 2.30 result.
  63  * The 2.30 intermediate
  64  * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
  65  * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
  66  * truncated to 34.15 format by discarding the low 15 bits and then saturated to
  67  * 1.15 format.
  68  *
  69  * \par
  70  * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
  71  *
  72  */
  73
  74 arm_status arm_mat_mult_q15(
  75   const arm_matrix_instance_q15 * pSrcA,
  76   const arm_matrix_instance_q15 * pSrcB,
  77   arm_matrix_instance_q15 * pDst,
  78   q15_t * pState)
  79 {
  80   q63_t sum;                                     /* accumulator */
  81
  82 #ifndef ARM_MATH_CM0
  83
  84   /* Run the below code for Cortex-M4 and Cortex-M3 */
  85
  86   q31_t in;                                      /* Temporary variable to hold the input value */
  87   q15_t *pSrcBT = pState;                        /* input data matrix pointer for transpose */
  88   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
  89   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
  90   q15_t *px;                                     /* Temporary output data matrix pointer */
  91   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  92   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  93   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  94   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix A    */
  95   uint16_t col, i = 0u, row = numRowsB, colCnt;  /* loop counters */
  96   arm_status status;                             /* status of matrix multiplication */
  97
  98 #ifdef ARM_MATH_MATRIX_CHECK
  99
 100
 101   /* Check for matrix mismatch condition */
 102
 103   if((pSrcA->numCols != pSrcB->numRows) ||
 104      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
 105   {
 106     /* Set status as ARM_MATH_SIZE_MISMATCH */
 107     status = ARM_MATH_SIZE_MISMATCH;
 108   }
 109   else
 110 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
 111
 112   {
 113     /* Matrix transpose */
 114     do
 115     {
 116       /* Apply loop unrolling and exchange the columns with row elements */
 117       col = numColsB >> 2;
 118
 119       /* The pointer px is set to starting address of the column being processed */
 120       px = pSrcBT + i;
 121
 122       /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 123        ** a second loop below computes the remaining 1 to 3 samples. */
 124       while(col > 0u)
 125       {
 126         /* Read two elements from the row */
 127         in = *__SIMD32(pInB)++;
 128
 129         /* Unpack and store one element in the destination */
 130 #ifndef ARM_MATH_BIG_ENDIAN
 131
 132         *px = (q15_t) in;
 133
 134 #else
 135
 136         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 137
 138 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
 139
 140         /* Update the pointer px to point to the next row of the transposed matrix */
 141         px += numRowsB;
 142
 143         /* Unpack and store the second element in the destination */
 144 #ifndef ARM_MATH_BIG_ENDIAN
 145
 146         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 147
 148 #else
 149
 150         *px = (q15_t) in;
 151
 152 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
 153
 154
 155         /* Update the pointer px to point to the next row of the transposed matrix */
 156         px += numRowsB;
 157
 158         /* Read two elements from the row */
 159         in = *__SIMD32(pInB)++;
 160
 161         /* Unpack and store one element in the destination */
 162 #ifndef ARM_MATH_BIG_ENDIAN
 163
 164         *px = (q15_t) in;
 165
 166 #else
 167
 168         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 169
 170 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
 171
 172         /* Update the pointer px to point to the next row of the transposed matrix */
 173         px += numRowsB;
 174
 175         /* Unpack and store the second element in the destination */
 176
 177 #ifndef ARM_MATH_BIG_ENDIAN
 178
 179         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
 180
 181 #else
 182
 183         *px = (q15_t) in;
 184
 185 #endif /*    #ifndef ARM_MATH_BIG_ENDIAN    */
 186
 187         /* Update the pointer px to point to the next row of the transposed matrix */
 188         px += numRowsB;
 189
 190         /* Decrement the column loop counter */
 191         col--;
 192       }
 193
 194       /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
 195        ** No loop unrolling is used. */
 196       col = numColsB % 0x4u;
 197
 198       while(col > 0u)
 199       {
 200         /* Read and store the input element in the destination */
 201         *px = *pInB++;
 202
 203         /* Update the pointer px to point to the next row of the transposed matrix */
 204         px += numRowsB;
 205
 206         /* Decrement the column loop counter */
 207         col--;
 208       }
 209
 210       i++;
 211
 212       /* Decrement the row loop counter */
 213       row--;
 214
 215     } while(row > 0u);
 216
 217     /* Reset the variables for the usage in the following multiplication process */
 218     row = numRowsA;
 219     i = 0u;
 220     px = pDst->pData;
 221
 222     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
 223     /* row loop */
 224     do
 225     {
 226       /* For every row wise process, the column loop counter is to be initiated */
 227       col = numColsB;
 228
 229       /* For every row wise process, the pIn2 pointer is set
 230        ** to the starting address of the transposed pSrcB data */
 231       pInB = pSrcBT;
 232
 233       /* column loop */
 234       do
 235       {
 236         /* Set the variable sum, that acts as accumulator, to zero */
 237         sum = 0;
 238
 239         /* Apply loop unrolling and compute 2 MACs simultaneously. */
 240         colCnt = numColsA >> 1;
 241
 242         /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
 243         pInA = pSrcA->pData + i;
 244
 245         /* matrix multiplication */
 246         while(colCnt > 0u)
 247         {
 248           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 249           sum = __SMLALD(*__SIMD32(pInA)++, *__SIMD32(pInB)++, sum);
 250
 251           /* Decrement the loop counter */
 252           colCnt--;
 253         }
 254
 255         /* process odd column samples */
 256         if((numColsA & 0x1u) > 0u)
 257         {
 258           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 259           sum += ((q31_t) * pInA * (*pInB++));
 260         }
 261
 262         /* Saturate and store the result in the destination buffer */
 263         *px = (q15_t) (__SSAT((sum >> 15), 16));
 264         px++;
 265
 266         /* Decrement the column loop counter */
 267         col--;
 268
 269       } while(col > 0u);
 270
 271       i = i + numColsA;
 272
 273       /* Decrement the row loop counter */
 274       row--;
 275
 276     } while(row > 0u);
 277
 278 #else
 279
 280   /* Run the below code for Cortex-M0 */
 281
 282   q15_t *pIn1 = pSrcA->pData;                    /* input data matrix pointer A */
 283   q15_t *pIn2 = pSrcB->pData;                    /* input data matrix pointer B */
 284   q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
 285   q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
 286   q15_t *pOut = pDst->pData;                     /* output data matrix pointer */
 287   q15_t *px;                                     /* Temporary output data matrix pointer */
 288   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
 289   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
 290   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
 291   uint16_t col, i = 0u, row = numRowsA, colCnt;  /* loop counters */
 292   arm_status status;                             /* status of matrix multiplication */
 293
 294 #ifdef ARM_MATH_MATRIX_CHECK
 295
 296   /* Check for matrix mismatch condition */
 297   if((pSrcA->numCols != pSrcB->numRows) ||
 298      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
 299   {
 300     /* Set status as ARM_MATH_SIZE_MISMATCH */
 301     status = ARM_MATH_SIZE_MISMATCH;
 302   }
 303   else
 304 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
 305
 306   {
 307     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
 308     /* row loop */
 309     do
 310     {
 311       /* Output pointer is set to starting address of the row being processed */
 312       px = pOut + i;
 313
 314       /* For every row wise process, the column loop counter is to be initiated */
 315       col = numColsB;
 316
 317       /* For every row wise process, the pIn2 pointer is set
 318        ** to the starting address of the pSrcB data */
 319       pIn2 = pSrcB->pData;
 320
 321       /* column loop */
 322       do
 323       {
 324         /* Set the variable sum, that acts as accumulator, to zero */
 325         sum = 0;
 326
 327         /* Initiate the pointer pIn1 to point to the starting address of pSrcA */
 328         pIn1 = pInA;
 329
 330         /* Matrix A columns number of MAC operations are to be performed */
 331         colCnt = numColsA;
 332
 333         /* matrix multiplication */
 334         while(colCnt > 0u)
 335         {
 336           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 337           /* Perform the multiply-accumulates */
 338           sum += (q31_t) * pIn1++ * *pIn2;
 339           pIn2 += numColsB;
 340
 341           /* Decrement the loop counter */
 342           colCnt--;
 343         }
 344
 345         /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
 346         /* Saturate and store the result in the destination buffer */
 347         *px++ = (q15_t) __SSAT((sum >> 15), 16);
 348
 349         /* Decrement the column loop counter */
 350         col--;
 351
 352         /* Update the pointer pIn2 to point to the  starting address of the next column */
 353         pIn2 = pInB + (numColsB - col);
 354
 355       } while(col > 0u);
 356
 357       /* Update the pointer pSrcA to point to the  starting address of the next row */
 358       i = i + numColsB;
 359       pInA = pInA + numColsA;
 360
 361       /* Decrement the row loop counter */
 362       row--;
 363
 364     } while(row > 0u);
 365
 366 #endif /* #ifndef ARM_MATH_CM0 */
 367
 368     /* set status as ARM_MATH_SUCCESS */
 369     status = ARM_MATH_SUCCESS;
 370   }
 371
 372   /* Return to application */
 373   return (status);
 374 }
 375
 376 /**
 377  * @} end of MatrixMult group
 378  */