git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_fast_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_mat_mult_fast_q31.c
   9 *
  10 * Description:   Q31 matrix multiplication (fast variant).
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated.
  28 * -------------------------------------------------------------------- */
  29
  30 #include "arm_math.h"
  31
  32 /**
  33  * @ingroup groupMatrix
  34  */
  35
  36 /**
  37  * @addtogroup MatrixMult
  38  * @{
  39  */
  40
  41 /**
  42  * @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
  43  * @param[in]       *pSrcA points to the first input matrix structure
  44  * @param[in]       *pSrcB points to the second input matrix structure
  45  * @param[out]      *pDst points to output matrix structure
  46  * @return              The function returns either
  47  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  48  *
  49  * @details
  50  * <b>Scaling and Overflow Behavior:</b>
  51  *
  52  * \par
  53  * The difference between the function arm_mat_mult_q31() and this fast variant is that
  54  * the fast variant use a 32-bit rather than a 64-bit accumulator.
  55  * The result of each 1.31 x 1.31 multiplication is truncated to
  56  * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
  57  * format. Finally, the accumulator is saturated and converted to a 1.31 result.
  58  *
  59  * \par
  60  * The fast version has the same overflow behavior as the standard version but provides
  61  * less precision since it discards the low 32 bits of each multiplication result.
  62  * In order to avoid overflows completely the input signals must be scaled down.
  63  * Scale down one of the input matrices by log2(numColsA) bits to
  64  * avoid overflows, as a total of numColsA additions are computed internally for each
  65  * output element.
  66  *
  67  * \par
  68  * See <code>arm_mat_mult_q31()</code> for a slower implementation of this function
  69  * which uses 64-bit accumulation to provide higher precision.
  70  */
  71
  72 arm_status arm_mat_mult_fast_q31(
  73   const arm_matrix_instance_q31 * pSrcA,
  74   const arm_matrix_instance_q31 * pSrcB,
  75   arm_matrix_instance_q31 * pDst)
  76 {
  77   q31_t *pIn1 = pSrcA->pData;                    /* input data matrix pointer A */
  78   q31_t *pIn2 = pSrcB->pData;                    /* input data matrix pointer B */
  79   q31_t *pInA = pSrcA->pData;                    /* input data matrix pointer A */
  80 //  q31_t *pSrcB = pSrcB->pData;                    /* input data matrix pointer B */
  81   q31_t *pOut = pDst->pData;                     /* output data matrix pointer */
  82   q31_t *px;                                     /* Temporary output data matrix pointer */
  83   q31_t sum;                                     /* Accumulator */
  84   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
  85   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  86   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
  87   uint16_t col, i = 0u, j, row = numRowsA, colCnt;      /* loop counters */
  88   arm_status status;                             /* status of matrix multiplication */
  89
  90
  91 #ifdef ARM_MATH_MATRIX_CHECK
  92
  93
  94   /* Check for matrix mismatch condition */
  95   if((pSrcA->numCols != pSrcB->numRows) ||
  96      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  97   {
  98     /* Set status as ARM_MATH_SIZE_MISMATCH */
  99     status = ARM_MATH_SIZE_MISMATCH;
 100   }
 101   else
 102 #endif /*      #ifdef ARM_MATH_MATRIX_CHECK    */
 103
 104   {
 105     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
 106     /* row loop */
 107     do
 108     {
 109       /* Output pointer is set to starting address of the row being processed */
 110       px = pOut + i;
 111
 112       /* For every row wise process, the column loop counter is to be initiated */
 113       col = numColsB;
 114
 115       /* For every row wise process, the pIn2 pointer is set
 116        ** to the starting address of the pSrcB data */
 117       pIn2 = pSrcB->pData;
 118
 119       j = 0u;
 120
 121       /* column loop */
 122       do
 123       {
 124         /* Set the variable sum, that acts as accumulator, to zero */
 125         sum = 0;
 126
 127         /* Initiate the pointer pIn1 to point to the starting address of pInA */
 128         pIn1 = pInA;
 129
 130         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 131         colCnt = numColsA >> 2;
 132
 133
 134         /* matrix multiplication */
 135         while(colCnt > 0u)
 136         {
 137           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 138           /* Perform the multiply-accumulates */
 139           sum = (q31_t) ((((q63_t) sum << 32) +
 140                           ((q63_t) * pIn1++ * (*pIn2))) >> 32);
 141           pIn2 += numColsB;
 142           sum = (q31_t) ((((q63_t) sum << 32) +
 143                           ((q63_t) * pIn1++ * (*pIn2))) >> 32);
 144           pIn2 += numColsB;
 145           sum = (q31_t) ((((q63_t) sum << 32) +
 146                           ((q63_t) * pIn1++ * (*pIn2))) >> 32);
 147           pIn2 += numColsB;
 148           sum = (q31_t) ((((q63_t) sum << 32) +
 149                           ((q63_t) * pIn1++ * (*pIn2))) >> 32);
 150           pIn2 += numColsB;
 151
 152           /* Decrement the loop counter */
 153           colCnt--;
 154         }
 155
 156         /* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here.
 157          ** No loop unrolling is used. */
 158         colCnt = numColsA % 0x4u;
 159
 160         while(colCnt > 0u)
 161         {
 162           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
 163           /* Perform the multiply-accumulates */
 164           sum = (q31_t) ((((q63_t) sum << 32) +
 165                           ((q63_t) * pIn1++ * (*pIn2))) >> 32);
 166           pIn2 += numColsB;
 167
 168           /* Decrement the loop counter */
 169           colCnt--;
 170         }
 171
 172         /* Convert the result from 2.30 to 1.31 format and store in destination buffer */
 173         *px++ = sum << 1;
 174
 175         /* Update the pointer pIn2 to point to the  starting address of the next column */
 176         j++;
 177         pIn2 = pSrcB->pData + j;
 178
 179         /* Decrement the column loop counter */
 180         col--;
 181
 182       } while(col > 0u);
 183
 184       /* Update the pointer pInA to point to the  starting address of the next row */
 185       i = i + numColsB;
 186       pInA = pInA + numColsA;
 187
 188       /* Decrement the row loop counter */
 189       row--;
 190
 191     } while(row > 0u);
 192
 193     /* set status as ARM_MATH_SUCCESS */
 194     status = ARM_MATH_SUCCESS;
 195   }
 196   /* Return to application */
 197   return (status);
 198 }
 199
 200 /**
 201  * @} end of MatrixMult group
 202  */