git.gag.com Git - fw/stlink/blob - exampleF4/CMSIS/DSP_Lib/Source/TransformFunctions/arm_dct4_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        15. July 2011
   5 * $Revision:    V1.0.10
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_dct4_q15.c
   9 *
  10 * Description:  Processing function of DCT4 & IDCT4 Q15.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Version 1.0.10 2011/7/15
  15 *    Big Endian support added and Merged M0 and M3/M4 Source code.
  16 *
  17 * Version 1.0.3 2010/11/29
  18 *    Re-organized the CMSIS folders and updated documentation.
  19 *
  20 * Version 1.0.2 2010/11/11
  21 *    Documentation updated.
  22 *
  23 * Version 1.0.1 2010/10/05
  24 *    Production release and review comments incorporated.
  25 *
  26 * Version 1.0.0 2010/09/20
  27 *    Production release and review comments incorporated.
  28 * -------------------------------------------------------------------- */
  29
  30 #include "arm_math.h"
  31
  32 /**
  33  * @addtogroup DCT4_IDCT4
  34  * @{
  35  */
  36
  37 /**
  38  * @brief Processing function for the Q15 DCT4/IDCT4.
  39  * @param[in]       *S             points to an instance of the Q15 DCT4 structure.
  40  * @param[in]       *pState        points to state buffer.
  41  * @param[in,out]   *pInlineBuffer points to the in-place input and output buffer.
  42  * @return none.
  43  *
  44  * \par Input an output formats:
  45  * Internally inputs are downscaled in the RFFT process function to avoid overflows.
  46  * Number of bits downscaled, depends on the size of the transform.
  47  * The input and output formats for different DCT sizes and number of bits to upscale are mentioned in the table below:
  48  *
  49  * \image html dct4FormatsQ15Table.gif
  50  */
  51
  52 void arm_dct4_q15(
  53   const arm_dct4_instance_q15 * S,
  54   q15_t * pState,
  55   q15_t * pInlineBuffer)
  56 {
  57   uint32_t i;                                    /* Loop counter */
  58   q15_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
  59   q15_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
  60   q15_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
  61   q15_t in;                                      /* Temporary variable */
  62
  63
  64   /* DCT4 computation involves DCT2 (which is calculated using RFFT)
  65    * along with some pre-processing and post-processing.
  66    * Computational procedure is explained as follows:
  67    * (a) Pre-processing involves multiplying input with cos factor,
  68    *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
  69    *              where,
  70    *                 r(n) -- output of preprocessing
  71    *                 u(n) -- input to preprocessing(actual Source buffer)
  72    * (b) Calculation of DCT2 using FFT is divided into three steps:
  73    *                  Step1: Re-ordering of even and odd elements of input.
  74    *                  Step2: Calculating FFT of the re-ordered input.
  75    *                  Step3: Taking the real part of the product of FFT output and weights.
  76    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
  77    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
  78    *                        where,
  79    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
  80    * (d) Multiplying the output with the normalizing factor sqrt(2/N).
  81    */
  82
  83         /*-------- Pre-processing ------------*/
  84   /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
  85   arm_mult_q15(pInlineBuffer, cosFact, pInlineBuffer, S->N);
  86   arm_shift_q15(pInlineBuffer, 1, pInlineBuffer, S->N);
  87
  88   /* ----------------------------------------------------------------
  89    * Step1: Re-ordering of even and odd elements as
  90    *             pState[i] =  pInlineBuffer[2*i] and
  91    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
  92    ---------------------------------------------------------------------*/
  93
  94   /* pS1 initialized to pState */
  95   pS1 = pState;
  96
  97   /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
  98   pS2 = pState + (S->N - 1u);
  99
 100   /* pbuff initialized to input buffer */
 101   pbuff = pInlineBuffer;
 102
 103
 104 #ifndef ARM_MATH_CM0
 105
 106   /* Run the below code for Cortex-M4 and Cortex-M3 */
 107
 108   /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
 109   i = (uint32_t) S->Nby2 >> 2u;
 110
 111   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 112    ** a second loop below computes the remaining 1 to 3 samples. */
 113   do
 114   {
 115     /* Re-ordering of even and odd elements */
 116     /* pState[i] =  pInlineBuffer[2*i] */
 117     *pS1++ = *pbuff++;
 118     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
 119     *pS2-- = *pbuff++;
 120
 121     *pS1++ = *pbuff++;
 122     *pS2-- = *pbuff++;
 123
 124     *pS1++ = *pbuff++;
 125     *pS2-- = *pbuff++;
 126
 127     *pS1++ = *pbuff++;
 128     *pS2-- = *pbuff++;
 129
 130     /* Decrement the loop counter */
 131     i--;
 132   } while(i > 0u);
 133
 134   /* pbuff initialized to input buffer */
 135   pbuff = pInlineBuffer;
 136
 137   /* pS1 initialized to pState */
 138   pS1 = pState;
 139
 140   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
 141   i = (uint32_t) S->N >> 2u;
 142
 143   /* Processing with loop unrolling 4 times as N is always multiple of 4.
 144    * Compute 4 outputs at a time */
 145   do
 146   {
 147     /* Writing the re-ordered output back to inplace input buffer */
 148     *pbuff++ = *pS1++;
 149     *pbuff++ = *pS1++;
 150     *pbuff++ = *pS1++;
 151     *pbuff++ = *pS1++;
 152
 153     /* Decrement the loop counter */
 154     i--;
 155   } while(i > 0u);
 156
 157
 158   /* ---------------------------------------------------------
 159    *     Step2: Calculate RFFT for N-point input
 160    * ---------------------------------------------------------- */
 161   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
 162   arm_rfft_q15(S->pRfft, pInlineBuffer, pState);
 163
 164  /*----------------------------------------------------------------------
 165   *  Step3: Multiply the FFT output with the weights.
 166   *----------------------------------------------------------------------*/
 167   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N);
 168
 169   /* The output of complex multiplication is in 3.13 format.
 170    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
 171   arm_shift_q15(pState, 2, pState, S->N * 2);
 172
 173   /* ----------- Post-processing ---------- */
 174   /* DCT-IV can be obtained from DCT-II by the equation,
 175    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
 176    *       Hence, Y4(0) = Y2(0)/2  */
 177   /* Getting only real part from the output and Converting to DCT-IV */
 178
 179   /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
 180   i = ((uint32_t) S->N - 1u) >> 2u;
 181
 182   /* pbuff initialized to input buffer. */
 183   pbuff = pInlineBuffer;
 184
 185   /* pS1 initialized to pState */
 186   pS1 = pState;
 187
 188   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
 189   in = *pS1++ >> 1u;
 190   /* input buffer acts as inplace, so output values are stored in the input itself. */
 191   *pbuff++ = in;
 192
 193   /* pState pointer is incremented twice as the real values are located alternatively in the array */
 194   pS1++;
 195
 196   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 197    ** a second loop below computes the remaining 1 to 3 samples. */
 198   do
 199   {
 200     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
 201     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
 202     in = *pS1++ - in;
 203     *pbuff++ = in;
 204     /* points to the next real value */
 205     pS1++;
 206
 207     in = *pS1++ - in;
 208     *pbuff++ = in;
 209     pS1++;
 210
 211     in = *pS1++ - in;
 212     *pbuff++ = in;
 213     pS1++;
 214
 215     in = *pS1++ - in;
 216     *pbuff++ = in;
 217     pS1++;
 218
 219     /* Decrement the loop counter */
 220     i--;
 221   } while(i > 0u);
 222
 223   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
 224    ** No loop unrolling is used. */
 225   i = ((uint32_t) S->N - 1u) % 0x4u;
 226
 227   while(i > 0u)
 228   {
 229     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
 230     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
 231     in = *pS1++ - in;
 232     *pbuff++ = in;
 233     /* points to the next real value */
 234     pS1++;
 235
 236     /* Decrement the loop counter */
 237     i--;
 238   }
 239
 240
 241    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
 242
 243   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
 244   i = (uint32_t) S->N >> 2u;
 245
 246   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
 247   pbuff = pInlineBuffer;
 248
 249   /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
 250   do
 251   {
 252     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
 253     in = *pbuff;
 254     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 255
 256     in = *pbuff;
 257     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 258
 259     in = *pbuff;
 260     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 261
 262     in = *pbuff;
 263     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 264
 265     /* Decrement the loop counter */
 266     i--;
 267   } while(i > 0u);
 268
 269
 270 #else
 271
 272   /* Run the below code for Cortex-M0 */
 273
 274   /* Initializing the loop counter to N/2 */
 275   i = (uint32_t) S->Nby2;
 276
 277   do
 278   {
 279     /* Re-ordering of even and odd elements */
 280     /* pState[i] =  pInlineBuffer[2*i] */
 281     *pS1++ = *pbuff++;
 282     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
 283     *pS2-- = *pbuff++;
 284
 285     /* Decrement the loop counter */
 286     i--;
 287   } while(i > 0u);
 288
 289   /* pbuff initialized to input buffer */
 290   pbuff = pInlineBuffer;
 291
 292   /* pS1 initialized to pState */
 293   pS1 = pState;
 294
 295   /* Initializing the loop counter */
 296   i = (uint32_t) S->N;
 297
 298   do
 299   {
 300     /* Writing the re-ordered output back to inplace input buffer */
 301     *pbuff++ = *pS1++;
 302
 303     /* Decrement the loop counter */
 304     i--;
 305   } while(i > 0u);
 306
 307
 308   /* ---------------------------------------------------------
 309    *     Step2: Calculate RFFT for N-point input
 310    * ---------------------------------------------------------- */
 311   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
 312   arm_rfft_q15(S->pRfft, pInlineBuffer, pState);
 313
 314  /*----------------------------------------------------------------------
 315   *  Step3: Multiply the FFT output with the weights.
 316   *----------------------------------------------------------------------*/
 317   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N);
 318
 319   /* The output of complex multiplication is in 3.13 format.
 320    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
 321   arm_shift_q15(pState, 2, pState, S->N * 2);
 322
 323   /* ----------- Post-processing ---------- */
 324   /* DCT-IV can be obtained from DCT-II by the equation,
 325    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
 326    *       Hence, Y4(0) = Y2(0)/2  */
 327   /* Getting only real part from the output and Converting to DCT-IV */
 328
 329   /* Initializing the loop counter */
 330   i = ((uint32_t) S->N - 1u);
 331
 332   /* pbuff initialized to input buffer. */
 333   pbuff = pInlineBuffer;
 334
 335   /* pS1 initialized to pState */
 336   pS1 = pState;
 337
 338   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
 339   in = *pS1++ >> 1u;
 340   /* input buffer acts as inplace, so output values are stored in the input itself. */
 341   *pbuff++ = in;
 342
 343   /* pState pointer is incremented twice as the real values are located alternatively in the array */
 344   pS1++;
 345
 346   do
 347   {
 348     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
 349     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
 350     in = *pS1++ - in;
 351     *pbuff++ = in;
 352     /* points to the next real value */
 353     pS1++;
 354
 355     /* Decrement the loop counter */
 356     i--;
 357   } while(i > 0u);
 358
 359    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
 360
 361   /* Initializing the loop counter */
 362   i = (uint32_t) S->N;
 363
 364   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
 365   pbuff = pInlineBuffer;
 366
 367   do
 368   {
 369     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
 370     in = *pbuff;
 371     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 372
 373     /* Decrement the loop counter */
 374     i--;
 375   } while(i > 0u);
 376
 377 #endif /* #ifndef ARM_MATH_CM0 */
 378
 379 }
 380
 381 /**
 382    * @} end of DCT4_IDCT4 group
 383    */