#include <ao_fec.h>
#include <stdio.h>
+#ifdef TELEMEGA
+#include <ao.h>
+#endif
+
+#if AO_PROFILE
+#include <ao_profile.h>
+
+uint32_t ao_fec_decode_start, ao_fec_decode_end; /* profile ticks stored by ao_fec_decode() at its done label: tick taken before the decode loop and tick taken on exit */
+#endif
+
/*
* byte order repeats through 3 2 1 0
*
* 18/19 10/11 08/09 00/01
*/
/*
 * Precomputed replacement values for bits 1-4 (mask 0x1e) of an input
 * byte index. ao_interleave_index() looks up entry (i & 0x1e) >> 1.
 * Each entry equals what the removed bit-twiddling code computed:
 * swap the two 2-bit halves of the 4-bit field, then invert all four
 * bits (0x1e ^ ...).
 */
+static const uint8_t ao_interleave_order[] = {
+ 0x1e, 0x16, 0x0e, 0x06,
+ 0x1c, 0x14, 0x0c, 0x04,
+ 0x1a, 0x12, 0x0a, 0x02,
+ 0x18, 0x10, 0x08, 0x00
+};
+
/*
 * Map an input byte index to its position in the interleaved buffer.
 *
 * Bits outside the 0x1e mask (bit 0 and bits 5 and up) pass through
 * unchanged; bits 1-4 are remapped. Because bit 0 is preserved, an
 * even index i and i + 1 map to adjacent bytes.
 *
 * The lines marked with a leading minus are the old computed form; the
 * line marked with a leading plus is the table-driven replacement that
 * yields the same value.
 */
static inline uint16_t ao_interleave_index(uint16_t i) {
- uint8_t l = i & 0x1e;
- uint16_t h = i & ~0x1e;
- uint8_t o = 0x1e ^ (((l >> 2) & 0x6) | ((l << 2) & 0x18));
- return h | o;
+ return (i & ~0x1e) | ao_interleave_order[(i & 0x1e) >> 1];
}
/*
 * Old representation of one received symbol pair: two 8-bit
 * soft-decision values. Removed by this patch; the decoder now keeps
 * the two bytes in separate s0 and s1 locals.
 */
-struct ao_soft_sym {
- uint8_t a, b;
-};
-
#define NUM_STATE 8 /* number of decoder states (2^3) */
-#define NUM_HIST 8
-#define MOD_HIST(b) ((b) & 7)
-
-#define V_0 0xc0
-#define V_1 0x40
-
-static const struct ao_soft_sym ao_fec_decode_table[NUM_STATE][2] = {
-/* next 0 1 state */
- { { V_0, V_0 }, { V_1, V_1 } } , /* 000 */
- { { V_0, V_1 }, { V_1, V_0 } }, /* 001 */
- { { V_1, V_1 }, { V_0, V_0 } }, /* 010 */
- { { V_1, V_0 }, { V_0, V_1 } }, /* 011 */
- { { V_1, V_1 }, { V_0, V_0 } }, /* 100 */
- { { V_1, V_0 }, { V_0, V_1 } }, /* 101 */
- { { V_0, V_0 }, { V_1, V_1 } }, /* 110 */
- { { V_0, V_1 }, { V_1, V_0 } } /* 111 */
+#define NUM_HIST 24 /* NOTE(review): not referenced in the visible code; presumably the bit-history depth - confirm */
+
+typedef uint32_t bits_t; /* element type of the per-state bit history array bits[][] */
+
+#define V_0 0xff /* XOR mask: s ^ 0xff == 255 - s for an 8-bit s */
+#define V_1 0x00 /* XOR mask: s ^ 0x00 == s */
+
+/*
+ * These are just the 'zero' states; the 'one' states mirror them.
+ *
+ * Each state owns two consecutive entries (index state*2 and
+ * state*2+1), one XOR mask per received soft-decision byte. DO_STATE
+ * sums (s0 ^ mask0) + (s1 ^ mask1) to get a bit cost and uses 510
+ * minus that sum wherever the expected bytes are inverted.
+ * Rows 100-111 are the bytewise complements of rows 000-011.
+ */
+static const uint8_t ao_fec_decode_table[NUM_STATE*2] = {
+ V_0, V_0, /* 000 */
+ V_0, V_1, /* 001 */
+ V_1, V_1, /* 010 */
+ V_1, V_0, /* 011 */
+ V_1, V_1, /* 100 */
+ V_1, V_0, /* 101 */
+ V_0, V_0, /* 110 */
+ V_0, V_1 /* 111 */
};
static inline uint8_t
return ((state << 1) | bit) & 0x7;
}
/* Old helper (removed by this patch): absolute value of x, returned as an unsigned 16-bit value. */
-static inline uint16_t ao_abs(int16_t x) { return x < 0 ? -x : x; }
-
/*
 * Old helper (removed by this patch): distance between two symbol
 * pairs, computed as the sum of the absolute differences of their a
 * fields and of their b fields.
 */
-static inline uint16_t
-ao_cost(struct ao_soft_sym a, struct ao_soft_sym b)
-{
- return ao_abs(a.a - b.a) + ao_abs(a.b - b.b);
-}
-
/*
* 'in' is 8-bits per symbol soft decision data
* 'len' is input byte length. 'out' must be
*/
/*
 * ao_fec_decode: decode interleaved soft-decision input into output bytes.
 *
 * Input is read two bytes (one symbol pair) per step through
 * ao_interleave_index(); per-state path costs and bit histories are kept
 * in cost[][] and bits[][]; each emitted byte is XORed with the next
 * entry of ao_fec_whiten_table.
 *
 * in        soft-decision input bytes
 * len       input length in bytes
 * out       output buffer
 * out_len   number of output bytes to produce
 * callback  called when no input is available; returns the number of
 *           input bytes now available, or 0 to abort the decode
 *
 * Returns 0 when callback reports no input; otherwise control reaches
 * the done label and returns 1. NOTE(review): this listing is a patch
 * with elided context, so other exits may exist outside the visible
 * lines.
 *
 * Output trailer: every byte except the final two is fed to
 * ao_fec_crc_byte(). After the last byte is written, the final two
 * bytes are compared with the high and low bytes of the CRC; the last
 * byte is then overwritten with AO_FEC_DECODE_CRC_OK on a match or 0
 * otherwise, and the second-to-last byte is set to 0 in both cases
 * (that assignment is not part of the else branch).
 *
 * NOTE(review): in the visible code an out_len of 0 would wrap in
 * --out_len and an out_len of 1 would make out[-2] index before the
 * buffer; confirm callers always pass at least 2.
 */
uint8_t
-ao_fec_decode(uint8_t *in, uint16_t len, uint8_t *out, uint8_t out_len, uint16_t (*callback)())
+ao_fec_decode(const uint8_t *in, uint16_t len, uint8_t *out, uint8_t out_len, uint16_t (*callback)(void))
{
- static uint16_t cost[2][NUM_STATE]; /* path cost */
- static uint16_t bits[2][NUM_STATE]; /* save bits to quickly output them */
+ static uint32_t cost[2][NUM_STATE]; /* path cost */
+ static bits_t bits[2][NUM_STATE]; /* save bits to quickly output them */
+
uint16_t i; /* input byte index */
uint16_t b; /* encoded symbol index (bytes/2) */
uint16_t o; /* output bit index */
uint8_t p; /* previous cost/bits index */
uint8_t n; /* next cost/bits index */
uint8_t state; /* state index */
- uint8_t bit; /* original encoded bit index */
const uint8_t *whiten = ao_fec_whiten_table;
uint16_t interleave; /* input byte array index */
- struct ao_soft_sym s; /* input symbol pair */
+ uint8_t s0, s1; /* the two soft-decision bytes of the current symbol pair */
uint16_t avail; /* input bytes currently available; refilled through callback() */
+ uint16_t crc = AO_FEC_CRC_INIT; /* running CRC over the decoded bytes that precede the final two */
+#if AO_PROFILE
+ uint32_t start_tick;
+#endif
p = 0;
for (state = 0; state < NUM_STATE; state++) {
- cost[0][state] = 0xffff;
+ cost[0][state] = 0x7fffffff; /* very high starting cost; only state 0 is reset to 0 below */
bits[0][state] = 0;
}
cost[0][0] = 0;
else
avail = len;
+#if AO_PROFILE
+ if (!avail) {
+ avail = callback();
+ if (!avail)
+ return 0; /* callback supplied no input */
+ }
+ start_tick = ao_profile_tick(); /* taken after input is available, before the decode loop */
+#endif
o = 0;
for (i = 0; i < len; i += 2) {
b = i/2;
if (!avail) {
avail = callback();
if (!avail)
- break;
+ return 0; /* callback supplied no input */
}
/* Fetch one pair of input bytes, de-interleaving
* the input.
*/
interleave = ao_interleave_index(i);
- s.a = in[interleave];
- s.b = in[interleave+1];
+ s0 = in[interleave];
+ s1 = in[interleave+1];
- /* Reset next costs to 'impossibly high' values so that
- * the first path through this state is cheaper than this
- */
- for (state = 0; state < NUM_STATE; state++)
- cost[n][state] = 0xffff;
+ avail -= 2; /* one symbol pair (two bytes) consumed */
/* Compute path costs and accumulate output bit path
- * for each state and encoded bit value
+ * for each state and encoded bit value. Unrolling
+ * this loop is worth more than a 30% performance boost.
+ * Decoding 76-byte remote access packets is reduced
+ * from 14.700ms to 9.3ms. Redoing the loop to
+ * directly compare the two previous states for each future state
+ * reduces this down to 5.7ms
*/
- for (state = 0; state < NUM_STATE; state++) {
- for (bit = 0; bit < 2; bit++) {
- int bit_cost = cost[p][state] + ao_cost(s, ao_fec_decode_table[state][bit]);
- uint8_t bit_state = ao_next_state(state, bit);
-
- /* Only track the minimal cost to reach
- * this state; the best path can never
- * go through the higher cost paths as
- * total path cost is cumulative
- */
- if (bit_cost < cost[n][bit_state]) {
- cost[n][bit_state] = bit_cost;
- bits[n][bit_state] = (bits[p][state] << 1) | (state & 1);
- }
- }
+
+ /* Ok, of course this is tricky, it's optimized.
+ *
+ * First, it's important to realize that we have 8
+ * states representing the combinations of the three
+ * most recent bits from the encoder. Flipping any
+ * of these three bits flips both output bits.
+ *
+ * 'state<<1' represents the target state for a new
+ * bit value of 0. '(state<<1)+1' represents the
+ * target state for a new bit value of 1.
+ *
+ * 'state' is the previous state with an oldest bit
+ * value of 0. 'state + 4' is the previous state with
+ * an oldest bit value of 1. These two states will
+ * either lead to 'state<<1' or '(state<<1)+1', depending
+ * on whether the next encoded bit was a zero or a one.
+ *
+ * m0 and m1 are the cost of coming to 'state<<1' from
+ * one of the two possible previous states 'state' and
+ * 'state + 4'.
+ *
+ * Because we know the expected values of each
+ * received bit are flipped between these two previous
+ * states:
+ *
+ * bitcost(state+4) = 510 - bitcost(state)
+ *
+ * With those two total costs in hand, we then pick
+ * the lower as the cost of the 'state<<1', and compute
+ * the path of bits leading to that state.
+ *
+ * Then, do the same for '(state<<1) + 1'. This time,
+ * instead of computing the m0 and m1 values from
+ * scratch, because the only difference is that we're
+ * expecting a one bit instead of a zero bit, we just
+ * flip the bitcost values around to match the
+ * expected transmitted bits with some tricky
+ * arithmetic which is equivalent to:
+ *
+ * m0 = cost[p][state] + (510 - bitcost);
+ * m1 = cost[p][state+4] + bitcost
+ *
+ * Then, the lowest cost and bit trace of the new state
+ * is saved.
+ */
+
+#define DO_STATE(state) { \
+ uint32_t bitcost; \
+ \
+ uint32_t m0; \
+ uint32_t m1; \
+ uint32_t bit; \
+ \
+ bitcost = ((uint32_t) (s0 ^ ao_fec_decode_table[(state<<1)]) + \
+ (uint32_t) (s1 ^ ao_fec_decode_table[(state<<1)|1])); \
+ \
+ m0 = cost[p][state] + bitcost; \
+ m1 = cost[p][state+4] + (510 - bitcost); \
+ bit = m0 > m1; \
+ cost[n][state<<1] = bit ? m1 : m0; \
+ bits[n][state<<1] = (bits[p][state + (bit<<2)] << 1) | (state&1); \
+ \
+ m0 -= (bitcost+bitcost-510); \
+ m1 += (bitcost+bitcost-510); \
+ bit = m0 > m1; \
+ cost[n][(state<<1)+1] = bit ? m1 : m0; \
+ bits[n][(state<<1)+1] = (bits[p][state + (bit<<2)] << 1) | (state&1); \
}
+ DO_STATE(0); /* next states 0 and 1, from previous states 0 and 4 */
+ DO_STATE(1); /* next states 2 and 3, from previous states 1 and 5 */
+ DO_STATE(2); /* next states 4 and 5, from previous states 2 and 6 */
+ DO_STATE(3); /* next states 6 and 7, from previous states 3 and 7 */
+
#if 0
- printf ("bit %3d symbol %2x %2x:", i/2, s.a, s.b);
+ printf ("bit %3d symbol %2x %2x:", i/2, s0, s1);
for (state = 0; state < NUM_STATE; state++) {
- printf (" %5d(%04x)", cost[n][state], bits[n][state]);
+ printf (" %8u(%08x)", cost[n][state], bits[n][state]);
}
printf ("\n");
#endif
* it will be seven.
*/
int8_t dist = b - (o + 8); /* distance to last ready-for-writing bit */
- uint16_t min_cost; /* lowest cost */
+ uint32_t min_cost; /* lowest cost */
uint8_t min_state; /* lowest cost state */
+ uint8_t byte;
/* Find the best fit at the current point
* of the decode.
printf ("\tbit %3d min_cost %5d old bit %3d old_state %x bits %02x whiten %0x\n",
i/2, min_cost, o + 8, min_state, (bits[p][min_state] >> dist) & 0xff, *whiten);
#endif
- if (out_len) {
- *out++ = (bits[p][min_state] >> dist) ^ *whiten++;
- --out_len;
+ byte = (bits[p][min_state] >> dist) ^ *whiten++;
+ *out++ = byte;
+ if (out_len > 2)
+ crc = ao_fec_crc_byte(byte, crc);
+
+ if (!--out_len) {
+ if ((out[-2] == (uint8_t) (crc >> 8)) &&
+ out[-1] == (uint8_t) crc)
+ out[-1] = AO_FEC_DECODE_CRC_OK;
+ else
+ out[-1] = 0;
+ out[-2] = 0;
+ goto done;
}
o += 8;
}
}
- return len/16;
+done:
+#if AO_PROFILE
+ ao_fec_decode_start = start_tick;
+ ao_fec_decode_end = ao_profile_tick();
+#endif
+ return 1;
}