// Packed multiply-accumulate loop: walks w[] forward and x[] backward one
// pair of 32-bit words at a time, accumulating cross products into yhat64
// and "squared" terms of the x words into xtdl, then scales the 64-bit sum
// down by 32768 into yhat.
// NOTE(review): w[] and x[] are presumably arrays of 16-bit samples read two
// at a time through uint32_t pointers; their declarations are not visible
// here. If so, these casts depend on the target tolerating the access
// (alignment, and strict aliasing per the C standard) -- confirm.
uint32_t *ww = (uint32_t *)(w);
uint32_t *wend = (uint32_t *)(w + gg); // loop stops once ww reaches w + gg
uint32_t *xx = (uint32_t *)&(x[128 + h - 4]); // first read starts 4 elements before x[128 + h]
int64_t yhat64 = 0; // 64-bit accumulator for the w/x cross products
// do/while: the body always runs at least once, even if gg is 0.
do {
// TODO: avoid slow unaligned access when h is odd
uint32_t w1 = *ww++; // bring in two words from w[] (4 values if elements are 16-bit)
uint32_t w2 = *ww++;
uint32_t x1 = *xx++; // bring in two words from x[] (4 values if elements are 16-bit)
uint32_t x2 = *xx++;
// xx was advanced by 2 words above, so this leaves it 6 words below where
// this iteration started.
// NOTE(review): a net step of -2 words (i.e. "xx -= 4") would make
// consecutive iterations read adjacent words of x[]; verify that a step of
// -6 words is really intended.
xx -= 8;
// Pairing implied by the operand order of the two yhat64 calls below,
// assuming "t"/"b" in the helper names mean the top/bottom 16 bits of a
// word (helpers are defined elsewhere -- confirm):
//   w halves:       w1b, w1t, w2b, w2t
//   paired x halves: x2t, x2b, x1t, x1b
yhat64 = multiply_accumulate_16tx16b_add_16bx16t(yhat64, x2, w1);
yhat64 = multiply_accumulate_16tx16b_add_16bx16t(yhat64, w2, x1);
// Each x word is combined with itself and accumulated into xtdl (declared
// outside this fragment); presumably a running sum of squares of the x
// samples -- confirm against the helper definition.
xtdl = multiply_accumulate_16tx16t_add_16bx16b(xtdl, x1, x1);
xtdl = multiply_accumulate_16tx16t_add_16bx16b(xtdl, x2, x2);
} while (ww < wend);
// Scale the accumulated sum down by 2^15. Signed integer division truncates
// toward zero, unlike an arithmetic right shift for negative sums.
// NOTE(review): presumably undoing a Q15 fixed-point scale -- confirm.
yhat = yhat64 / 32768;