Originally Posted by

**kpc**
I think by unrolling loops and improving memory access, some 5000 cycles can still be gained.

I hope I caught all overflow problems due to bit growth. If anybody wants to try it, I would like to hear about any such problems.

You may remove one ROR in "untangle_256" by using SHSAX instead of SHASX.

Test code is

Code:

#include <string.h>
#include "arm_math.h"
// separate two spectra
// Fn = (Hn + conj(HN-n))/2
// Gn = -i(Hn - conj(HN-n))/2
//
// Z1 = (X + conj(Y))/2
// z1r = (xr+yr)/2
// z1i = (xi-yi)/2
//
// Z2 = -i(X - conj(Y))/2
// z2r = (xi+yi)/2
// z2i = -(xr-yr)/2 = (yr-xr)/2
// T3.1 is little-endian: [0],[1] = [r],[i] = [bottom,top] halfword
//
// Self-test for the spectrum-separation ("untangle") step.
//
// Computes Z1 and Z2 from two complex samples X and Y twice: first with plain
// 16-bit arithmetic, then with the packed halfword instructions, and prints
// both results so the two output lines can be compared by eye.
//
// Each complex value is a pair of shorts {real, imag}. Packed into a 32-bit
// word on this little-endian target, the real part sits in the bottom halfword
// and the imaginary part in the top halfword.
void setup()
{
  while (!Serial);  // wait until the serial port is ready

  short xx[2]; xx[0] = 4; xx[1] = 8;  // X = xr + i*xi
  short yy[2]; yy[0] = 2; yy[1] = 4;  // Y = yr + i*yi
  short z1[2];
  short z2[2];

  // direct method
  z1[0] = (xx[0] + yy[0]) / 2;
  z1[1] = (xx[1] - yy[1]) / 2;
  z2[0] = (xx[1] + yy[1]) / 2;
  z2[1] = (yy[0] - xx[0]) / 2;
  Serial.printf("%d %d, %d %d: %d %d, %d %d\n",
                xx[0], xx[1], yy[0], yy[1], z1[0], z1[1], z2[0], z2[1]);

  // using dsp
  // Pack and unpack with memcpy rather than int* casts. Accessing the short
  // arrays through an int pointer breaks the strict-aliasing rule, which lets
  // the optimizer assume z1/z2 were not modified and print the values from
  // the direct method again, hiding a wrong DSP result.
  uint32_t rn;
  uint32_t rm;
  memcpy(&rn, xx, sizeof rn);  // {bottom, top} = {xr, xi}
  memcpy(&rm, yy, sizeof rm);  // {bottom, top} = {yr, yi}

  const uint32_t rm1 = __ROR(rm, 16);     // swap halfwords: {yi, yr}
  const uint32_t r1 = __SHSAX(rn, rm1);   // {(xr+yr)/2, (xi-yi)/2} = Z1
  const uint32_t r2 = __SHSAX(rm1, rn);   // {(yi+xi)/2, (yr-xr)/2} = Z2

  memcpy(z1, &r1, sizeof r1);
  memcpy(z2, &r2, sizeof r2);
  Serial.printf("%d %d, %d %d: %d %d, %d %d\n",
                xx[0], xx[1], yy[0], yy[1], z1[0], z1[1], z2[0], z2[1]);
}
// Arduino main loop: intentionally empty, the whole test runs once in setup().
void loop()
{
  // nothing to do
}