2 For 1 CMSIS FFT

Status
Not open for further replies.

duff

Well-known member
By using a N point complex FFT you can calculate two real valued FFT's. There is a little overhead in setting up the input buffer to the complex FFT and recovering the 2 outputs. I'm sure the combine and recover stages can be optimized but I'll leave that for later, the performance increase is fairly large considering. The sketch below shows how I did it. I used the "complex q15" data type in this example.
Code:
#include "arm_math.h"
#include <complex.h>
#define FFT_SIZE 1024


arm_cfft_radix4_instance_q15 fft_inst;
// q15 complex data type
typedef q15_t __complex__ cplx;


// buffers for the FFT's
cplx Standard_FFT_Buffer1[FFT_SIZE];
cplx Standard_FFT_Buffer2[FFT_SIZE];
cplx Combined_FFT_Buffer[FFT_SIZE];
cplx Two_For_One_FFT_Buffer1[FFT_SIZE];
cplx Two_For_One_FFT_Buffer2[FFT_SIZE];


void setup() {
  while(!Serial);
  delay(1000);
  Serial.println("Two for one Real FFT test");
  // standard FFT setup
  arm_cfft_radix4_init_q15(&fft_inst, FFT_SIZE, 0, 1);
  // -------------------------------------------------------------------------
  // fill standard FFT buffers, these are later interleaved into the "Combined_FFT_Buffer"
  for (int i = 0; i < FFT_SIZE; i++) {
    __real__ Standard_FFT_Buffer1[i] = random(-0x7FFF, 0x7FFF);
    __real__ Standard_FFT_Buffer2[i] = random(-0x7FFF, 0x7FFF);
    __imag__ Standard_FFT_Buffer1[i] = 0;
    __imag__ Standard_FFT_Buffer2[i] = 0;
  }
  elapsedMicros time = 0;
  // -------------------------------------------------------------------------
  // interleave the two buffers into real and imaginary parts of the "Combined_FFT_Buffer"
  // Standard_FFT_Buffer1 -> real
  // Standard_FFT_Buffer2 -> imag
  combine_buffers(Standard_FFT_Buffer1, Standard_FFT_Buffer2, Combined_FFT_Buffer, FFT_SIZE);
  // -------------------------------------------------------------------------
  // perform 2 for 1 FFT
  arm_cfft_radix4_q15(&fft_inst, (q15_t*)Combined_FFT_Buffer);
  // -------------------------------------------------------------------------
  // extracts the two Real val FFT's from the one complex FFT output
  extract_buffers(Combined_FFT_Buffer, Two_For_One_FFT_Buffer1, Two_For_One_FFT_Buffer2, FFT_SIZE);
  uint32_t t1 = time;
  
  time = 0;
  // -------------------------------------------------------------------------
  arm_cfft_radix4_q15(&fft_inst, (q15_t*)Standard_FFT_Buffer1);
  // -------------------------------------------------------------------------
  arm_cfft_radix4_q15(&fft_inst, (q15_t*)Standard_FFT_Buffer2);
  // -------------------------------------------------------------------------
  uint32_t t2 = time;
  Serial.println();
  Serial.printf("2 For One FFT Time:\t%i\n", t1);
  Serial.printf("Two Standard FFTs Time:\t%i\n\n", t2);
  Serial.println("Test to see if the 2 for 1 FFT gives the same output as two standard Real FFTs");
  Serial.println();
  Serial.println("Standard_FFT_Buffer1 and Two_For_One_FFT_Buffer1");
  Serial.println();
  Serial.println("Standard_FFT_|_2_For_1_FFT__||_Standard_FFT_|_2_For_1_FFT_");
  for (int i = 0; i < FFT_SIZE; i++) {
    Serial.printf("real: %6i | ", __real__ Standard_FFT_Buffer1[i]);
    Serial.printf("real: %6i || ", __real__ Two_For_One_FFT_Buffer1[i]);


    Serial.printf("imag: %6i | ", __imag__ Standard_FFT_Buffer1[i]);
    Serial.printf("imag: %6i\n", __imag__ Two_For_One_FFT_Buffer1[i]);
  }
  Serial.println("-----------------------------------------------------------");
  // -------------------------------------------------------------------------
  Serial.println();
  Serial.println("Standard_FFT_Buffer2 and Two_For_One_FFT_Buffer2");
  Serial.println();
  Serial.println("Standard_FFT_|_2_For_1_FFT__||_Standard_FFT_|_2_For_1_FFT_");
  for (int i = 0; i < FFT_SIZE; i++) {
    Serial.printf("real: %6i | ", __real__ Standard_FFT_Buffer2[i]);
    Serial.printf("real: %6i || ", __real__ Two_For_One_FFT_Buffer2[i]);


    Serial.printf("imag: %6i | ", __imag__ Standard_FFT_Buffer2[i]);
    Serial.printf("imag: %6i\n", __imag__ Two_For_One_FFT_Buffer2[i]);
  }
  Serial.println("-----------------------------------------------------------");
}


void loop() {


}


//----------------------------------------------------------------
void combine_buffers(cplx *buf1, cplx *buf2, cplx *out_buf, uint16_t fft_size) {
  for (int i = 0; i < fft_size; i++) {
    __real__ out_buf[i] = __real__ buf1[i];
    __imag__ out_buf[i] = __real__ buf2[i];
  }
}
//----------------------------------------------------------------
void extract_buffers(cplx *fft_buf, cplx *buf1, cplx *buf2, uint16_t fft_size) {
  __real__ buf1[0] = __real__ fft_buf[0];
  __imag__ buf1[0] = 0;


  __real__ buf2[0] = __imag__ fft_buf[0];
  __imag__ buf2[0] = 0;


  for (int i = 1; i < fft_size / 2; i++) {
    q15_t x1r = (__real__ fft_buf[i] + __real__ fft_buf[fft_size - i]) >> 1;
    q15_t x1i = (__imag__ fft_buf[i] - __imag__ fft_buf[fft_size - i]) >> 1;


    q15_t x2r = (__imag__ fft_buf[i] + __imag__ fft_buf[fft_size - i]) >> 1;
    q15_t x2i = -(__real__ fft_buf[i] - __real__ fft_buf[fft_size - i]) >> 1;


    __real__ buf1[i] = x1r;
    __imag__ buf1[i] = x1i;
    __real__ buf1[fft_size - i] = x1r;
    __imag__ buf1[fft_size - i] = -x1i;


    __real__ buf2[i] = x2r;
    __imag__ buf2[i] = x2i;
    __real__ buf2[fft_size - i] = x2r;
    __imag__ buf2[fft_size - i] = -x2i;
  }
  __real__ buf1[fft_size / 2] = (__real__ fft_buf[fft_size / 2]);
  __imag__ buf1[fft_size / 2] = 0;


  __real__ buf2[FFT_SIZE / 2] = __imag__ fft_buf[fft_size / 2];
  __imag__ buf2[FFT_SIZE / 2] = 0;
}
You will see that for q15 data type there is some round off error that needs to worked out but the 2 for 1 FFT outputs should be theoretically exact.
 
Last edited:
For completeness:
there is a real to complex CMSIS FFT that take a 2N long real sequence and form a N-size complex spectrum

and here is my own version of Duff's dual-channel FFT.
As my application was limited to 256 point FFT, the code is hard coded to this. However, it may serve as example

Code:
// cfft instance
arm_cfft_instance_q15 S_cfft_q15_len256 = 
{	256, 
	twiddleCoef_256_q15, 
	armBitRevIndexTable_fixed_256, 
	ARMBITREVINDEXTABLE_FIXED__256_TABLE_LENGTH
};

// some useful inline functions 
__attribute__( ( always_inline ) ) static_inline uint32_t __ROR(uint32_t op1, uint32_t op2)
{__asm volatile ("ror %0, %0, %1" : "+r" (op1) : "r" (op2) );
  return(op1);
}

__attribute__( ( always_inline ) ) static_inline uint32_t __SHSAX(uint32_t op1, uint32_t op2)
{ uint32_t result;
  __asm volatile ("shsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
  return(result);
}

// here comes the dual channel rfft function
FASTRUN void rfft_256(short *xx, short *yy)
{
// xx input vector length 2*n
// xx[0:2:2*n-2] first time series
// xx[1:2:2*n-1] second time series
// assume n=4^m  or m=log2(n)/log2(4)
//
// yy[0:4:2*n-2] first spectrum (real,imag interleaved) {yy[0], yy[1], yy[4], yy[5] ...}
// yy[2:4:2*n-1] second spectrum (real, imag interleaved) {yy[2], yy[3], yy[6], yy[7] ...}

// size(xx) = 2*nn shorts = 4*nn bytes
// size(yy) = 2*nn shorts = 4*nn bytes

	// separate two spectra
	// Fn = (Hn + conj(HN-n)/2
	// Gn = -i(Hn - conj(HN-n))/2
	//top
	// u1r(ii) = (yr(ii)+yr(jj))/2
	// u1i(ii) = (yi(ii)-yi(jj))/2
	//bot
	// u2r(ii) = (yi(ii)+yi(jj))/2
	// u2i(ii) = -(yr(ii)-yr(jj))/2
	// or
	// u2r(ii) = (yi(ii)+yi(jj))/2
	// u2i(ii) = (yr(jj)-yr(ii))/2
	// 
	//in1 = ((uint32_t) in1 >> 16) | ((uint32_t) in1 << 16);

	int nn;
	nn=256; //have a 256 point complex FFT
	arm_cfft_q15(&S_cfft_q15_len256, xx,0,1);
	
	
	yy[0]=xx[0];
	yy[2]=xx[1];

	int *rna=(int*) &xx[2];
	int *rma=(int*) &xx[2*nn-2];
	int *rr1 = (int*)&yy[4];
	int *rr2 = (int*)&yy[4+2];

	for(int ii=1; ii<nn/2;ii+=4)
	{ 	int rn,rm,rm1;
		//
		rn = *rna;
		rm = *rma;
		rm1=__ROR(rm,16);
		*rr1=__SHSAX(rn,rm1);
		*rr2=__SHSAX(rm1,rn);
		rna++;
		rma--;
		rr1+=2;
		rr2+=2;
		//
		rn = *rna;
		rm = *rma;
		rm1=__ROR(rm,16);
		*rr1=__SHSAX(rn,rm1);
		*rr2=__SHSAX(rm1,rn);
		rna++;
		rma--;
		rr1+=2;
		rr2+=2;
		//
		rn = *rna;
		rm = *rma;
		rm1=__ROR(rm,16);
		*rr1=__SHSAX(rn,rm1);
		*rr2=__SHSAX(rm1,rn);
		rna++;
		rma--;
		rr1+=2;
		rr2+=2;
		//
		rn = *rna;
		rm = *rma;
		rm1=__ROR(rm,16);
		*rr1=__SHSAX(rn,rm1);
		*rr2=__SHSAX(rm1,rn);
		rna++;
		rma--;
		rr1+=2;
		rr2+=2;
	}
	// yy1 is n words long (n/2 complex spectra)
	// yy2 is n words long (n/2 complex spectra)
}

One should note that that to generate acceptable results, the two channels should have similar energy distribution.
 
I tried to get yours to compile but to many errors so I gave up. ( i did add the setup and loop functions and included the arm cmsis library).

I'm also working on another version where you can do two partial FFT's and combine them to get a longer FFT. So for example you can do two 16 pt FFT's and get 32 pt FFT, this would add support for sizes not in the cmsis library.
 
I tried to get yours to compile but to many errors so I gave up. ( i did add the setup and loop functions and included the arm cmsis library).
Sorry, It was a cut of an non-Arduino environment, but I will try to make an Arduino version.
 
Sorry, It was a cut of an non-Arduino environment, but I will try to make an Arduino version.

Done,
I added test_rfft to my 'mini CMSIS' library https://github.com/WMXZ-EU/wmxzDSP

There are some issues compiling CMSIS on top of TD. So I had to do some tricks to overwrite Paul's core library.
This may be because my CMSIS is based on CMSIS_5 and therefore there are some incompatibilities.
I compiled and tested it on T3.6 , A1.82, TD1.36b4
 
Status
Not open for further replies.
Back
Top