willie.from.texas
Well-known member
I haven't seen any discussion on this forum on using CMSIS Version 5.3. I wanted to try utilizing the floating point FFTs in a signal filtering application. I'm familiar with the work that has been discussed on the Software Defined Radio and the Guitar Cabinet Simulation blogs using CMSIS Version 4, along with some of the reservations in integrating into the Teensy libraries due to overhead concerns. I was successful in integrating that library into my environment and was able to design a simple overlap-and-save filtering algorithm using a 200 point impulse response filter. I tried it out using 512 and 1024 point transforms. I was impressed with the floating point performance. For a forward FFT (using arm_rfft_fast_f32), complex multiply and inverse FFT using 512 points it took about 500 microseconds. For a 1024 point FFT it took approximately 900 microseconds. With regard to the program storage space for my demo program, here is the result of the compilation:
Sketch uses 106892 bytes (10%) of program storage space. Maximum is 1048576 bytes.
Global variables use 83612 bytes (31%) of dynamic memory, leaving 178532 bytes for local variables. Maximum is 262144 bytes.
I'm currently working on a realtime version of this filter and have the sampling rate running up to 3 microseconds/sample. The data are digitized using the onboard ADC, filtered, with the filtered results being sent through the DAC. The phase delay is running about 7.8 milliseconds using a 512 point FFT filter.
Here is the code I used to evaluate the performance described above.
/*
Filter a signal containing two frequencies using the overlap-save method.
Uses CMSIS Version 5.3.0 f32 routines on a Teensy 3.6
*/
// Common libraries:
#include <arm_math.h>
#include <arm_const_structs.h>
//#include <ref.h>
#define bufSize 8192
#define numpts 512
#define impulse_length_minus_1 199
static arm_status status;
static arm_rfft_fast_instance_f32 fft_instance;
boolean out_pin; // Use for timing
float32_t input_waveform[bufSize];
float32_t output_waveform[bufSize];
float32_t fft_aray[numpts]; // Array used to store fft result
float32_t window[numpts]; // Array used to store fft of filter data
float32_t cmplx_product[numpts]; // Array used to store complex product of fft result and window
float32_t Ak[numpts]; // Used to store data being sent to the fft routines
float32_t fft_result[numpts]; // Output from inverse fft
float32_t filter[numpts]; // Will contain the impulse that is terminated with zero padding
float32_t impulse[impulse_length_minus_1 + 1] = { // Impulse response for low-pass filter beginning at frequency bin 20 out of 128. Stop band > -100 dB
9.4978700E-08, -1.0099240E-06, -3.9555098E-06, -8.1674212E-06, -1.2233856E-05, -1.4147944E-05, -1.1836013E-05, -3.8747080E-06,
9.7844092E-06, 2.7311901E-05, 4.4937360E-05, 5.7465704E-05, 5.9374463E-05, 4.6323969E-05, 1.6752608E-05, -2.6873229E-05,
-7.7589710E-05, -1.2456752E-04, -1.5493964E-04, -1.5661931E-04, -1.2159463E-04, -4.8950325E-05, 5.3220769E-05, 1.6774265E-04,
2.7052492E-04, 3.3489880E-04, 3.3748708E-04, 2.6447337E-04, 1.1682254E-04, -8.7008106E-05, -3.1198579E-04, -5.1149393E-04,
-6.3594928E-04, -6.4365675E-04, -5.1175053E-04, -2.4469101E-04, 1.2205630E-04, 5.2479704E-04, 8.8124334E-04, 1.1058973E-03,
1.1285844E-03, 9.1242952E-04, 4.6714264E-04, -1.4606171E-04, -8.2023298E-04, -1.4193952E-03, -1.8040441E-03, -1.8610211E-03,
-1.5317851E-03, -8.3265360E-04, 1.3849318E-04, 1.2123019E-03, 2.1748259E-03, 2.8075305E-03, 2.9334421E-03, 2.4602629E-03,
1.4107964E-03, -6.7401987E-05, -1.7177495E-03, -3.2154444E-03, -4.2276791E-03, -4.4827877E-03, -3.8347517E-03, -2.3089211E-03,
-1.1716140E-04, 2.3642611E-03, 4.6536868E-03, 6.2518534E-03, 6.7434843E-03, 5.8931952E-03, 3.7152958E-03, 5.0023185E-04,
-3.2125723E-03, -6.7158793E-03, -9.2593249E-03, -1.0197256E-02, -9.1325789E-03, -6.0274401E-03, -1.2551129E-03, 4.4236323E-03,
9.9633829E-03, 1.4204956E-02, 1.6088985E-02, 1.4876395E-02, 1.0335310E-02, 2.8564602E-03, -6.5310283E-03, -1.6255904E-02,
-2.4406433E-02, -2.9011491E-02, -2.8359337E-02, -2.1302867E-02, -7.4992025E-03, 1.2460427E-02, 3.7058422E-02, 6.4002438E-02,
9.0503604E-02, 1.1364256E-01, 1.3076897E-01, 1.3987339E-01, 1.3987339E-01, 1.3076897E-01, 1.1364256E-01, 9.0503604E-02,
6.4002438E-02, 3.7058422E-02, 1.2460427E-02, -7.4992025E-03, -2.1302867E-02, -2.8359337E-02, -2.9011491E-02, -2.4406433E-02,
-1.6255904E-02, -6.5310283E-03, 2.8564602E-03, 1.0335310E-02, 1.4876395E-02, 1.6088985E-02, 1.4204956E-02, 9.9633829E-03,
4.4236323E-03, -1.2551129E-03, -6.0274401E-03, -9.1325789E-03, -1.0197256E-02, -9.2593249E-03, -6.7158793E-03, -3.2125723E-03,
5.0023185E-04, 3.7152958E-03, 5.8931952E-03, 6.7434843E-03, 6.2518534E-03, 4.6536868E-03, 2.3642611E-03, -1.1716140E-04,
-2.3089211E-03, -3.8347517E-03, -4.4827877E-03, -4.2276791E-03, -3.2154444E-03, -1.7177495E-03, -6.7401987E-05, 1.4107964E-03,
2.4602629E-03, 2.9334421E-03, 2.8075305E-03, 2.1748259E-03, 1.2123019E-03, 1.3849318E-04, -8.3265360E-04, -1.5317851E-03,
-1.8610211E-03, -1.8040441E-03, -1.4193952E-03, -8.2023298E-04, -1.4606171E-04, 4.6714264E-04, 9.1242952E-04, 1.1285844E-03,
1.1058973E-03, 8.8124334E-04, 5.2479704E-04, 1.2205630E-04, -2.4469101E-04, -5.1175053E-04, -6.4365675E-04, -6.3594928E-04,
-5.1149393E-04, -3.1198579E-04, -8.7008106E-05, 1.1682254E-04, 2.6447337E-04, 3.3748708E-04, 3.3489880E-04, 2.7052492E-04,
1.6774265E-04, 5.3220769E-05, -4.8950325E-05, -1.2159463E-04, -1.5661931E-04, -1.5493964E-04, -1.2456752E-04, -7.7589710E-05,
-2.6873229E-05, 1.6752608E-05, 4.6323969E-05, 5.9374463E-05, 5.7465704E-05, 4.4937360E-05, 2.7311901E-05, 9.7844092E-06,
-3.8747080E-06, -1.1836013E-05, -1.4147944E-05, -1.2233856E-05, -8.1674212E-06, -3.9555098E-06, -1.0099240E-06, 9.4978700E-08
};
float pi = 3.1415926535897932f;
float freq1; // Want to keep freq1
float freq2; // Want to filter out freq2
void setup()
{
Serial.begin (230400);
// Set up output ports for timing purposes
PORTC_PCR4 = ( unsigned long ) 0x00000101; // Section 12.5.1, pg. 221 - 222, see Pin Mux Control
// Bit 0 -> Pull Select -> Internal pullup resistor is enabled
// Bit 8 -> ALT1 -> GPIO
PORTC_PCR5 = ( unsigned long ) 0x00000101; // Section 12.5.1, pg. 221 - 222, see Pin Mux Control
// Bit 0 -> Pull Select -> Internal pullup resistor is enabled
// Bit 8 -> ALT1 -> GPIO
GPIOC_PDDR = ( unsigned long ) 0x00000020; // Configure pin 13 as output. See Section 63.3.6. Pin 13 is PTC5 (on-board LED)
// Build waveform for processing
freq1 = 5. * (float)numpts/(float)(impulse_length_minus_1 + 1); //Want to keep freq1
freq2 = 4. * freq1; //Want to filter out freq2. For a 256 point FFT the 3 dB cutoff frequency is at about frequency bin #19
for (int i=0; i < (bufSize); i ++) // Calculate signal to filter over bufSize points
{
input_waveform = sin(2. * pi * freq1 * (float)i/(float)numpts) + sin(2. * pi * freq2 * (float)i/(float)numpts);
}
// Build impulse filter array. Need to zero pad from the end of the impulse to numpts (end of the array).
arm_fill_f32(0.0, filter, numpts);
for (int i = 0; i < impulse_length_minus_1 + 1; i++) filter = impulse;
status = arm_rfft_fast_init_f32(&fft_instance, numpts); // Initialize the fft routines
arm_rfft_fast_f32(&fft_instance, filter, window, 0); // Calculate the spectral response of the filter.
GPIOC_PSOR |= (1 << 5); // Section 63.3.2 - Set pin 13 HIGH
out_pin = true;
delay(5000); // Give plenty of time to trigger scope and to select either the Serial Monitor or Serial Plotter
}
void loop()
{
arm_fill_f32(0.0, output_waveform, bufSize);
arm_fill_f32(0.0, input_waveform, impulse_length_minus_1); //Need to zero pad the beginning of the signal so the circular convolution matches a linear convolution.
// This loop for processing input_waveform data completes in about 500 microseconds per pass for numpts = 512
// 900 microseconds per pass for numpts = 1024
for (int i = 0; i < bufSize/(numpts - impulse_length_minus_1); i++) { // Use for overlap to avoid circular convolution
if (out_pin) GPIOC_PCOR |= (1 << 5); // Section 63.3.1 - Set pin 13 LOW -- Use for scope timing
else
GPIOC_PSOR |= (1 << 5); // Section 63.3.2 - Set pin 13 HIGH
out_pin = !out_pin;
arm_copy_f32(input_waveform + i * (numpts - impulse_length_minus_1), Ak, numpts);
arm_rfft_fast_f32(&fft_instance, Ak, fft_aray, 0);
arm_cmplx_mult_cmplx_f32(fft_aray, window, cmplx_product, numpts/2); // Multiplication in frequency domain is a convolution in the time domain
arm_rfft_fast_f32(&fft_instance, cmplx_product, fft_result, 1); // Inverse transform
arm_copy_f32(fft_result + impulse_length_minus_1, output_waveform + i * (numpts - impulse_length_minus_1), numpts - impulse_length_minus_1);
}
for (int i = 0; i < 1000; i++) Serial.println(output_waveform); //Use to drive the serial plotter to display the second set of 500 points
while(1) {}
}
Sketch uses 106892 bytes (10%) of program storage space. Maximum is 1048576 bytes.
Global variables use 83612 bytes (31%) of dynamic memory, leaving 178532 bytes for local variables. Maximum is 262144 bytes.
I'm currently working on a realtime version of this filter and have the sampling rate running up to 3 microseconds/sample. The data are digitized using the onboard ADC, filtered, with the filtered results being sent through the DAC. The phase delay is running about 7.8 milliseconds using a 512 point FFT filter.
Here is the code I used to evaluate the performance described above.
/*
Filter a signal containing two frequencies using the overlap-save method.
Uses CMSIS Version 5.3.0 f32 routines on a Teensy 3.6
*/
// Common libraries:
#include <arm_math.h>
#include <arm_const_structs.h>
//#include <ref.h>
#define bufSize 8192
#define numpts 512
#define impulse_length_minus_1 199
static arm_status status;
static arm_rfft_fast_instance_f32 fft_instance;
boolean out_pin; // Use for timing
float32_t input_waveform[bufSize];
float32_t output_waveform[bufSize];
float32_t fft_aray[numpts]; // Array used to store fft result
float32_t window[numpts]; // Array used to store fft of filter data
float32_t cmplx_product[numpts]; // Array used to store complex product of fft result and window
float32_t Ak[numpts]; // Used to store data being sent to the fft routines
float32_t fft_result[numpts]; // Output from inverse fft
float32_t filter[numpts]; // Will contain the impulse that is terminated with zero padding
float32_t impulse[impulse_length_minus_1 + 1] = { // Impulse response for low-pass filter beginning at frequency bin 20 out of 128. Stop band > -100 dB
9.4978700E-08, -1.0099240E-06, -3.9555098E-06, -8.1674212E-06, -1.2233856E-05, -1.4147944E-05, -1.1836013E-05, -3.8747080E-06,
9.7844092E-06, 2.7311901E-05, 4.4937360E-05, 5.7465704E-05, 5.9374463E-05, 4.6323969E-05, 1.6752608E-05, -2.6873229E-05,
-7.7589710E-05, -1.2456752E-04, -1.5493964E-04, -1.5661931E-04, -1.2159463E-04, -4.8950325E-05, 5.3220769E-05, 1.6774265E-04,
2.7052492E-04, 3.3489880E-04, 3.3748708E-04, 2.6447337E-04, 1.1682254E-04, -8.7008106E-05, -3.1198579E-04, -5.1149393E-04,
-6.3594928E-04, -6.4365675E-04, -5.1175053E-04, -2.4469101E-04, 1.2205630E-04, 5.2479704E-04, 8.8124334E-04, 1.1058973E-03,
1.1285844E-03, 9.1242952E-04, 4.6714264E-04, -1.4606171E-04, -8.2023298E-04, -1.4193952E-03, -1.8040441E-03, -1.8610211E-03,
-1.5317851E-03, -8.3265360E-04, 1.3849318E-04, 1.2123019E-03, 2.1748259E-03, 2.8075305E-03, 2.9334421E-03, 2.4602629E-03,
1.4107964E-03, -6.7401987E-05, -1.7177495E-03, -3.2154444E-03, -4.2276791E-03, -4.4827877E-03, -3.8347517E-03, -2.3089211E-03,
-1.1716140E-04, 2.3642611E-03, 4.6536868E-03, 6.2518534E-03, 6.7434843E-03, 5.8931952E-03, 3.7152958E-03, 5.0023185E-04,
-3.2125723E-03, -6.7158793E-03, -9.2593249E-03, -1.0197256E-02, -9.1325789E-03, -6.0274401E-03, -1.2551129E-03, 4.4236323E-03,
9.9633829E-03, 1.4204956E-02, 1.6088985E-02, 1.4876395E-02, 1.0335310E-02, 2.8564602E-03, -6.5310283E-03, -1.6255904E-02,
-2.4406433E-02, -2.9011491E-02, -2.8359337E-02, -2.1302867E-02, -7.4992025E-03, 1.2460427E-02, 3.7058422E-02, 6.4002438E-02,
9.0503604E-02, 1.1364256E-01, 1.3076897E-01, 1.3987339E-01, 1.3987339E-01, 1.3076897E-01, 1.1364256E-01, 9.0503604E-02,
6.4002438E-02, 3.7058422E-02, 1.2460427E-02, -7.4992025E-03, -2.1302867E-02, -2.8359337E-02, -2.9011491E-02, -2.4406433E-02,
-1.6255904E-02, -6.5310283E-03, 2.8564602E-03, 1.0335310E-02, 1.4876395E-02, 1.6088985E-02, 1.4204956E-02, 9.9633829E-03,
4.4236323E-03, -1.2551129E-03, -6.0274401E-03, -9.1325789E-03, -1.0197256E-02, -9.2593249E-03, -6.7158793E-03, -3.2125723E-03,
5.0023185E-04, 3.7152958E-03, 5.8931952E-03, 6.7434843E-03, 6.2518534E-03, 4.6536868E-03, 2.3642611E-03, -1.1716140E-04,
-2.3089211E-03, -3.8347517E-03, -4.4827877E-03, -4.2276791E-03, -3.2154444E-03, -1.7177495E-03, -6.7401987E-05, 1.4107964E-03,
2.4602629E-03, 2.9334421E-03, 2.8075305E-03, 2.1748259E-03, 1.2123019E-03, 1.3849318E-04, -8.3265360E-04, -1.5317851E-03,
-1.8610211E-03, -1.8040441E-03, -1.4193952E-03, -8.2023298E-04, -1.4606171E-04, 4.6714264E-04, 9.1242952E-04, 1.1285844E-03,
1.1058973E-03, 8.8124334E-04, 5.2479704E-04, 1.2205630E-04, -2.4469101E-04, -5.1175053E-04, -6.4365675E-04, -6.3594928E-04,
-5.1149393E-04, -3.1198579E-04, -8.7008106E-05, 1.1682254E-04, 2.6447337E-04, 3.3748708E-04, 3.3489880E-04, 2.7052492E-04,
1.6774265E-04, 5.3220769E-05, -4.8950325E-05, -1.2159463E-04, -1.5661931E-04, -1.5493964E-04, -1.2456752E-04, -7.7589710E-05,
-2.6873229E-05, 1.6752608E-05, 4.6323969E-05, 5.9374463E-05, 5.7465704E-05, 4.4937360E-05, 2.7311901E-05, 9.7844092E-06,
-3.8747080E-06, -1.1836013E-05, -1.4147944E-05, -1.2233856E-05, -8.1674212E-06, -3.9555098E-06, -1.0099240E-06, 9.4978700E-08
};
float pi = 3.1415926535897932f;
float freq1; // Want to keep freq1
float freq2; // Want to filter out freq2
void setup()
{
Serial.begin (230400);
// Set up output ports for timing purposes
PORTC_PCR4 = ( unsigned long ) 0x00000101; // Section 12.5.1, pg. 221 - 222, see Pin Mux Control
// Bit 0 -> Pull Select -> Internal pullup resistor is enabled
// Bit 8 -> ALT1 -> GPIO
PORTC_PCR5 = ( unsigned long ) 0x00000101; // Section 12.5.1, pg. 221 - 222, see Pin Mux Control
// Bit 0 -> Pull Select -> Internal pullup resistor is enabled
// Bit 8 -> ALT1 -> GPIO
GPIOC_PDDR = ( unsigned long ) 0x00000020; // Configure pin 13 as output. See Section 63.3.6. Pin 13 is PTC5 (on-board LED)
// Build waveform for processing
freq1 = 5. * (float)numpts/(float)(impulse_length_minus_1 + 1); //Want to keep freq1
freq2 = 4. * freq1; //Want to filter out freq2. For a 256 point FFT the 3 dB cutoff frequency is at about frequency bin #19
for (int i=0; i < (bufSize); i ++) // Calculate signal to filter over bufSize points
{
input_waveform = sin(2. * pi * freq1 * (float)i/(float)numpts) + sin(2. * pi * freq2 * (float)i/(float)numpts);
}
// Build impulse filter array. Need to zero pad from the end of the impulse to numpts (end of the array).
arm_fill_f32(0.0, filter, numpts);
for (int i = 0; i < impulse_length_minus_1 + 1; i++) filter = impulse;
status = arm_rfft_fast_init_f32(&fft_instance, numpts); // Initialize the fft routines
arm_rfft_fast_f32(&fft_instance, filter, window, 0); // Calculate the spectral response of the filter.
GPIOC_PSOR |= (1 << 5); // Section 63.3.2 - Set pin 13 HIGH
out_pin = true;
delay(5000); // Give plenty of time to trigger scope and to select either the Serial Monitor or Serial Plotter
}
void loop()
{
arm_fill_f32(0.0, output_waveform, bufSize);
arm_fill_f32(0.0, input_waveform, impulse_length_minus_1); //Need to zero pad the beginning of the signal so the circular convolution matches a linear convolution.
// This loop for processing input_waveform data completes in about 500 microseconds per pass for numpts = 512
// 900 microseconds per pass for numpts = 1024
for (int i = 0; i < bufSize/(numpts - impulse_length_minus_1); i++) { // Use for overlap to avoid circular convolution
if (out_pin) GPIOC_PCOR |= (1 << 5); // Section 63.3.1 - Set pin 13 LOW -- Use for scope timing
else
GPIOC_PSOR |= (1 << 5); // Section 63.3.2 - Set pin 13 HIGH
out_pin = !out_pin;
arm_copy_f32(input_waveform + i * (numpts - impulse_length_minus_1), Ak, numpts);
arm_rfft_fast_f32(&fft_instance, Ak, fft_aray, 0);
arm_cmplx_mult_cmplx_f32(fft_aray, window, cmplx_product, numpts/2); // Multiplication in frequency domain is a convolution in the time domain
arm_rfft_fast_f32(&fft_instance, cmplx_product, fft_result, 1); // Inverse transform
arm_copy_f32(fft_result + impulse_length_minus_1, output_waveform + i * (numpts - impulse_length_minus_1), numpts - impulse_length_minus_1);
}
for (int i = 0; i < 1000; i++) Serial.println(output_waveform); //Use to drive the serial plotter to display the second set of 500 points
while(1) {}
}