Fast Convolution Filtering with Teensy 4.0 and audio board

DD4WH · Aug 17, 2019

To test T4 performance I extracted the fast convolution filtering code from my SDR and tested it with the Teensy 4.0 and the Teensy audio board. Both were connected with flying wires (MCLK with 100ohms series resistor).

Code:

/*
 * just a small sketch to play around with audio processing with the Teensy 4
 * 
 * FAST CONVOLUTION FILTERING in Stereo
 * 
 * uses Teensy 4.0 and the Teensy audio board connected with flying wires
 * 
 * Frank DD4WH 2019-08-17
 * 
 * in the public domain GNU GPL
 */

#include <Audio.h>
#include <Wire.h>
#include <SPI.h>
#include <arm_math.h>
#include <arm_const_structs.h> // in the Teensy 4.0 audio library, the ARM CMSIS DSP lib is already a newer version

#define DEBUG
#define FOURPI  (2.0 * TWO_PI)
#define SIXPI   (3.0 * TWO_PI)
#define BUFFER_SIZE 128
int32_t sum;
int idx_t = 0;
float32_t mean;
int16_t *sp_L;
int16_t *sp_R;
uint8_t FIR_filter_window = 1;
uint8_t first_block = 1; 
//double FLoCut = 150.0;
//double FHiCut = 4500.0;
double FLoCut = 5.0;
double FHiCut = 20000.0;
double SAMPLE_RATE = (double)AUDIO_SAMPLE_RATE_EXACT;  
//const uint32_t FFT_L = 256; 
//const uint32_t FFT_L = 512; 
//const uint32_t FFT_L = 1024;
//const uint32_t FFT_L = 2048;
const uint32_t FFT_L = 4096; 
double FIR_Coef_I[(FFT_L / 2) + 1]; // 2048 * 4 = 8kb
double FIR_Coef_Q[(FFT_L / 2) + 1]; // 2048 * 4 = 8kb
#define MAX_NUMCOEF (FFT_L / 2) + 1
uint32_t m_NumTaps = (FFT_L / 2) + 1;
uint32_t FFT_length = FFT_L;
const uint32_t N_B = FFT_L / 2 / BUFFER_SIZE;
uint32_t N_BLOCKS = N_B;
float32_t float_buffer_L [BUFFER_SIZE * N_B];  // 2048 * 4 = 8kb
float32_t float_buffer_R [BUFFER_SIZE * N_B]; // 2048 * 4 = 8kb

float32_t FFT_buffer [FFT_L * 2] __attribute__ ((aligned (4)));  // 4096 * 4 = 16kb
float32_t last_sample_buffer_L [BUFFER_SIZE * N_B];  // 2048 * 4 = 8kb
float32_t last_sample_buffer_R [BUFFER_SIZE * N_B]; // 2048 * 4 = 8kb
// complex FFT with the new library CMSIS V4.5
const static arm_cfft_instance_f32 *S;
// complex iFFT with the new library CMSIS V4.5
const static arm_cfft_instance_f32 *iS;
float32_t iFFT_buffer [FFT_L * 2] __attribute__ ((aligned (4)));  // 4096 * 4 = 16kb

// FFT instance for direct calculation of the filter mask
// from the impulse response of the FIR - the coefficients
const static arm_cfft_instance_f32 *maskS;
float32_t FIR_filter_mask [FFT_L * 2] __attribute__ ((aligned (4)));  // 4096 * 4 = 16kb

// this audio comes from the codec by I2S
AudioInputI2S            i2s_in;
AudioRecordQueue         Q_in_L;
AudioRecordQueue         Q_in_R;
AudioMixer4              mixleft;
AudioMixer4              mixright;
AudioPlayQueue           Q_out_L;
AudioPlayQueue           Q_out_R;
AudioOutputI2S           i2s_out;

AudioControlSGTL5000     sgtl5000_1;

AudioConnection          patchCord1(i2s_in, 0, Q_in_L, 0);
AudioConnection          patchCord2(i2s_in, 1, Q_in_R, 0);

AudioConnection          patchCord3(Q_out_L, 0, mixleft, 0);
AudioConnection          patchCord4(Q_out_R, 0, mixright, 0);
AudioConnection          patchCord9(mixleft, 0,  i2s_out, 1);
AudioConnection          patchCord10(mixright, 0, i2s_out, 0);


void setup() {
  Serial.begin(115200);
  delay(100);

  AudioMemory(60); // must be high enough to deliver 16 stereo blocks = 32 * 128 samples at any point in time! 
  delay(100);

  /****************************************************************************************
     Audio Shield Setup
  ****************************************************************************************/
  sgtl5000_1.enable();
  sgtl5000_1.inputSelect(AUDIO_INPUT_LINEIN);
  sgtl5000_1.adcHighPassFilterDisable(); // does not help too much!
  sgtl5000_1.lineInLevel(14);
  sgtl5000_1.lineOutLevel(24);
  mixleft.gain(0, 1.0);
  mixright.gain(0, 1.0);
  sgtl5000_1.volume(0.5); 

  /****************************************************************************************
     set filter bandwidth
  ****************************************************************************************/
  // this routine does all the magic of calculating the FIR coeffs
  calc_cplx_FIR_coeffs (FIR_Coef_I, FIR_Coef_Q, m_NumTaps, FLoCut, FHiCut, SAMPLE_RATE);

  /****************************************************************************************
     init complex FFTs
  ****************************************************************************************/
  switch (FFT_length)
  {
    case 256:
      S = &arm_cfft_sR_f32_len256;
      iS = &arm_cfft_sR_f32_len256;
      maskS = &arm_cfft_sR_f32_len256;
      break;
    case 512:
      S = &arm_cfft_sR_f32_len512;
      iS = &arm_cfft_sR_f32_len512;
      maskS = &arm_cfft_sR_f32_len512;
      break;
    case 1024:
      S = &arm_cfft_sR_f32_len1024;
      iS = &arm_cfft_sR_f32_len1024;
      maskS = &arm_cfft_sR_f32_len1024;
      break;
    case 2048:
      S = &arm_cfft_sR_f32_len2048;
      iS = &arm_cfft_sR_f32_len2048;
      maskS = &arm_cfft_sR_f32_len2048;
      break;
    case 4096:
      S = &arm_cfft_sR_f32_len4096;
      iS = &arm_cfft_sR_f32_len4096;
      maskS = &arm_cfft_sR_f32_len4096;
      break;
  }

  /****************************************************************************************
     Calculate the FFT of the FIR filter coefficients once to produce the FIR filter mask
  ****************************************************************************************/
  init_filter_mask();

  /****************************************************************************************
     begin to queue the audio from the audio library
  ****************************************************************************************/
  delay(100);
  Q_in_L.begin();
  Q_in_R.begin();

} // END OF SETUP


void loop() {
  elapsedMicros usec = 0;
  // are there at least N_BLOCKS buffers in each channel available ?
    if (Q_in_L.available() > N_BLOCKS + 0 && Q_in_R.available() > N_BLOCKS + 0)
    {
      // get audio samples from the audio  buffers and convert them to float
      for (unsigned i = 0; i < N_BLOCKS; i++)
      {
        sp_L = Q_in_L.readBuffer();
        sp_R = Q_in_R.readBuffer();

        // convert to float one buffer_size
        // float_buffer samples are now standardized from > -1.0 to < 1.0
        arm_q15_to_float (sp_L, &float_buffer_L[BUFFER_SIZE * i], BUFFER_SIZE); // convert int_buffer to float 32bit
        arm_q15_to_float (sp_R, &float_buffer_R[BUFFER_SIZE * i], BUFFER_SIZE); // convert int_buffer to float 32bit
        Q_in_L.freeBuffer();
        Q_in_R.freeBuffer();
      }
 
      /**********************************************************************************
          Digital convolution
       **********************************************************************************/
      //  basis for this was Lyons, R. (2011): Understanding Digital Processing.
      //  "Fast FIR Filtering using the FFT", pages 688 - 694
      //  numbers for the steps taken from that source
      //  Method used here: overlap-and-save

      // ONLY FOR the VERY FIRST FFT: fill first samples with zeros
      if (first_block) // fill real & imaginaries with zeros for the first BLOCKSIZE samples
      {
        for (unsigned i = 0; i < BUFFER_SIZE * N_BLOCKS; i++)
        {
          FFT_buffer[i] = 0.0;
        }
        first_block = 0;
      }
      else
      {  // HERE IT STARTS for all other instances
        // fill FFT_buffer with last events audio samples
        for (unsigned i = 0; i < BUFFER_SIZE * N_BLOCKS; i++)
        {
          FFT_buffer[i * 2] = last_sample_buffer_L[i]; // real
          FFT_buffer[i * 2 + 1] = last_sample_buffer_R[i]; // imaginary
        }
      }
      // copy recent samples to last_sample_buffer for next time!
      for (unsigned i = 0; i < BUFFER_SIZE * N_BLOCKS; i++)
      {
        last_sample_buffer_L [i] = float_buffer_L[i];
        last_sample_buffer_R [i] = float_buffer_R[i];
      }

      // now fill recent audio samples into FFT_buffer (left channel: re, right channel: im)
      for (unsigned i = 0; i < BUFFER_SIZE * N_BLOCKS; i++)
      {
        FFT_buffer[FFT_length + i * 2] = float_buffer_L[i]; // real
        FFT_buffer[FFT_length + i * 2 + 1] = float_buffer_R[i]; // imaginary
      }

      /**********************************************************************************
          Complex Forward FFT
       **********************************************************************************/
      // calculation is performed in-place the FFT_buffer [re, im, re, im, re, im . . .]
      arm_cfft_f32(S, FFT_buffer, 0, 1);

      /**********************************************************************************
          Complex multiplication with filter mask (precalculated coefficients subjected to an FFT)
       **********************************************************************************/
      arm_cmplx_mult_cmplx_f32 (FFT_buffer, FIR_filter_mask, iFFT_buffer, FFT_length);

      /**********************************************************************************
          Complex inverse FFT
       **********************************************************************************/
      arm_cfft_f32(iS, iFFT_buffer, 1, 1);

      /**********************************************************************************
          Overlap and save algorithm, which simply means yóu take only the right part of the buffer and discard the left part
       **********************************************************************************/
        for (unsigned i = 0; i < FFT_length / 2; i++)
        {
          float_buffer_L[i] = iFFT_buffer[FFT_length + i * 2];
          float_buffer_R[i] = iFFT_buffer[FFT_length + i * 2 + 1];
        }
       /**********************************************************************************
          Demodulation / manipulation / do whatever you want 
       **********************************************************************************/
      //    at this time, just put filtered audio (interleaved format, overlap & save) into left and right channel       

       /**********************************************************************
          CONVERT TO INTEGER AND PLAY AUDIO - Push audio into I2S audio chain
       **********************************************************************/
      for (int i = 0; i < N_BLOCKS; i++)
        {
          sp_L = Q_out_L.getBuffer();    
          sp_R = Q_out_R.getBuffer();
          arm_float_to_q15 (&float_buffer_L[BUFFER_SIZE * i], sp_L, BUFFER_SIZE); 
          arm_float_to_q15 (&float_buffer_R[BUFFER_SIZE * i], sp_R, BUFFER_SIZE);
          Q_out_L.playBuffer(); // play it !  
          Q_out_R.playBuffer(); // play it !
        }

       /**********************************************************************************
          PRINT ROUTINE FOR ELAPSED MICROSECONDS
       **********************************************************************************/
#ifdef DEBUG
      sum = sum + usec;
      idx_t++;
      if (idx_t > 40) {
        mean = sum / idx_t;
        if (mean / 29.00 / N_BLOCKS * SAMPLE_RATE / AUDIO_SAMPLE_RATE_EXACT < 100.0)
        {
          Serial.print("processor load:  ");
          Serial.print (mean / 29.00 / N_BLOCKS * SAMPLE_RATE / AUDIO_SAMPLE_RATE_EXACT);
          Serial.println("%");
        }
        else
        {
          Serial.println("100%");
        }
        Serial.print (mean);
        Serial.print (" microsec for ");
        Serial.print (N_BLOCKS);
        Serial.print ("  stereo blocks    ");
        Serial.print("FFT-length = "); Serial.print(FFT_length);
        Serial.print(";   FIR filter length = "); Serial.println(m_NumTaps);

        idx_t = 0;
        sum = 0;
      }
#endif
    } // end of audio process loop
    
      /**********************************************************************************
          Add button check etc. here
       **********************************************************************************/

}

//////////////////////////////////////////////////////////////////////
//  Call to setup complex filter parameters [can process two channels at the same time!]
//  The two channels could be stereo audio or I and Q channel in a Software Defined Radio
// SampleRate in Hz
// FLowcut is low cutoff frequency of filter in Hz
// FHicut is high cutoff frequency of filter in Hz
// Offset is the CW tone offset frequency
// cutoff frequencies range from -SampleRate/2 to +SampleRate/2
//  HiCut must be greater than LowCut
//    example to make 2700Hz USB filter:
//  SetupParameters( 100, 2800, 0, 48000);
//////////////////////////////////////////////////////////////////////

void calc_cplx_FIR_coeffs (double * coeffs_I, double * coeffs_Q, int numCoeffs, double FLoCut, double FHiCut, double SampleRate)
// pointer to coefficients variable, no. of coefficients to calculate, frequency where it happens, stopband attenuation in dB,
// filter type, half-filter bandwidth (only for bandpass and notch)
{
  //calculate some normalized filter parameters
  double nFL = FLoCut / SampleRate;
  double nFH = FHiCut / SampleRate;
  double nFc = (nFH - nFL) / 2.0; //prototype LP filter cutoff
  double nFs = PI * (nFH + nFL); //2 PI times required frequency shift (FHiCut+FLoCut)/2
  double fCenter = 0.5 * (double)(numCoeffs - 1); //floating point center index of FIR filter

  for (int i = 0; i < numCoeffs; i++) //zero pad entire coefficient buffer
  {
    coeffs_I[i] = 0.0;
    coeffs_Q[i] = 0.0;
  }

  //create LP FIR windowed sinc, sin(x)/x complex LP filter coefficients
  for (int i = 0; i < numCoeffs; i++)
  {
    double x = (float32_t)i - fCenter;
    double z;
    if ( abs((double)i - fCenter) < 0.01) //deal with odd size filter singularity where sin(0)/0==1
      z = 2.0 * nFc;
    else
      switch (FIR_filter_window) {
        case 1:    // 4-term Blackman-Harris --> this is what Power SDR uses
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              (0.35875 - 0.48829 * cos( (TWO_PI * i) / (numCoeffs - 1) )
               + 0.14128 * cos( (FOURPI * i) / (numCoeffs - 1) )
               - 0.01168 * cos( (SIXPI * i) / (numCoeffs - 1) ) );
          break;
        case 2:
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              (0.355768 - 0.487396 * cos( (TWO_PI * i) / (numCoeffs - 1) )
               + 0.144232 * cos( (FOURPI * i) / (numCoeffs - 1) )
               - 0.012604 * cos( (SIXPI * i) / (numCoeffs - 1) ) );
          break;
        case 3: // cosine
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              cos((PI * (float32_t)i) / (numCoeffs - 1));
          break;
        case 4: // Hann
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              0.5 * (double)(1.0 - (cos(PI * 2 * (double)i / (double)(numCoeffs - 1))));
          break;
        default: // Blackman-Nuttall window
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              (0.3635819
               - 0.4891775 * cos( (TWO_PI * i) / (numCoeffs - 1) )
               + 0.1365995 * cos( (FOURPI * i) / (numCoeffs - 1) )
               - 0.0106411 * cos( (SIXPI * i) / (numCoeffs - 1) ) );
          break;
      }
    //shift lowpass filter coefficients in frequency by (hicut+lowcut)/2 to form bandpass filter anywhere in range
    coeffs_I[i]  =  z * cos(nFs * x);
    coeffs_Q[i] = z * sin(nFs * x);
  }
}


void init_filter_mask()
{
  /****************************************************************************************
     Calculate the FFT of the FIR filter coefficients once to produce the FIR filter mask
  ****************************************************************************************/
  // the FIR has exactly m_NumTaps and a maximum of (FFT_length / 2) + 1 taps = coefficients, so we have to add (FFT_length / 2) -1 zeros before the FFT
  // in order to produce a FFT_length point input buffer for the FFT
  // copy coefficients into real values of first part of buffer, rest is zero
  for (unsigned i = 0; i < m_NumTaps; i++)
  {
    FIR_filter_mask[i * 2] = FIR_Coef_I [i];
    FIR_filter_mask[i * 2 + 1] = FIR_Coef_Q [i];
  }

  for (unsigned i = FFT_length + 1; i < FFT_length * 2; i++)
  {
    FIR_filter_mask[i] = 0.0;
  }
  // FFT of FIR_filter_mask
  // perform FFT (in-place), needs only to be done once (or every time the filter coeffs change)
  arm_cfft_f32(maskS, FIR_filter_mask, 0, 1);

} // end init_filter_mask

The code takes stereo I2S input and performs bandpass filtering of both channels simultaneously (in one complex FFT/cplxmultiply/iFFT operation) and outputs them as I2S output to the SGTL5000 outputs.

You can choose the FFT size (equivalent to steepness of the filter skirts / number of FIR filter taps). It uses fast convolution with overlap & save and 50% overlap and 4-term-Blackman-Harris-windowing. You can alter the FFT size [256, 512, 1024, 2048, 4096], which is equivalent to number of FIR taps [129, 513, 1025, 2049] and the FFT window. Also try to change the lower and upper bandpass cutoff frequency for the filter.

The filter response with a 4096-sized FFT has skirts so steep, that you will never want any larger filter response ;-).

The processor load is so amazingly low, that I can hardly believe it (they are output in the serial monitor). Maybe someone would be willing to check, if I did the math right in the sketch . . . Here are the figures:

FFT size	no. of FIR taps	processing time	no. of AUDIO Blocks	processor load	memory used
256	129	69µsec	1	2.38%	160kbytes
512	257	133µsec	2	2.29%	173kbytes
1024	513	303µsec	4	2.61%	193kbytes
2048	1025	639µsec	8	2.75%	234kbytes
4096	2049	1249µsec	16	2.69%	316kbytes

Amazingly, the processor load does not differ much between the different FFT sizes and even drops when going from 1025 to 2049 FIR taps. However, memory usage is excessive!!! Can anybody explain this? It is not really fully explained by sum of the sizes of the assigned variables in the sketch . . .

All the best,

Frank DD4WH

P.S.: if the figures for the processor load are really calculated correctly, we would really have to think about using fast convolution filtering for even the smallest filtering tasks, eg. a parametric Equalizer, this could solve the problems associated with low frequency IIR filtering, for example.

manitou · Aug 18, 2019

Do you have results for same test on T3.5 or T3.6 (with DSP lib upgraded to same version as T4)? I think Paul had complained that the newer versions of DSP lib consume a lot more RAM.

tschrama · Aug 18, 2019

Following this thread... thx for teating the fast convolution!

bmillier · Aug 18, 2019

Hi Frank: Good to see you have tried the convolution filter on T4. You might recall from another, older thread that I had "wrapped" your convolution routine into an Audio library object (filter_convolution). I just received a couple of T4s and have updated Arduino IDE/Teensyduino/Visual Micro to the new versions that support T4. Once I added my filter_convolution files to the Audio library folder in the new Arduino 1.8.9 folder, I was able to compile my test program with no errors, so it is looking good. Its great that you don't have to change the arm_math.h file to use CMSIS4/5 like you used to have to do with the T3.5/6.
I don't have an audio adapter board wired up to the T4 yet, so I can't actually test that the code is working, but I'm optimistic.
Cheers

DD4WH · Aug 18, 2019

Hi Brian!
yes, first I tried your convolution object with the T4 and it worked nicely!

The reason why I put up this sketch was:

* it is Stereo
* it divides the processor load evenly between audio blocks
* it is flexible in the size of the FFT (and thus in the filter length)

However, your sketch has the large advantage of being compatible with the audio library, I have not managed to do this with my code, because it would all have to be in one audio lib object in order not to have to convert from float to int and vice versa . . .

At the moment I am working on uniformly partitioned convolution, because the latency of the above code is really high, especially with long filter lengths and thus FFTs of length 2048/4096.

But at the moment I have to spend some time on the interplay between the T4.0 and the audio board. I do not seem to be able to run that combination reliably, it arbitrarily causes crackling and drizzling noises and I do not seem to be able to find out why. Eg. at the moment I cannot run even the StereoPassThrough example reliably.

@manitou: thanks for your suggestion!
Yes, I will try the code with the T3.6 and report back my figures.
IIRC, Paul was refering to the use of Flash memory by the CMSIS lib (for lookup tables), not RAM (but I may be wrong here). The figures above refer to the use of RAM and the figures do not seem to fit my defined variables, so that is what I wondered about.

All the best, Frank DD4WH

manitou · Aug 19, 2019

I ran your sketch (no audio shield attached), optimize Faster

Code:

4K FFT
  T4    1254 us   3%   313KB  arm_math.h v1.5.1
  T3.6  7073 us  15%   188KB  v1.5.3
  T3.5 11538 us  25%

I also have some DSP FFT benchmark numbers at
https://forum.pjrc.com/threads/54711-Teensy-4-0-First-Beta-Test?p=194187&viewfull=1#post194187

bmillier · Aug 19, 2019

Hi Frank: Good to converse with you again. Thanks for trying out my convolution audio lib. routine. I was going to do that as soon as I got the Audio shield hooked up (I was using only a PT8211 I2S DAC til now)
So, I just finished wiring up the Audio adapter. I did so on a Vector 8001 prototype PC board where the T4 was already mounted. The T4 and Audio adapter are adjacent and the interconnecting wires are 5-9 cm long (and soldered of course). The Audio adapter (mine is an older version) works fine for me in this configuration- tried the Stereo passthrough (which doesn't involve passing any data into the T4 and back out again- just sets up the SGTL5000 via I2C). Also the "guitar" example works fine, which does use T4 processing of course. I'll try my convolution lib object next, but you've already tested that so it should be OK.
I am guessing that your "flying wires" may be the problem. as the T4 interface seems fine for me.
I am questioning what you mean by your statement "it divides the processor load evenly between audio blocks " I believe that your code blocks until N_Blocks have arrived in the queue, then it does all of the int/float conversion, FFT/iFFT, float/int and then sends N_blocks to output queue. Granted, the input and output memory blocks are being filled/emptied via DMA, more or less concurrently, in the background. But, the whole filtering process is occurring in one chunk of time, with little interruptions when an ISR fires to re-configure the DMA controller for each block.
My version, by necessity, must handle everything during the filter.update routine. Each incoming audio block is converted to float and stored in FFT memory, during each filter.update call. Each filter.update I also convert 1 block of filtered audio (from the last FFT/iFFT) from float/int and then send it out. Of course, the big processing job is the FFT/iFFT and I can only do that once every 4 filter.updates (when there is enough data to process). So, I am spreading my processing out somewhat, but the majority of the processing time (47% of T3.5/6) is indeed spent in the filter.update block that actually does the FFT/iFFT routine. This must fit into the time interval between blocks (2.9 ms). With the T3.5/6 this does indeed limit how much else the processor can handle, but seemed like the only way to do it and have it work as a library object.

Like you, the latency is what really limits how useful this filter is. Its a while back, but I think the latency from my audio lib convolution object was about 14 mS for the 1024-pt FFT.

I think the segmented FFT idea came up during the older thread we were involved with, but I can't say that I understand this concept.

However, the T4 is so fast that it occurred to me that one could:
1) at each filter.update, add the latest incoming block to the end of 3 older blocks (that have been stored in an array)
2) do the 1024-pt FFT/iFFT using overlap/save
3) pick out the proper(??) 128-sample segment from the iFFT routine and send it to the output queue.
4) do 1,2,3 for every block that comes in. This would seem very inefficient in terms of MCU utilization, but it looks like the T4 has plenty of horsepower to handle it. Plus, it should reduce the latency to a bit more than 1 or 2 (??) audio block times (2.9 mS).
Does this make any sense to you? Of course, if you can figure out how to code the segmented FFT idea, then that is probably a much better path.
Re the large RAM utilization of the T4 FFT/iFFT routines: I know that CMSIS has large constant tables that they use to do the FFT routine. These would start out in Flash of course, but maybe they move them to RAM as that would speed things up considerably. This would explain the large RAM usage, I believe, and may also be why the T4 runs FFTs so much quicker. Just a thought...
Cheers

manitou · Aug 19, 2019

bmillier said:
Re the large RAM utilization of the T4 FFT/iFFT routines: I know that CMSIS has large constant tables that they use to do the FFT routine. These would start out in Flash of course, but maybe they move them to RAM as that would speed things up considerably. This would explain the large RAM usage, I believe, and may also be why the T4 runs FFTs so much quicker. Just a thought...
Cheers

Indeed, for the T4, all of the flash is copied to RAM (hardware/teensy/avr/cores/teensy4/startup.c), so FASTRUN has no meaning for T4. You can keep things in flash by using PROGMEM. RAM is further divided into ITCM and DTCM, and the remaining OCRAM is cache-controlled. There are many posts about managing malloc/DMAMEM and the cache with respect to peripheral IO.

bmillier · Aug 19, 2019

Thanks manitou- I could understand the CMSIS math/DSP routines being in RAM, as speed is so important there. I just got my T4 yesterday, so I hadn't considered the fact that ALL of the program, const in Flash would be copied to RAM. Good to know.
I think the same thing happens with my ESP32 programs, as they take up much more RAM than I expect. But, ESP32 uses QSPI Flash for program storage, so there is a big difference between thel flash and RAM access times.
Cheers

DD4WH · Aug 19, 2019

bmillier said:
I am guessing that your "flying wires" may be the problem. as the T4 interface seems fine for me.

You are probably right, I now soldered the T4 and the audio board with very short wires and its still quite sensitive, I use a lower sample rate (and a lower MCLK freq) for the moment in order to be able to experiment.

I am questioning what you mean by your statement "it divides the processor load evenly between audio blocks " I believe that your code blocks until N_Blocks have arrived in the queue, then it does all of the int/float conversion, FFT/iFFT, float/int and then sends N_blocks to output queue. Granted, the input and output memory blocks are being filled/emptied via DMA, more or less concurrently, in the background. But, the whole filtering process is occurring in one chunk of time, with little interruptions when an ISR fires to re-configure the DMA controller for each block.

Yes, I think I was wrong with my statement, will think about that a little more.

Like you, the latency is what really limits how useful this filter is. Its a while back, but I think the latency from my audio lib convolution object was about 14 mS for the 1024-pt FFT. I think the segmented FFT idea came up during the older thread we were involved with, but I can't say that I understand this concept.

Me neither, but I have been going over this again and again and also I found some wonderful code in the wdsp lib by Warren Pratt to do this.

Here is my very first attempt at a

LOW LATENCY UNIFORMLY PARTITIONED FAST CONVOLUTION FILTER

It should have no more than 2.9msec latency and can do FIR filtering with up to 8192 taps.

(please test and search for bugs and improvements!)

Code:

/*
 * First attempt
 * 
 * Real Time PARTITIONED BLOCK CONVOLUTION FILTERING in Stereo
 * 
 * uses Teensy 4.0 and the Teensy audio board connected with flying wires
 * 
 * Frank DD4WH 2019-08-18
 * 
 * uses code from wdsp library by Warren Pratt
 * https://github.com/g0orx/wdsp/blob/master/firmin.c
 * 
 * ATTENTION: connecting the T4 and the audio board with flying wires is a bad idea (MCLK connected via a 100 Ohms resistor)
 * I had to keep the sample rate low in order to make it work at all --> MCLK is lower then and that means less interference . . .
 * waiting for the T4 audio board ;-)
 * 
 * in the public domain GNU GPL
 */

#include <Audio.h>
#include <Wire.h>
#include <SPI.h>
#include <arm_math.h>
#include <arm_const_structs.h> // in the Teensy 4.0 audio library, the ARM CMSIS DSP lib is already a newer version
#include <utility/imxrt_hw.h> // necessary for setting the sample rate, thanks FrankB !


//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// USER DEFINES
//uncomment for pass-thru
//#define PASSTHRU_AUDIO
// define your lower and upper frequency for the bandpass filter
//double FLoCut = 150.0;
//double FHiCut = 2700.0;
double FLoCut = 5.0;
double FHiCut = 20000.0;
// define your sample rate
double SAMPLE_RATE = 48000;  
// define the number of FIR taps of your filter
//const int nc = 1024; // number of taps for the FIR filter
//const int nc = 4096; // number of taps for the FIR filter
const int nc = 8192; // number of taps for the FIR filter
// the latency of the filter is ALWAYS the same regardless of the number of taps for the filter
// this translates to a latency of 128/sample rate, ie. to 2.9msec with 44.1ksps
const int partitionsize = 128; 
///////////////////////////////////////////////////////////////////////////////////////////////////////////////


#define DEBUG
#define FOURPI  (2.0 * TWO_PI)
#define SIXPI   (3.0 * TWO_PI)
#define BUFFER_SIZE 128
int32_t sum;
int idx_t = 0;
float32_t mean;
int16_t *sp_L;
int16_t *sp_R;
uint8_t FIR_filter_window = 1;
uint8_t first_block = 1; 
const uint32_t FFT_L = 256; 
uint32_t FFT_length = FFT_L;
const int nfor = nc / partitionsize; // number of partition blocks --> nfor = nc / partitionsize
double cplxcoeffs[nc * 2]; // this holds the initial complex coefficients for the filter BEFORE partitioning
float32_t maskgen[FFT_L * 2];
float32_t fmask[nfor][FFT_L * 2] DMAMEM; // otherwise it wont fit into the RAM
float32_t fftin[FFT_L * 2];
float32_t fftout[nfor][FFT_L * 2];
float32_t accum[FFT_L * 2];

int buffidx = 0;
int k = 0;
//int idxmask = nfor - 1;

const uint32_t N_B = FFT_L / 2 / BUFFER_SIZE;
uint32_t N_BLOCKS = N_B;
float32_t float_buffer_L [BUFFER_SIZE * N_B];  // 2048 * 4 = 8kb
float32_t float_buffer_R [BUFFER_SIZE * N_B]; // 2048 * 4 = 8kb

float32_t last_sample_buffer_L [BUFFER_SIZE * N_B];  // 2048 * 4 = 8kb
float32_t last_sample_buffer_R [BUFFER_SIZE * N_B]; // 2048 * 4 = 8kb
// complex FFT with the new library CMSIS V4.5
const static arm_cfft_instance_f32 *S;
// complex iFFT with the new library CMSIS V4.5
const static arm_cfft_instance_f32 *iS;
// FFT instance for direct calculation of the filter mask
// from the impulse response of the FIR - the coefficients
const static arm_cfft_instance_f32 *maskS;

// this audio comes from the codec by I2S
AudioInputI2S            i2s_in;
AudioRecordQueue         Q_in_L;
AudioRecordQueue         Q_in_R;
AudioMixer4              mixleft;
AudioMixer4              mixright;
AudioPlayQueue           Q_out_L;
AudioPlayQueue           Q_out_R;
AudioOutputI2S           i2s_out;

AudioControlSGTL5000     sgtl5000_1;

AudioConnection          patchCord1(i2s_in, 0, Q_in_L, 0);
AudioConnection          patchCord2(i2s_in, 1, Q_in_R, 0);

AudioConnection          patchCord3(Q_out_L, 0, mixleft, 0);
AudioConnection          patchCord4(Q_out_R, 0, mixright, 0);
AudioConnection          patchCord9(mixleft, 0,  i2s_out, 1);
AudioConnection          patchCord10(mixright, 0, i2s_out, 0);


void setup() {
  Serial.begin(115200);
  delay(100);

  AudioMemory(10); 
  delay(100);

  /****************************************************************************************
     Audio Shield Setup
  ****************************************************************************************/
  sgtl5000_1.enable();
  sgtl5000_1.inputSelect(AUDIO_INPUT_LINEIN);
  sgtl5000_1.lineInLevel(9);
  sgtl5000_1.volume(0.5); 

  mixleft.gain(0, 1.0);
  mixright.gain(0, 1.0);

  setI2SFreq(SAMPLE_RATE);

  /****************************************************************************************
     set filter bandwidth
  ****************************************************************************************/
  // this routine does all the magic of calculating the FIR coeffs
  // calc_cplx_FIR_coeffs_interleaved (cplxcoeffs, nc, FLoCut, FHiCut, SAMPLE_RATE);
  fir_bandpass (cplxcoeffs, nc, FLoCut, FHiCut, SAMPLE_RATE, 0, 1, 1.0);

  /****************************************************************************************
     init complex FFTs
  ****************************************************************************************/
  switch (FFT_length)
  {
    case 256:
      S = &arm_cfft_sR_f32_len256;
      iS = &arm_cfft_sR_f32_len256;
      maskS = &arm_cfft_sR_f32_len256;
      break;
    case 512:
      S = &arm_cfft_sR_f32_len512;
      iS = &arm_cfft_sR_f32_len512;
      maskS = &arm_cfft_sR_f32_len512;
      break;
    case 1024:
      S = &arm_cfft_sR_f32_len1024;
      iS = &arm_cfft_sR_f32_len1024;
      maskS = &arm_cfft_sR_f32_len1024;
      break;
    case 2048:
      S = &arm_cfft_sR_f32_len2048;
      iS = &arm_cfft_sR_f32_len2048;
      maskS = &arm_cfft_sR_f32_len2048;
      break;
    case 4096:
      S = &arm_cfft_sR_f32_len4096;
      iS = &arm_cfft_sR_f32_len4096;
      maskS = &arm_cfft_sR_f32_len4096;
      break;
  }

  /****************************************************************************************
     Calculate the FFT of the FIR filter coefficients once to produce the FIR filter mask
  ****************************************************************************************/
  //init_filter_mask();
    init_partitioned_filter_masks();
    
  /****************************************************************************************
     begin to queue the audio from the audio library
  ****************************************************************************************/
  delay(100);
  Q_in_L.begin();
  Q_in_R.begin();

} // END OF SETUP


void loop() {
  elapsedMicros usec = 0;
  // are there at least N_BLOCKS buffers in each channel available ?
    if (Q_in_L.available() > N_BLOCKS + 0 && Q_in_R.available() > N_BLOCKS + 0)
    {
      // get audio samples from the audio  buffers and convert them to float
      for (unsigned i = 0; i < N_BLOCKS; i++)
      {
        sp_L = Q_in_L.readBuffer();
        sp_R = Q_in_R.readBuffer();

        // convert to float one buffer_size
        // float_buffer samples are now standardized from > -1.0 to < 1.0
        arm_q15_to_float (sp_L, &float_buffer_L[BUFFER_SIZE * i], BUFFER_SIZE); // convert int_buffer to float 32bit
        arm_q15_to_float (sp_R, &float_buffer_R[BUFFER_SIZE * i], BUFFER_SIZE); // convert int_buffer to float 32bit
        Q_in_L.freeBuffer();
        Q_in_R.freeBuffer();
      }
 
      /**********************************************************************************
          Digital convolution
       **********************************************************************************/
      //  basis for this was Lyons, R. (2011): Understanding Digital Processing.
      //  "Fast FIR Filtering using the FFT", pages 688 - 694
      //  numbers for the steps taken from that source
      //  Method used here: overlap-and-save

      // ONLY FOR the VERY FIRST FFT: fill first samples with zeros
      if (first_block) // fill real & imaginaries with zeros for the first BLOCKSIZE samples
      {
        for (unsigned i = 0; i < partitionsize * 4; i++)
        {
          //FFT_buffer[i] = 0.0;
          fftin[i] = 0.0;
        }
        first_block = 0;
      }
      else
      {  // HERE IT STARTS for all other instances
        // fill FFT_buffer with last events audio samples
        for (unsigned i = 0; i < partitionsize; i++)
        {
//          FFT_buffer[i * 2] = last_sample_buffer_L[i]; // real
//          FFT_buffer[i * 2 + 1] = last_sample_buffer_R[i]; // imaginary
          fftin[i * 2] = last_sample_buffer_L[i]; // real
          fftin[i * 2 + 1] = last_sample_buffer_R[i]; // imaginary
        }
      }
      // copy recent samples to last_sample_buffer for next time!
      for (unsigned i = 0; i < partitionsize; i++)
      {
        last_sample_buffer_L [i] = float_buffer_L[i];
        last_sample_buffer_R [i] = float_buffer_R[i];
      }

      // now fill recent audio samples into FFT_buffer (left channel: re, right channel: im)
      for (unsigned i = 0; i < partitionsize; i++)
      {
//        FFT_buffer[FFT_length + i * 2] = float_buffer_L[i]; // real
//        FFT_buffer[FFT_length + i * 2 + 1] = float_buffer_R[i]; // imaginary
        fftin[FFT_length + i * 2] = float_buffer_L[i]; // real
        fftin[FFT_length + i * 2 + 1] = float_buffer_R[i]; // imaginary
      }
      
      /**********************************************************************************
          Complex Forward FFT
       **********************************************************************************/
      // calculation is performed in-place the FFT_buffer [re, im, re, im, re, im . . .]
      arm_cfft_f32(S, fftin, 0, 1);
      for(unsigned i = 0; i < partitionsize * 4; i++)
      {
          fftout[buffidx][i] = fftin[i];
      }

      /**********************************************************************************
          Complex multiplication with filter mask (precalculated coefficients subjected to an FFT)
       **********************************************************************************/
      k = buffidx;

      for(unsigned i = 0; i < partitionsize * 4; i++)
      {
          accum[i] = 0.0;
      }
      
      for(unsigned j = 0; j < nfor; j++)
      { 
          for(unsigned i = 0; i < 2 * partitionsize; i++)
          {
              accum[2 * i + 0] += fftout[k][2 * i + 0] * fmask[j][2 * i + 0] -
                                  fftout[k][2 * i + 1] * fmask[j][2 * i + 1];
              
              accum[2 * i + 1] += fftout[k][2 * i + 0] * fmask[j][2 * i + 1] +
                                  fftout[k][2 * i + 1] * fmask[j][2 * i + 0]; 
          }
          k = k - 1;
          if(k < 0)
          {
            k = nfor - 1;
          }
          //k = (k + idxmask) & idxmask;
      } // end nfor loop

      buffidx = buffidx + 1;
      if(buffidx >= nfor)
      {
          buffidx = 0;    
      }
        //buffidx = (buffidx + 1) & idxmask;
            
      /**********************************************************************************
          Complex inverse FFT
       **********************************************************************************/
      arm_cfft_f32(iS, accum, 1, 1);

      /**********************************************************************************
          Overlap and save algorithm, which simply means yóu take only the right part of the buffer and discard the left part
       **********************************************************************************/
/*        for (unsigned i = 0; i < FFT_length / 2; i++)
        {
          float_buffer_L[i] = accum[FFT_length + i * 2];
          float_buffer_R[i] = accum[FFT_length + i * 2 + 1];
        }
*/      // sonehow it seems it is reversed: I discard the right part and take the left part . . .
        for (unsigned i = 0; i < partitionsize; i++)
        {
          //float_buffer_L[i] = accum[partitionsize * 2 + i * 2 + 0];
          //float_buffer_R[i] = accum[partitionsize * 2 + i * 2 + 1];
          float_buffer_L[i] = accum[i * 2 + 0];
          float_buffer_R[i] = accum[i * 2 + 1];
        }
      
       /**********************************************************************************
          Demodulation / manipulation / do whatever you want 
       **********************************************************************************/
      //    at this time, just put filtered audio (interleaved format, overlap & save) into left and right channel       

       /**********************************************************************
          CONVERT TO INTEGER AND PLAY AUDIO - Push audio into I2S audio chain
       **********************************************************************/
      for (int i = 0; i < N_BLOCKS; i++)
        {
          sp_L = Q_out_L.getBuffer();    
          sp_R = Q_out_R.getBuffer();
          arm_float_to_q15 (&float_buffer_L[BUFFER_SIZE * i], sp_L, BUFFER_SIZE); 
          arm_float_to_q15 (&float_buffer_R[BUFFER_SIZE * i], sp_R, BUFFER_SIZE);
          Q_out_L.playBuffer(); // play it !  
          Q_out_R.playBuffer(); // play it !
        }

       /**********************************************************************************
          PRINT ROUTINE FOR ELAPSED MICROSECONDS
       **********************************************************************************/
#ifdef DEBUG
      sum = sum + usec;
      idx_t++;
      if (idx_t > 40) {
        mean = sum / idx_t;
        if (mean / 29.00 / N_BLOCKS * SAMPLE_RATE / AUDIO_SAMPLE_RATE_EXACT < 100.0)
        {
          Serial.print("processor load:  ");
          Serial.print (mean / 29.00 / N_BLOCKS * SAMPLE_RATE / AUDIO_SAMPLE_RATE_EXACT);
          Serial.println("%");
        }
        else
        {
          Serial.println("100%");
        }
        Serial.print (mean);
        Serial.print (" microsec for ");
        Serial.print (N_BLOCKS);
        Serial.print ("  stereo blocks    ");
        Serial.print("FFT-length = "); Serial.print(FFT_length);
        Serial.print(";   FIR filter length = "); Serial.println(nc);
        Serial.print("k = "); Serial.println(k);
        Serial.print("buffidx = "); Serial.println(buffidx);
        idx_t = 0;
        sum = 0;
      }
#endif
    } // end of audio process loop
    
      /**********************************************************************************
          Add button check etc. here
       **********************************************************************************/

}

//////////////////////////////////////////////////////////////////////
//  Call to setup complex filter parameters [can process two channels at the same time!]
//  The two channels could be stereo audio or I and Q channel in a Software Defined Radio
// SampleRate in Hz
// FLowcut is low cutoff frequency of filter in Hz
// FHicut is high cutoff frequency of filter in Hz
// Offset is the CW tone offset frequency
// cutoff frequencies range from -SampleRate/2 to +SampleRate/2
//  HiCut must be greater than LowCut
//    example to make 2700Hz USB filter:
//  SetupParameters( 100, 2800, 0, 48000);
//////////////////////////////////////////////////////////////////////

void  calc_cplx_FIR_coeffs_interleaved (double *impulse,  int numCoeffs, double FLoCut, double FHiCut, double SampleRate)
//void calc_cplx_FIR_coeffs (double * coeffs_I, double * coeffs_Q, int numCoeffs, double FLoCut, double FHiCut, double SampleRate)
// pointer to coefficients variable, no. of coefficients to calculate, frequency where it happens, stopband attenuation in dB,
// filter type, half-filter bandwidth (only for bandpass and notch)
{
  //calculate some normalized filter parameters
  double nFL = FLoCut / SampleRate;
  double nFH = FHiCut / SampleRate;
  double nFc = (nFH - nFL) / 2.0; //prototype LP filter cutoff
  double nFs = PI * (nFH + nFL); //2 PI times required frequency shift (FHiCut+FLoCut)/2
  double fCenter = 0.5 * (double)(numCoeffs - 1); //floating point center index of FIR filter

  for (int i = 0; i < numCoeffs * 2; i++) //zero pad entire coefficient buffer
  {
    impulse[i] = 0.0;
  }

  //create LP FIR windowed sinc, sin(x)/x complex LP filter coefficients
  for (int i = 0; i < numCoeffs; i++)
  {
    double x = (float32_t)i - fCenter;
    double z;
    if ( abs((double)i - fCenter) < 0.01) //deal with odd size filter singularity where sin(0)/0==1
      z = 2.0 * nFc;
    else
      switch (FIR_filter_window) {
        case 1:    // 4-term Blackman-Harris --> this is what Power SDR uses
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              (0.35875 - 0.48829 * cos( (TWO_PI * i) / (numCoeffs - 1) )
               + 0.14128 * cos( (FOURPI * i) / (numCoeffs - 1) )
               - 0.01168 * cos( (SIXPI * i) / (numCoeffs - 1) ) );
          break;
        case 2:
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              (0.355768 - 0.487396 * cos( (TWO_PI * i) / (numCoeffs - 1) )
               + 0.144232 * cos( (FOURPI * i) / (numCoeffs - 1) )
               - 0.012604 * cos( (SIXPI * i) / (numCoeffs - 1) ) );
          break;
        case 3: // cosine
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              cos((PI * (float32_t)i) / (numCoeffs - 1));
          break;
        case 4: // Hann
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              0.5 * (double)(1.0 - (cos(PI * 2 * (double)i / (double)(numCoeffs - 1))));
          break;
        default: // Blackman-Nuttall window
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              (0.3635819
               - 0.4891775 * cos( (TWO_PI * i) / (numCoeffs - 1) )
               + 0.1365995 * cos( (FOURPI * i) / (numCoeffs - 1) )
               - 0.0106411 * cos( (SIXPI * i) / (numCoeffs - 1) ) );
          break;
      }
    //shift lowpass filter coefficients in frequency by (hicut+lowcut)/2 to form bandpass filter anywhere in range
    impulse[i * 2 + 0] = z * cos(nFs * x);
    impulse[i * 2 + 1] = z * sin(nFs * x);
    //coeffs_I[i]  =  z * cos(nFs * x);
    //coeffs_Q[i] = z * sin(nFs * x);
  }
}


void init_partitioned_filter_masks()
{
#ifndef PASSTHRU_AUDIO
    for(unsigned j = 0; j < nfor;j++)
    {
      // fill with zeroes
      for (unsigned i = 0; i < partitionsize * 4; i++)
      {
          maskgen[i] = 0.0;  
      }
      // take part of impulse response and fill into maskgen
      for (unsigned i = 0; i < partitionsize * 2; i++)
      {
          maskgen[i + partitionsize * 2] = cplxcoeffs[i + j * partitionsize * 2];  
          //maskgen[i] = cplxcoeffs[i + j * partitionsize * 2];  
          //maskgen[i] = cplxcoeffs[i + j * partitionsize * 2];  
      }
      // perform complex FFT on maskgen
      arm_cfft_f32(maskS, maskgen, 0, 1);
      // fill into fmask array
      for (unsigned i = 0; i < partitionsize * 4; i++)
      {
          fmask[j][i] = maskgen[i];  
          //fmask[j][i + partitionsize * 2] = 0.0;
      }    
    }
    
#else
// passthru
    
    for(unsigned j = 0; j < nfor;j++)
    {
      // fill with zeroes
      for (unsigned i = 0; i < partitionsize * 4; i++)
      {
          maskgen[i] = 0.0;  
      }
      if(j==0) maskgen[0] = 1.0; 
      arm_cfft_f32(maskS, maskgen, 0, 1);      
      for (unsigned i = 0; i < partitionsize * 4; i++)
      {
          fmask[j][i] = maskgen[i];  
      } 
    }
#endif
}


// taken from wdsp library by Warren Pratt
void fir_bandpass (double * impulse, int N, double f_low, double f_high, double samplerate, int wintype, int rtype, double scale)
{
  double ft = (f_high - f_low) / (2.0 * samplerate);
  double ft_rad = TWO_PI * ft;
  double w_osc = PI * (f_high + f_low) / samplerate;
  int i, j;
  double m = 0.5 * (double)(N - 1);
  double delta = PI / m;
  double cosphi;
  double posi, posj;
  double sinc, window, coef;

  if (N & 1)
  {
    switch (rtype)
    {
    case 0:
      impulse[N >> 1] = scale * 2.0 * ft;
      break;
    case 1:
      impulse[N - 1] = scale * 2.0 * ft;
      impulse[  N  ] = 0.0;
      break;
    }
  }
  for (i = (N + 1) / 2, j = N / 2 - 1; i < N; i++, j--)
  {
    posi = (double)i - m;
    posj = (double)j - m;
    sinc = sin (ft_rad * posi) / (PI * posi);
    switch (wintype)
    {
    case 0: // Blackman-Harris 4-term
      cosphi = cos (delta * i);
      window  =             + 0.21747
          + cosphi *  ( - 0.45325
          + cosphi *  ( + 0.28256
          + cosphi *  ( - 0.04672 )));
      break;
    case 1: // Blackman-Harris 7-term
      cosphi = cos (delta * i);
      window  =       + 6.3964424114390378e-02
          + cosphi *  ( - 2.3993864599352804e-01
          + cosphi *  ( + 3.5015956323820469e-01
          + cosphi *  ( - 2.4774111897080783e-01
          + cosphi *  ( + 8.5438256055858031e-02
          + cosphi *  ( - 1.2320203369293225e-02
          + cosphi *  ( + 4.3778825791773474e-04 ))))));
      break;
    }
    coef = scale * sinc * window;
    switch (rtype)
    {
    case 0:
      impulse[i] = + coef * cos (posi * w_osc);
      impulse[j] = + coef * cos (posj * w_osc);
      break;
    case 1:
      impulse[2 * i + 0] = + coef * cos (posi * w_osc);
      impulse[2 * i + 1] = - coef * sin (posi * w_osc);
      impulse[2 * j + 0] = + coef * cos (posj * w_osc);
      impulse[2 * j + 1] = - coef * sin (posj * w_osc);
      break;
    }
  }
}

void setI2SFreq(int freq) {
  // thanks FrankB !
  // PLL between 27*24 = 648MHz und 54*24=1296MHz
  int n1 = 4; //SAI prescaler 4 => (n1*n2) = multiple of 4
  int n2 = 1 + (24000000 * 27) / (freq * 256 * n1);
  double C = ((double)freq * 256 * n1 * n2) / 24000000;
  int c0 = C;
  int c2 = 10000;
  int c1 = C * c2 - (c0 * c2);
  set_audioClock(c0, c1, c2, true);
  CCM_CS1CDR = (CCM_CS1CDR & ~(CCM_CS1CDR_SAI1_CLK_PRED_MASK | CCM_CS1CDR_SAI1_CLK_PODF_MASK))
       | CCM_CS1CDR_SAI1_CLK_PRED(n1-1) // &0x07
       | CCM_CS1CDR_SAI1_CLK_PODF(n2-1); // &0x3f 
//Serial.printf("SetI2SFreq(%d)\n",freq);
}

The code uses 537kbytes of RAM and has 12.8% of processor load with a 8192 tap FIR filter.

Have fun with the Teensy 4.0!

Frank DD4WH

bmillier · Aug 19, 2019

Hi Frank: I’m surprised your audio board isn’t working with short wires. I’m running at 44100 Hz. I notice that unlike the T3x that the T4 provides exactly 44100- I checked on my freq. counter and it was within 1 Hz.
My Vector board has really wide GND and Vcc rails- that is something that might be different in my setup from yours. I haven’t yet ‘scoped the various I2S lines, as mine worked right from the star, but I will do this.
Wow- that’s a lot of code you have in that new routine!! I’m on my iPad, so its too hard to examine it until I get on my PC with the 2 big screens! The 1 audio block latency is excellent however- theoretically you couldn’t do better than that in a block transfer system like the Teensy library. I’m not sure how you measure the latency. I feed in a short pulse to the Audio input and use the ‘scope to see the time delay from input to output. Also, I measure the processor usage by putting a DigitalWrite (pin, HIGH) at the beginning and DigitalWrite(pin,LOW) at the end of a routine, and watch the GPIO pin with my ‘scope. This is a pretty accurate way of knowing how long the routine will take, but it doesn’t take into consideration any other ISR tasks the MCU may be doing concurrently, and of course the DMA is totally independent too.
I will try your code as soon as I get a chance, and am feeling bright and alert!
Cheers

bmillier · Aug 20, 2019

Hi Frank: I loaded your program into my setup and it works. I can set Fhigh at 1000 Hz, and the vocal track coming thru has all the high freqs filtered out. I am assuming that the filter cutoff is "brick-wall" as you say, at such high FIR lengths. I haven't yet hooked up a signal generator to measure the skirt steepness.
However, the latency is definitely not 2.9 ms!
I fed in a short pulse and measured the delay time from input to output. At 8192 it was 107.6 mS, and dropped to 26 mS @ 1024, 20 ms at 512. This is about twice the latency of my library routine which uses 512 tap filter
(1024 FFT)

This is my 'scope capture for the 1024 tap setting- 26.8 ms latency
I can see from the code that your routine only waits til 1 audio block is available, Also at the end, it only sends 1 block at a time. So that looks great. However, I am thinking that the signal, (short pulse in test case) is slowly working its way through the 8 segments (1024/128) and that is what results in the latency.
I may be doing something wrong, but my measurement setup is the same as I used last year to test my fconv. filter library, and it came out at 15 ms, just a few ms more than the 4 block times (I processed 4 blocks at a time). So, I think it is OK
Any thoughts?
cheers

DD4WH · Aug 20, 2019

Hi Brian,

thats indeed strange. Thanks for your measurement effort, thats not possible here at my work bench

.

As you said, one block of 128 samples is fed into the audio chain, and one block is sent out. With 8192 taps, the algorithm is processing all the 8192/128 = 64 filter mask blocks in the complex multiply for each of the 128 sample blocks, so there should be no more latency caused by the algorithm.

I am still struggling with the noise caused by the T4 MCLK signal, I do not seem to be able to run the Teensy audio board properly with the T4 . . .

I will have to wait until the T4-compatible audio board is out or until I have understood how to alter drive strength in the I2S signals.

BTW, I put together some info and resources on the algorithm:

https://github.com/df8oe/UHSDR/wiki/Low-latency-filtering:-Partitioned-Convolution

bmillier · Aug 20, 2019

Hi Frank: I noticed that you were running the new filtering routine at 48000 SR where I use 44100. But, I changed my SR to 48000, and everything continued to work with no pops/crackles. I don't know if a new version audio board will work any better- mine is a 2-3 year old model. You did insert a 100 ohm resistor in series with the MCLK lead, right?
Here is what my board looks like from the bottom- so you can see the lead lengths etc.

I'll look at your resources link. Thanks
cheers

bmillier · Aug 21, 2019

Hi Frank: I looked over your resources link. The pseudocode you listed is easier to follow than straight C code. Before proceeding, I have to say that my experience is in electronics first, then programming, with advanced math a distant third!
I have noticed a few things:
1) The term latency is defined as the time needed to acquire the required number of samples. In that respect, the 2.9 ms you quote is definitely true at SR of 44100 and 128 s/block. However, when I refer to latency, I do so in terms of what an audiophile or a musician would- i.e. the time delay of a signal from input to output of a given device. That is what I was measuring on the scope capture I gave you.
I have put my measured time delay numbers into an excel spreadsheet and the data defines a straight line with an equation:
latency (ms) = 14.45 + (#of filter taps X 0.0114)
That implies that there would be a fixed 14.45 ms delay in the signal processing even if taps = 0, and 11.4 ms delay for every 1000 taps (just rounding off here- you would be using multiples of 512)
So, I ran the same test on the Teensy audio example "PassThroughStereo". I had earlier mentioned that I thought that this example just configured the SGTL5000 to pass its ADC signal to the DAC directly. You can configure it this way if you want, But,I was wrong- that example actually passes the audio through the audio library without any processing. In this case there is a 6.4 ms delay, or latency as I call it. That looks like 2.9 ms input latency, some other misc. delay in re-setting the DMA controllers, and 2.9 ms latency in the output queue.

So 6.4 ms is "the bottom line"- no filter algorithm will do better than that if its using the Teensy Audio library infrastructure. I pretty sure I'm correct about this, unless I've done something stupid in my scope measurements.

2) That being said, I also confirm your 686 us total execution time for the code in the algorithm (in hardware, using my 'scope). So, where is all the delay coming from? I like to look at it in the time domain.
For the 8192 tap filter, for example, you have 64 discrete blocks forming a circular input buffer, containing 63 "older" signal blocks + the current one. I am thinking that until the block representing the "current" time lines up in that buffer with the centre of the FIR image array, during the convolution, you won't see any appreciable output. I am specifically referring here to the short (2 ms) pulse that I am feeding in. This is shorter than the 128 sample block time of 2.9 ms, and models the "unit impulse" that the DSP textbooks refer to. So, out of 64 blocks, only one will be filled with the pulse height value, all others will be zero. A Low pass FIR image, at least in the time domain, is a high amplitude peak in the centre of the filter image array, with only small spurs on either side. So, you can see my idea that even though the processing time of your routine is only 680 us, the latency (which increases linearly with # of taps) must result from the time it takes the input signal to work its way through to the centre of that 64 block input array. This agrees well with the measurements I took. 32 blocks X 2.9 ms = 92.8 ms. Adding in the 14.5 ms inherent delay that I got from my Excel formula, makes 107.3, which is what I measured on the scope for 8192 tap filter.
3) You must be doing all of the math correctly, or the music signal that I can send in from my iPhone wouldn't be getting through the filter un-modified (with your defaults of LP=5, HP= 20000) . ( I know from hours of fooling around with my library version of the conv. filter, that any little error results in a "trash" signal!!) . Possibly you are choosing the wrong pointer(s) into the input data array, which give a perfect output signal, but with a big delay in the signal itself. If not, then I don't see how those other people you cite, can be achieving lower input-output latency values that they claim.
I'll be looking at this some more, but I don't have the math skills you (and the authors you cite) do, so I may not get anywhere.
Cheers

bmillier · Aug 21, 2019

Hi Frank: Well, I've made some progress. First, I just noticed that if I only feed in 1 channel of the stereo pair, I still get the identical output on both channels. So, I think there is something wrong in the code which does the 2 accumulations of the complex multiplication with the filter mask.

Per my last post, I mentioned that I thought that a pointer might be off by 1/2 of the number of blocks. So, I substituted [(k+32) % nfor] in place of [k] in the following lines of code (here I'm using taps=8192) :
for (unsigned i = 0; i < 2 * partitionsize; i++)
{
accum[2 * i + 0] += fftout[k][2 * i + 0] * fmask[j][2 * i + 0] -
fftout[k][2 * i + 1] * fmask[j][2 * i + 1];

accum[2 * i + 1] += fftout[k][2 * i + 0] * fmask[j][2 * i + 1] +
fftout[k][2 * i + 1] * fmask[j][2 * i + 0];

When I did this- Success! The latency was reduced to 14.5 ms, the minimum delay time I saw in the formula from the Excel chart. Well, not completely a success- one channel had only 14.5 ms latency, while the other channel had about 200 ms. Especially easy to hear when I ran mono sound in from my iPhone- there was a distinct ~200 ms echo in one channel. While I was playing around with those modifications, is when I noticed the issue that I mentioned before about 2 identical output channels with only 1 channel being fed in. I think whatever cures that problem may also correct the 200 ms delay on the second channel too.
Cheers

DD4WH · Aug 21, 2019

bmillier said:
So, I substituted [(k+32) % nfor] in place of [k] in the following lines of code (here I'm using taps=8192)

Brian, thats excellent news! I am far from understanding the code by Warren Pratt from his wdsp lib, so playing around until the result is what we want is exactly the right way to find the optimal solution in this case

.

Maybe there is something wrong with the indices k and buffidx ? Maybe wrong initialization . . .

Also, maybe I misinterpreted the following lines in the original code:

1.) k = (k + idxmask) & idxmask;

2.) buffidx = (buffidx + 1) & idxmask;

BTW: idxmask = nfor - 1;

So, I interpret no. 1 as:

1.) k = k - 1;
if(k < 0)
{
k = nfor - 1;
}

and 2.)

buffidx = buffidx + 1;
if(buffidx >= nfor)
{
buffidx = 0;
}

But maybe I am wrong ???

I am travelling at the moment, so I have no way to test code for the next week.

bmillier · Aug 21, 2019

Hi Frank:
1) k= (k+idxmask) & idxmask - this can’t be right. Probably you meant
K= (k-1) & idxmask. - this expession is identical to your alternate if statement construct
2) is correct as stated, and your alternate code is identical
So, there is something else wrong.
In your SDR application, the two channels are in quadrature (90 deg out of phase) So, the complex math used in the FFT, iFFT matches the I,Q input signal. But, for stereo, this is not the case. So I am not sure your routines will work properly for stereo. I am seeing identical mono output with only 1 input channel, so this seems to affirm my doubts.
I’ll keep playing around. I had looked at one of the other citations, but not Pratt’s yet.

Cheers

DD4WH · Aug 21, 2019

hi Brian,
re: 1.) have a look at

https://github.com/g0orx/wdsp/blob/master/firmin.c

line 243: "k = (k + a->idxmask) & a->idxmask;"

I am not sure how I should interpret that (and maybe this causes the faults that we are observing !?)

Not sure about Stereo / I&Q differences. Will have to think about that

.

bmillier · Aug 21, 2019

Hi Frank:
I’ll have to look at his code.
Turns out I was wrong about 1)
“a->idxmask” refers to the idxmask variable in structure “a” . The expression (k+ idxmask) & idxmask seems strange but it does equal (k-1) & idxmask. Let’s assume idxmask is 63, which would be the proper mask for 64 blocks. Assume k =1. Then expression is (1+63) AND 63 =0. If K =2, then (2+63) AND 63 =1.
Even though his is a strange way to express it, his expression is exactly the same as the “if” construct you used.
So, it’s something different.
Cheers

bmillier · Aug 27, 2019

Hi Frank: I have a bit more info about this filter algorithm. I had earlier said that if I fed on a pulse signal to ONLY left (or right) channel, I got identical outputs on both channels. That's not exactly correct. The pulse shows up on both channels, with an identical latency of ~ 105 ms (@8192 taps). However, the output channel corresponding to the channel receiving the input pulse comes out un-distorted, and at the same amplitude. The other channel is also a pulse, but looks like it was differentiated. Here are the 2 scope captures, with the second one being the output from the channel that has no input signal pulse.

So, this routine can't be used to process 2 stereo channels. It only works properly when an I,Q pair are fed in,as the math is treating the two channels as Real, imaginary.
In thread #16, I said I had success in achieving low latency with the 8192 filter, by moving one of the pointers that I described. Here again, there is more to it. While I did see the pulse go thru the filter in 14.5 ms (for the output channel corresponding to the channel I fed the pulse into) , later, I expanded the scope readout, and found there were "Artifacts" later on in time. So, that is not the answer, or all of it at least.
While I was playing around with the code, I made an error and wiped out the T4s bootloader. I couldn't restore it at first, because the switch on my T4 is bad. However jumpering the PROG pin to ground worked. I put up a post about this issue just now.
Cheers

DD4WH · Oct 2, 2019

Hi Brian,

thanks for your measurements, they provide nice insight! And I would not have been able to do similar things!

I managed to fix my crackling noise by using a proper hardware setup on a proper groundplane, that seems to be the key to getting the signals from the T4 to the ADC/DAC and back

. I also used 100Ohm series resistors in the BCLK, LRCLK and MCLK lines to the ADC and a 100Ohms resistor in the MCLK line to the DAC.

Now the T4 runs with the ADC PCM1808 and the DACPCM5102a. I managed to use the ADC with 256ksps sample rate (it is rated for 96ksps), so there is potential for people interested in ultrasound (or in FM radio demodulation, where you need large bandwidth).

I have not managed to investigate further in the partitioned convolution, just used the sketch to push the T4.0 to its limits (processor load AND RAM ;-)). The T4.0 can run a partitioned convolution filter with a maximum of 16256 FIR filter taps at 48ksps sample rate (processor load: 70%, RAM usage: 79%).

Code:

/*
 * Real Time PARTITIONED BLOCK CONVOLUTION FILTERING (two channels)
 * 
 * uses Teensy 4.0 and external ADC / DAC connected on perf board with ground plane
 * 
 *  PCM5102A DAC module
    VCC = Vin
    3.3v = NC
    GND = GND
    FLT = GND
    SCL = 23 / MCLK via series 100 Ohm
    BCK = BCLK (21)
    DIN = TX (7)
    LCK = LRCLK (20)
    FMT = GND
    XMT = 3.3V (HIGH)
    
    PCM1808 ADC module:    
    FMT = GND
    MD1 = GND
    MD0 = GND
    GND = GND
    3.3V = 3.3V --> ADC needs both: 5V AND 3V3
    5V = VIN
    BCK = BCLK (21) via series 100 Ohm
    OUT = RX (8)
    LRC = LRCLK (20) via series 100 Ohm
    SCK = MCLK (23) via series 100 Ohm
    GND = GND
    3.3V = 3.3V
 *  
 * Frank DD4WH 2019-10-02
 * 
 * uses code from wdsp library by Warren Pratt
 * https://github.com/g0orx/wdsp/blob/master/firmin.c
 * 
 * in the public domain GNU GPL
 */

#include <Audio.h>
#include <Wire.h>
#include <SPI.h>
#include <arm_math.h>
#include <arm_const_structs.h> // in the Teensy 4.0 audio library, the ARM CMSIS DSP lib is already a newer version
#include <utility/imxrt_hw.h> // necessary for setting the sample rate, thanks FrankB !


//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// USER DEFINES
//uncomment for pass-thru
//#define PASSTHRU_AUDIO
// define your lower and upper frequency for the bandpass filter
//double FLoCut = 150.0;
//double FHiCut = 2700.0;
double FLoCut = 15.0;
double FHiCut = 19000.0;
float32_t audio_gain = 1.5;
// define your sample rate
//double SAMPLE_RATE = 256000;  // somewhere here is the limit for the PCM1808 ADC . . . [datasheet says 96ksps MAX !!! so please experiment on your own risk]
double SAMPLE_RATE = 48000;  
// define the number of FIR taps of your filter
//const int nc = 1024; // number of taps for the FIR filter
//const int nc = 2048; // number of taps for the FIR filter
//const int nc = 4096; // number of taps for the FIR filter
//const int nc = 8192; // number of taps for the FIR filter
const int nc = 16256; // seems to be MAX number of taps for the FIR filter, because I do not seem to be able to get the RAM settings right
// the latency of the filter is meant to be the same regardless of the number of taps for the filter
// partition size of 128 translates to a latency of 128/sample rate, ie. to 2.9msec with 44.1ksps
// however, measurements by bmillier contradict this . . .
const int partitionsize = 128; 
///////////////////////////////////////////////////////////////////////////////////////////////////////////////


#define DEBUG
#define FOURPI  (2.0 * TWO_PI)
#define SIXPI   (3.0 * TWO_PI)
#define BUFFER_SIZE 128
int32_t sum;
int idx_t = 0;
float32_t mean;
int16_t *sp_L;
int16_t *sp_R;
uint8_t FIR_filter_window = 1;
uint8_t first_block = 1; 
const uint32_t FFT_L = 256; 
uint32_t FFT_length = FFT_L;
const int nfor = nc / partitionsize; // number of partition blocks --> nfor = nc / partitionsize
double cplxcoeffs[nc * 2]; // this holds the initial complex coefficients for the filter BEFORE partitioning
float32_t maskgen[FFT_L * 2];
float32_t fmask[nfor][FFT_L * 2] DMAMEM; // otherwise it wont fit into the RAM
float32_t fftin[FFT_L * 2];
float32_t fftout[nfor][FFT_L * 2] DMAMEM;
float32_t accum[FFT_L * 2];

int buffidx = 0;
int k = 0;
//int idxmask = nfor - 1;

const uint32_t N_B = FFT_L / 2 / BUFFER_SIZE;
uint32_t N_BLOCKS = N_B;
float32_t float_buffer_L [BUFFER_SIZE * N_B];  // 2048 * 4 = 8kb
float32_t float_buffer_R [BUFFER_SIZE * N_B]; // 2048 * 4 = 8kb

float32_t last_sample_buffer_L [BUFFER_SIZE * N_B];  // 2048 * 4 = 8kb
float32_t last_sample_buffer_R [BUFFER_SIZE * N_B]; // 2048 * 4 = 8kb
// complex FFT with the new library CMSIS V4.5
const static arm_cfft_instance_f32 *S;
// complex iFFT with the new library CMSIS V4.5
const static arm_cfft_instance_f32 *iS;
// FFT instance for direct calculation of the filter mask
// from the impulse response of the FIR - the coefficients
const static arm_cfft_instance_f32 *maskS;

// this audio comes from the codec by I2S
AudioInputI2S            i2s_in;
AudioRecordQueue         Q_in_L;
AudioRecordQueue         Q_in_R;
AudioMixer4              mixleft;
AudioMixer4              mixright;
AudioPlayQueue           Q_out_L;
AudioPlayQueue           Q_out_R;
AudioOutputI2S           i2s_out;

AudioConnection          patchCord1(i2s_in, 0, Q_in_L, 0);
AudioConnection          patchCord2(i2s_in, 1, Q_in_R, 0);

AudioConnection          patchCord3(Q_out_L, 0, mixleft, 0);
AudioConnection          patchCord4(Q_out_R, 0, mixright, 0);
AudioConnection          patchCord9(mixleft, 0,  i2s_out, 1);
AudioConnection          patchCord10(mixright, 0, i2s_out, 0);


void setup() {
  Serial.begin(115200);
  delay(100);

  AudioMemory(10); 
  delay(100);

  /****************************************************************************************
     Audio Setup
  ****************************************************************************************/
  mixleft.gain(0, 1.0);
  mixright.gain(0, 1.0);

  setI2SFreq(SAMPLE_RATE);

  /****************************************************************************************
     set filter bandwidth
  ****************************************************************************************/
  // this routine does all the magic of calculating the FIR coeffs
  //calc_cplx_FIR_coeffs_interleaved (cplxcoeffs, nc, FLoCut, FHiCut, SAMPLE_RATE);
  fir_bandpass (cplxcoeffs, nc, FLoCut, FHiCut, SAMPLE_RATE, 0, 1, 1.0);
//  fir_bandpass (cplxcoeffs, nc, FLoCut, FHiCut, SAMPLE_RATE, 0, 0, 1.0);

  /****************************************************************************************
     init complex FFTs
  ****************************************************************************************/
  switch (FFT_length)
  {
    case 256:
      S = &arm_cfft_sR_f32_len256;
      iS = &arm_cfft_sR_f32_len256;
      maskS = &arm_cfft_sR_f32_len256;
      break;
  }

  /****************************************************************************************
     Calculate the FFT of the FIR filter coefficients once to produce the FIR filter mask
  ****************************************************************************************/
    init_partitioned_filter_masks();
    
  /****************************************************************************************
     begin to queue the audio from the audio library
  ****************************************************************************************/
  delay(100);
  Q_in_L.begin();
  Q_in_R.begin();

// this makes it even worse at higher sample rates
/*
  IOMUXC_SW_PAD_CTL_PAD_GPIO_AD_B1_09 = 0x8; //0x8; // MCLK, low speed, drive strength at R0 (150 ohm).
  IOMUXC_SW_PAD_CTL_PAD_GPIO_AD_B1_10 = 0x8; // BCLK
  IOMUXC_SW_PAD_CTL_PAD_GPIO_AD_B1_11 = 0x8; // LRCLK
  IOMUXC_SW_PAD_CTL_PAD_GPIO_B1_01 = 0x8; // OUT1A
*/
} // END OF SETUP


void loop() {
  elapsedMicros usec = 0;
  // are there at least N_BLOCKS buffers in each channel available ?
    if (Q_in_L.available() > N_BLOCKS + 0 && Q_in_R.available() > N_BLOCKS + 0)
    {

      // get audio samples from the audio  buffers and convert them to float
      for (unsigned i = 0; i < N_BLOCKS; i++)
      {
        sp_L = Q_in_L.readBuffer();
        sp_R = Q_in_R.readBuffer();

        // convert to float one buffer_size
        // float_buffer samples are now standardized from > -1.0 to < 1.0
        arm_q15_to_float (sp_L, &float_buffer_L[BUFFER_SIZE * i], BUFFER_SIZE); // convert int_buffer to float 32bit
        arm_q15_to_float (sp_R, &float_buffer_R[BUFFER_SIZE * i], BUFFER_SIZE); // convert int_buffer to float 32bit
        Q_in_L.freeBuffer();
        Q_in_R.freeBuffer();
      }
 
      /**********************************************************************************
          Digital convolution
       **********************************************************************************/
      //  basis for this was Lyons, R. (2011): Understanding Digital Processing.
      //  "Fast FIR Filtering using the FFT", pages 688 - 694
      //  numbers for the steps taken from that source
      //  Method used here: overlap-and-save

      // ONLY FOR the VERY FIRST FFT: fill first samples with zeros
      if (first_block) // fill real & imaginaries with zeros for the first BLOCKSIZE samples
      {
        for (unsigned i = 0; i < partitionsize * 4; i++)
        {
          fftin[i] = 0.0;
        }
        first_block = 0;
      }
      else
      {  // HERE IT STARTS for all other instances
        // fill FFT_buffer with last events audio samples
        for (unsigned i = 0; i < partitionsize; i++)
        {
          fftin[i * 2] = last_sample_buffer_L[i]; // real
          fftin[i * 2 + 1] = last_sample_buffer_R[i]; // imaginary
        }
      }
      // copy recent samples to last_sample_buffer for next time!
      for (unsigned i = 0; i < partitionsize; i++)
      {
        last_sample_buffer_L [i] = float_buffer_L[i];
        last_sample_buffer_R [i] = float_buffer_R[i];
      }

      // now fill recent audio samples into FFT_buffer (left channel: re, right channel: im)
      for (unsigned i = 0; i < partitionsize; i++)
      {
        fftin[FFT_length + i * 2] = float_buffer_L[i]; // real
        fftin[FFT_length + i * 2 + 1] = float_buffer_R[i]; // imaginary
      }
      
      /**********************************************************************************
          Complex Forward FFT
       **********************************************************************************/
      // calculation is performed in-place the FFT_buffer [re, im, re, im, re, im . . .]
      arm_cfft_f32(S, fftin, 0, 1);
      for(unsigned i = 0; i < partitionsize * 4; i++)
      {
          fftout[buffidx][i] = fftin[i];
      }

      /**********************************************************************************
          Complex multiplication with filter mask (precalculated coefficients subjected to an FFT)
          this is taken from wdsp library by Warren Pratt firmin.c
       **********************************************************************************/
      k = buffidx;

      for(unsigned i = 0; i < partitionsize * 4; i++)
      {
          accum[i] = 0.0;
      }
      
      for(unsigned j = 0; j < nfor; j++)
      { 
          for(unsigned i = 0; i < 2 * partitionsize; i++)
          {
              accum[2 * i + 0] += fftout[k][2 * i + 0] * fmask[j][2 * i + 0] -
                                  fftout[k][2 * i + 1] * fmask[j][2 * i + 1];
              
              accum[2 * i + 1] += fftout[k][2 * i + 0] * fmask[j][2 * i + 1] +
                                  fftout[k][2 * i + 1] * fmask[j][2 * i + 0]; 
          }
          k = k - 1;
          if(k < 0)
          {
            k = nfor - 1;
          } 
//          k = (k + idxmask) & idxmask;
      } // end nfor loop

      buffidx = buffidx + 1;
      if(buffidx >= nfor)
      {
          buffidx = 0;    
      } 
//      buffidx = (buffidx + 1) & idxmask;            
      /**********************************************************************************
          Complex inverse FFT
       **********************************************************************************/
      arm_cfft_f32(iS, accum, 1, 1);

      /**********************************************************************************
          Overlap and save algorithm, which simply means yóu take only the right part of the buffer and discard the left part
       **********************************************************************************/
/*        for (unsigned i = 0; i < FFT_length / 2; i++)
        {
          float_buffer_L[i] = accum[FFT_length + i * 2];
          float_buffer_R[i] = accum[FFT_length + i * 2 + 1];
        }
*/
// somehow it seems it is reversed: I discard the right part and take the left part . . .
        for (unsigned i = 0; i < partitionsize; i++)
        {
          //float_buffer_L[i] = accum[partitionsize * 2 + i * 2 + 0];
          //float_buffer_R[i] = accum[partitionsize * 2 + i * 2 + 1];
          float_buffer_L[i] = accum[i * 2 + 0] * audio_gain;
          float_buffer_R[i] = accum[i * 2 + 1] * audio_gain;
        }
      
       /**********************************************************************************
          Demodulation / manipulation / do whatever you want 
       **********************************************************************************/
      //    at this time, just put filtered audio (interleaved format, overlap & save) into left and right channel       

       /**********************************************************************
          CONVERT TO INTEGER AND PLAY AUDIO - Push audio into I2S audio chain
       **********************************************************************/
      for (int i = 0; i < N_BLOCKS; i++)
        {
          sp_L = Q_out_L.getBuffer();    
          sp_R = Q_out_R.getBuffer();
          arm_float_to_q15 (&float_buffer_L[BUFFER_SIZE * i], sp_L, BUFFER_SIZE); 
          arm_float_to_q15 (&float_buffer_R[BUFFER_SIZE * i], sp_R, BUFFER_SIZE);
          Q_out_L.playBuffer(); // play it !  
          Q_out_R.playBuffer(); // play it !
        }

       /**********************************************************************************
          PRINT ROUTINE FOR ELAPSED MICROSECONDS
       **********************************************************************************/
#ifdef DEBUG
      sum = sum + usec;
      idx_t++;
      if (idx_t > 40) {
        mean = sum / idx_t;
        if (mean / 29.00 / N_BLOCKS * SAMPLE_RATE / AUDIO_SAMPLE_RATE_EXACT < 100.0)
        {
          Serial.print("processor load:  ");
          Serial.print (mean / 29.00 / N_BLOCKS * SAMPLE_RATE / AUDIO_SAMPLE_RATE_EXACT);
          Serial.println("%");
        }
        else
        {
          Serial.println("100%");
        }
        Serial.print (mean);
        Serial.print (" microsec for ");
        Serial.print (N_BLOCKS);
        Serial.print ("  stereo blocks    ");
        Serial.print("FFT-length = "); Serial.print(FFT_length);
        Serial.print(";   FIR filter length = "); Serial.println(nc);
        Serial.print("k = "); Serial.println(k);
        Serial.print("buffidx = "); Serial.println(buffidx);
        idx_t = 0;
        sum = 0;
      }
#endif
    } // end of audio process loop
    
      /**********************************************************************************
          Add button check etc. here
       **********************************************************************************/

}

//////////////////////////////////////////////////////////////////////
//  Call to setup complex filter parameters [can process two channels at the same time]
//  The two channels could be I and Q channel in a Software Defined Radio
// SampleRate in Hz
// FLowcut is low cutoff frequency of filter in Hz
// FHicut is high cutoff frequency of filter in Hz
// Offset is the CW tone offset frequency
// cutoff frequencies range from -SampleRate/2 to +SampleRate/2
//  HiCut must be greater than LowCut
//    example to make 2700Hz USB filter:
//  SetupParameters( 100, 2800, 0, 48000);
//////////////////////////////////////////////////////////////////////

void  calc_cplx_FIR_coeffs_interleaved (double *impulse,  int numCoeffs, double FLoCut, double FHiCut, double SampleRate)
//void calc_cplx_FIR_coeffs (double * coeffs_I, double * coeffs_Q, int numCoeffs, double FLoCut, double FHiCut, double SampleRate)
// pointer to coefficients variable, no. of coefficients to calculate, frequency where it happens, stopband attenuation in dB,
// filter type, half-filter bandwidth (only for bandpass and notch)
{
  //calculate some normalized filter parameters
  double nFL = FLoCut / SampleRate;
  double nFH = FHiCut / SampleRate;
  double nFc = (nFH - nFL) / 2.0; //prototype LP filter cutoff
  double nFs = PI * (nFH + nFL); //2 PI times required frequency shift (FHiCut+FLoCut)/2
  double fCenter = 0.5 * (double)(numCoeffs - 1); //floating point center index of FIR filter

  for (int i = 0; i < numCoeffs * 2; i++) //zero pad entire coefficient buffer
  {
    impulse[i] = 0.0;
  }

  //create LP FIR windowed sinc, sin(x)/x complex LP filter coefficients
  for (int i = 0; i < numCoeffs; i++)
  {
    double x = (float32_t)i - fCenter;
    double z;
    if ( abs((double)i - fCenter) < 0.01) //deal with odd size filter singularity where sin(0)/0==1
      z = 2.0 * nFc;
    else
      switch (FIR_filter_window) {
        case 1:    // 4-term Blackman-Harris --> this is what Power SDR uses
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              (0.35875 - 0.48829 * cos( (TWO_PI * i) / (numCoeffs - 1) )
               + 0.14128 * cos( (FOURPI * i) / (numCoeffs - 1) )
               - 0.01168 * cos( (SIXPI * i) / (numCoeffs - 1) ) );
          break;
        case 2:
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              (0.355768 - 0.487396 * cos( (TWO_PI * i) / (numCoeffs - 1) )
               + 0.144232 * cos( (FOURPI * i) / (numCoeffs - 1) )
               - 0.012604 * cos( (SIXPI * i) / (numCoeffs - 1) ) );
          break;
        case 3: // cosine
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              cos((PI * (float32_t)i) / (numCoeffs - 1));
          break;
        case 4: // Hann
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              0.5 * (double)(1.0 - (cos(PI * 2 * (double)i / (double)(numCoeffs - 1))));
          break;
        default: // Blackman-Nuttall window
          z = (double)sin(TWO_PI * x * nFc) / (PI * x) *
              (0.3635819
               - 0.4891775 * cos( (TWO_PI * i) / (numCoeffs - 1) )
               + 0.1365995 * cos( (FOURPI * i) / (numCoeffs - 1) )
               - 0.0106411 * cos( (SIXPI * i) / (numCoeffs - 1) ) );
          break;
      }
    //shift lowpass filter coefficients in frequency by (hicut+lowcut)/2 to form bandpass filter anywhere in range
    impulse[i * 2 + 0] = z * cos(nFs * x);
    impulse[i * 2 + 1] = z * sin(nFs * x);
    //coeffs_I[i]  =  z * cos(nFs * x);
    //coeffs_Q[i] = z * sin(nFs * x);
  }
}


void init_partitioned_filter_masks()
{
#ifndef PASSTHRU_AUDIO
    for(unsigned j = 0; j < nfor;j++)
    {
      // fill with zeroes
      for (unsigned i = 0; i < partitionsize * 4; i++)
      {
          maskgen[i] = 0.0;  
      }
      // take part of impulse response and fill into maskgen
      for (unsigned i = 0; i < partitionsize * 2; i++)
      {
          maskgen[i + partitionsize * 2] = cplxcoeffs[i + j * partitionsize * 2];  
      }
      // perform complex FFT on maskgen
      arm_cfft_f32(maskS, maskgen, 0, 1);
      // fill into fmask array
      for (unsigned i = 0; i < partitionsize * 4; i++)
      {
          fmask[j][i] = maskgen[i];  
      }    
    }
    
#else
// passthru
    
    for(unsigned j = 0; j < nfor;j++)
    {
      // fill with zeroes
      for (unsigned i = 0; i < partitionsize * 4; i++)
      {
          maskgen[i] = 0.0;  
      }
      if(j==0) maskgen[0] = 1.0; 
      arm_cfft_f32(maskS, maskgen, 0, 1);      
      for (unsigned i = 0; i < partitionsize * 4; i++)
      {
          fmask[j][i] = maskgen[i];  
      } 
    }
#endif
}


// taken from wdsp library by Warren Pratt
void fir_bandpass (double * impulse, int N, double f_low, double f_high, double samplerate, int wintype, int rtype, double scale)
{
  double ft = (f_high - f_low) / (2.0 * samplerate);
  double ft_rad = TWO_PI * ft;
  double w_osc = PI * (f_high + f_low) / samplerate;
  int i, j;
  double m = 0.5 * (double)(N - 1);
  double delta = PI / m;
  double cosphi;
  double posi, posj;
  double sinc, window, coef;

  if (N & 1)
  {
    switch (rtype)
    {
    case 0:
      impulse[N >> 1] = scale * 2.0 * ft;
      break;
    case 1:
      impulse[N - 1] = scale * 2.0 * ft;
      impulse[  N  ] = 0.0;
      break;
    }
  }
  for (i = (N + 1) / 2, j = N / 2 - 1; i < N; i++, j--)
  {
    posi = (double)i - m;
    posj = (double)j - m;
    sinc = sin (ft_rad * posi) / (PI * posi);
    switch (wintype)
    {
    case 0: // Blackman-Harris 4-term
      cosphi = cos (delta * i);
      window  =             + 0.21747
          + cosphi *  ( - 0.45325
          + cosphi *  ( + 0.28256
          + cosphi *  ( - 0.04672 )));
      break;
    case 1: // Blackman-Harris 7-term
      cosphi = cos (delta * i);
      window  =       + 6.3964424114390378e-02
          + cosphi *  ( - 2.3993864599352804e-01
          + cosphi *  ( + 3.5015956323820469e-01
          + cosphi *  ( - 2.4774111897080783e-01
          + cosphi *  ( + 8.5438256055858031e-02
          + cosphi *  ( - 1.2320203369293225e-02
          + cosphi *  ( + 4.3778825791773474e-04 ))))));
      break;
    }
    coef = scale * sinc * window;
    switch (rtype)
    {
    case 0:
      impulse[i] = + coef * cos (posi * w_osc);
      impulse[j] = + coef * cos (posj * w_osc);
      break;
    case 1:
      impulse[2 * i + 0] = + coef * cos (posi * w_osc);
      impulse[2 * i + 1] = - coef * sin (posi * w_osc);
      impulse[2 * j + 0] = + coef * cos (posj * w_osc);
      impulse[2 * j + 1] = - coef * sin (posj * w_osc);
      break;
    }
  }
}

void setI2SFreq(int freq) {
  // thanks FrankB !
  // PLL between 27*24 = 648MHz und 54*24=1296MHz
  int n1 = 4; //SAI prescaler 4 => (n1*n2) = multiple of 4
  int n2 = 1 + (24000000 * 27) / (freq * 256 * n1);
  double C = ((double)freq * 256 * n1 * n2) / 24000000;
  int c0 = C;
  int c2 = 10000;
  int c1 = C * c2 - (c0 * c2);
  set_audioClock(c0, c1, c2, true);
  CCM_CS1CDR = (CCM_CS1CDR & ~(CCM_CS1CDR_SAI1_CLK_PRED_MASK | CCM_CS1CDR_SAI1_CLK_PODF_MASK))
       | CCM_CS1CDR_SAI1_CLK_PRED(n1-1) // &0x07
       | CCM_CS1CDR_SAI1_CLK_PODF(n2-1); // &0x3f 
//Serial.printf("SetI2SFreq(%d)\n",freq);
}

bmillier · Oct 2, 2019

Hi Frank: You are welcome! Some on the forum are really good at math, some at programming. I'm good at circuit design, and I have a 'scope!
Good to see that you solved the crackling problem. I see you used a Vector board. I use these for all of my projects that don't end up on a custom PCB.
I like the Vector 8001 as it has wide GND& VCC planes and 3 pads per hole. I like wirewrap wire for the wiring as it is smaller and easier to solder.

Like you, I haven't further pursued the partitioned convolution. Looks like your interests are in line with high SR and high order filters, and what you have seems to handle that fine. I was more interested in low latency, and I don't know how to accomplish that yet.
I am currently working on a T4 wavetable synthesizer (music). There is a good Teensy Audio Lib object for this, but it only works on a fixed wavetable (voice) in FLASH. I've developed it further to work from SRAM, with an SD card to load the desired voices on demand. Took quite a while to figure out how to do this, but its working nicely now. I see from your code that you are using DMAMEM section to access RAM beyond 512K . Just yesterday KurtE responded to a post of mine, and pointed out this "trick".
It's hard to beat the Teensy forum for tech. issues !!
regards

SamR · Oct 9, 2019

Thanks Brian,Frank for your incredible work on that.
Frank: I'll be more than happy to see your code as a Teensy audio library filter!!

hoho · Oct 20, 2019

DD4WH said:
I managed to fix my crackling noise by using a proper hardware setup on a proper groundplane, that seems to be the key to getting the signals from the T4 to the ADC/DAC and back . I also used 100Ohm series resistors in the BCLK, LRCLK and MCLK lines to the ADC and a 100Ohms resistor in the MCLK line to the DAC.

I am wondering why did you use LRCLK and BCLK resistors for ADC only? I'm resetting up the same thing with T4 (had it with 3.6). Would it make sense to have the resistors on all three LRCLK, BCLK and MCLK for both ADC and DAC?

Fast Convolution Filtering with Teensy 4.0 and audio board

Well-known member

Senior Member+

Well-known member

Well-known member

Well-known member

Senior Member+

Well-known member

Senior Member+

Well-known member

Well-known member

Well-known member

Well-known member

Well-known member

Well-known member

Well-known member

Well-known member

Well-known member

Well-known member

Well-known member

Well-known member

Well-known member

Well-known member

Well-known member

New member

Well-known member