analyze_fft1024, duffs optimization

Status
Not open for further replies.

duff

Well-known member
this optimization drops the Audio Libraries fft 1024 cpu usage by 15%, I wanted to see if anyone would try it out and let me know if works for them? The tests I've done the outputs have been the same as the stock fft but I haven't tried a lot of complex signals.

download here: https://github.com/duff2013/analyze_fft1024_duffs_optimization

After researching a bunch I discovered that you can compute N size fft using four N/4 fft's by using a few tricks. Basically you use both the real and imaginary parts of the fft to pack two data sets into a 256 pt fft (2 for 1). Do another ( 2 for 1) and we just computed four 256 pt ffts and since the input data is interweaved a certain way you can combine all four fft's to get a 1024 pt fft. This means we only do two 256 pt fft's compared to one 1024 pt fft. The speed saving comes from the aforementioned algorithm.

Oh forgot to say this is a Audio Object library so install it that way, it works just like the stock fft Object.
 
Last edited:
Here is a sketch if you have downloaded and installed the library where you can switch between the stock analyze_fft1024 and my optimized version. Type 's' for regular fft and type 'o' for optimized with no line ending. This sketch sweeps the frequency between 10 and 1200 Hz. You can see that the outputs are identical in real time. Make sure you maximize your serial monitor.
Code:
#include <SerialFlash.h>
#include <Audio.h>
#include <Wire.h>
#include <SPI.h>
#include <SD.h>
#include <analyze_fft1024_duffs_optimization.h>
//---------------------------------------------------------------------------------------
AudioAnalyzeFFT1024_Duffs_Optimization  optfft;
AudioAnalyzeFFT1024                     stdfft;
AudioSynthToneSweep                     sweep;
AudioOutputAnalog                       dac;
AudioMixer4                             mixer;
//---------------------------------------------------------------------------------------
AudioConnection patchCord0(sweep, 0, mixer, 0);
AudioConnection patchCord1(mixer, 0, optfft, 0);
AudioConnection patchCord2(mixer, 0, stdfft, 0);
AudioConnection patchCord3(mixer, 0, dac, 0);
//---------------------------------------------------------------------------------------
IntervalTimer playNoteTimer;


volatile bool up_or_down;


bool which_fft;


const float t_ampx = 0.8;
const int t_lox = 40;
const int t_hix = 1200;
// Length of time for the sweep in seconds
const float t_timex = 5;


void playNote(void) {
  if (!sweep.isPlaying()) {
    if (up_or_down == false) {
      sweep.play(t_ampx, t_lox, t_hix, t_timex);
      //Serial.println("AudioSynthToneSweep up - begin");
      up_or_down = true;
    }
    // and now reverse the sweep
    else if (up_or_down == true) {
      sweep.play(t_ampx, t_hix, t_lox, t_timex);
      //Serial.println("AudioSynthToneSweep down - begin");
      up_or_down = false;
    }
  }
  digitalWriteFast(LED_BUILTIN, !digitalReadFast(LED_BUILTIN));
}
//---------------------------------------------------------------------------------------
void setup() {
  while (!Serial);
  delay(100);
  up_or_down = false;
  which_fft  = false;
  AudioMemory(30);
  optfft.windowFunction(AudioWindowHanning1024);
  stdfft.windowFunction(AudioWindowHanning1024);
  pinMode(LED_BUILTIN, OUTPUT);
  // Audio library isr allways gets priority
  playNoteTimer.priority(144);
  playNoteTimer.begin(playNote, 100000);
}


void loop() {
  
  if (Serial.available()) {
    char c = Serial.read();
    if (c == 'o') which_fft = false;
    if (c == 's') which_fft = true;
  }


  float n;
  int i;


    float pusageOpt = optfft.processorUsageMax();
    float pusageStd = stdfft.processorUsageMax();
    
  if (optfft.available() && which_fft == false) {
    Serial.printf("OPT FFT USAGE: %.2f |\t\t:", pusageOpt);
    for (i = 0; i < 30; i++) {
      n = optfft.read(i);
      if (n >= 0.01) {
        Serial.print(n);
        Serial.print(" ");
      } else Serial.print("  -  "); // don't print "0.00"
    }
    Serial.println(":");
  }


  if (stdfft.available() && which_fft == true) {
    Serial.printf("STD FFT USAGE: %.2f |\t\t:", pusageStd);
    for (i = 0; i < 30; i++) {
      n = stdfft.read(i);
      if (n >= 0.01) {
        Serial.print(n);
        Serial.print(" ");
      } else Serial.print("  -  "); // don't print "0.00"
    }
    Serial.println(":");
  }
}
 
Last edited:
Here is another sketch to test if the outputs from the standard fft and my optimized version are giving the same results. It uses the AudioSynthKarplusStrong Object fed to both fft's. You can switch back and forth between the two and change the AudioSynthKarplusStrong frequency by Serial Commands like so..

  • Example1: Sending the string -> "opt 440.0" with a NewLine appended by the serial monitor.
    • will show the optimized fft output and set AudioSynthKarplusStrong frequency to 440.0 Hz
  • Example2: Sending the string -> "std 440.0" with a NewLine appended by the serial monitor.
    • will show the standard fft output and set AudioSynthKarplusStrong frequency to 440.0 Hz

This sketch also prints the Max Processor Usage for whatever version of the fft you have selected to print to the serial monitor. Also you don't need any special hardware just a teensy to see the results.
Code:
#include <SerialFlash.h>
#include <Audio.h>
#include <Wire.h>
#include <SPI.h>
#include <SD.h>
#include <analyze_fft1024_duffs_optimization.h>
//---------------------------------------------------------------------------------------
AudioAnalyzeFFT1024_Duffs_Optimization  optfft;
AudioAnalyzeFFT1024                     stdfft;
AudioSynthKarplusStrong                 string;
AudioOutputAnalog                       dac;
AudioMixer4                             mixer;
//---------------------------------------------------------------------------------------
AudioConnection patchCord0(string,  0, mixer,  0);
AudioConnection patchCord1(mixer,   0, optfft, 0);
AudioConnection patchCord2(mixer,   0, stdfft, 0);
AudioConnection patchCord3(mixer,   0, dac,    0);
//---------------------------------------------------------------------------------------
IntervalTimer playNoteTimer;


volatile float frequency;


String input;


enum fft_version_t { OPTIMIZED, STANDARD } FFT_VERSION;


void playNote(void) {
  string.noteOn(frequency, .8);
  digitalWriteFast(LED_BUILTIN, !digitalReadFast(LED_BUILTIN));
}
//---------------------------------------------------------------------------------------
void setup() {
  FFT_VERSION = OPTIMIZED;
  frequency = 196.0;
  AudioMemory(30);
  while (!Serial);
  delay(120);
  optfft.windowFunction(AudioWindowHanning1024);
  stdfft.windowFunction(AudioWindowHanning1024);
  pinMode(LED_BUILTIN, OUTPUT);
  // Audio library isr allways gets priority
  playNoteTimer.priority(144);
  playNoteTimer.begin(playNote, 3000000);
}


void loop() {


  if (Serial.available()) {
    char c = Serial.read();
    input += c;
    if (input.endsWith("\n")) {


      if (input.startsWith("opt")) {
        input = input.substring(3);
        float f = input.toFloat();
        if (f < 30.00) f = 30.00;
        if (f > 800.00) f = 800.00;
        Serial.print("opt ");
        Serial.println(f);
        frequency = f;
        FFT_VERSION = OPTIMIZED;
      }


      if (input.startsWith("std")) {
        input = input.substring(3);
        float f = input.toFloat();
        if (f < 30.00) f = 30.00;
        if (f > 800.00) f = 800.00;
        Serial.print("std ");
        Serial.println(f);
        frequency = f;
        FFT_VERSION = STANDARD;
      }
      input = "";
    }
  }


  float n;
  int i;


  float pusageOpt = optfft.processorUsageMax();
  float pusageStd = stdfft.processorUsageMax();


  if (optfft.available() && FFT_VERSION == OPTIMIZED) {
    Serial.printf("OPT FFT USAGE: %.2f |\t\t:", pusageOpt);
    for (i = 0; i < 30; i++) {
      n = optfft.read(i);
      if (n >= 0.01) {
        Serial.print(n);
        Serial.print(" ");
      } else Serial.print("  -  "); // don't print "0.00"
    }
    Serial.println(":");
  }




  if (stdfft.available() && FFT_VERSION == STANDARD) {
    Serial.printf("STD FFT USAGE: %.2f |\t\t:", pusageStd);
    for (i = 0; i < 30; i++) {
      n = stdfft.read(i);
      if (n >= 0.01) {
        Serial.print(n);
        Serial.print(" ");
      } else Serial.print("  -  "); // don't print "0.00"
    }
    Serial.println(":");
  }
}
 
The stock audio library uses CMSIS-DSP for FFT algorithm, which is old and has precision issues. CMSIS has newer, faster optimized versions (including ones optimized for real-only time domain inputs) that also fix the precision issues. I'm curious on how your performance compares to those. Several members have apparently already switched to the latest CMSIS-DSP library and I've been looking to do the same my self.

I'm working on a project that requires fast FFTs.
 
The stock audio library uses CMSIS-DSP for FFT algorithm, which is old and has precision issues. CMSIS has newer, faster optimized versions (including ones optimized for real-only time domain inputs) that also fix the precision issues. I'm curious on how your performance compares to those. Several members have apparently already switched to the latest CMSIS-DSP library and I've been looking to do the same my self.

I'm working on a project that requires fast FFTs.
I have used CMSIS-DSP version 1.5.1 and it does speed up the FFT but not by 15% like this does (Teensy 3.2 96 MHz, Fast compile option). If paul signals he is going to update the CMSIS version then I'll see how this does with the newer CMSIS, this optimization should work just as well with the new cmsis-dsp as then one we are using currently.
 
So with anything there are tradeoffs and one thing about this optimization is it eats up some extra ram and flash. I have a few ideas on how to reuse some buffers to reduce the ram footprint and also using the combine fft's twiddle coefficients symmetry to cut the twiddle factors down to a fourth of its current size. Eventually I'll work on the IFFT part of this also.

Another thing I'm looking at and I know Paul alluded to this in another thread is to generate twiddle factors for these CMSIS fft's so we don't have to include the 12288 bytes of twiddle factors, just include enough for each FFT in use for the audio library. This is separate from the tables above that I was mentioning.
 
@Duff - Good Luck - a shame @KPC hasn't been around
Thanks! I have some of his code for doing split radix-2 optimizations for the 1024 fft but the values are off from the stock fft in my tests of his implementation. His algorithm is similar to what I did I think? but I tried to keep as much of the original code as possible instead of total rewrite. I didn't really use his code as a guide either because I found it hard to follow but his performance spec are great but I can't tell if he added extra latency and if he did 50% overlap.
 
So I was able to cut the flash size of the coefficients down by a fourth from before. Still trying to figure out how to reduce RAM usage without sacrificing performance but I went ahead and updated my Teensyduino to CMSIS-DSP version 1.4.5 and I'm not looking back. Now I get only 25% MAX CPU Usage for a Teensy 3.2 at 120 MHz and since this is now a standard overclock option. My optimization with the CMSIS-DSP update now uses less flash than the stock fft because of how the new CMSIS-DSP uses its twiddle factors. My algorithm is only a pair of 256 pt FFT's instead of 1024 pt. Now I can start work on an IFFT optimization and then I'll integrate my Pitch Shifter (Phase Vocoder) I coded up into the Audio library.
 
I'm using the STFT (Short Time Fourier Transform) technique currently for pitch shifting with at least a 75% overlap. Its not optimized at all and very slow so it's only good for post processing which is kinda stupid for using with a microcontroller I would guess.
 
Any idea how this compares with KPC's optimization?

https://forum.pjrc.com/threads/27905-1024-point-FFT-30-more-efficient-(experimental)

This one is on a long list of audio library improvements I haven't had time to evaluate and merge. Please understand I'm a little reluctant to merge when I can't put serious time into evaluating this. Some other stuff I merged without much testing has turned out to have multiple issues. The reverb object comes to mind....
 
On the CMSIS-DSP upgrade, I'm still debating whether adding this to the same release as a C++11 to C++14 change is "too much" at once.

I definitely do want to update to a recent CMSIS-DSP library. The clearer Apache license is also a compelling reason to do the upgrade.
 
Any idea how this compares with KPC's optimization?https://forum.pjrc.com/threads/27905-1024-point-FFT-30-more-efficient-(experimental)This one is on a long list of audio library improvements I haven't had time to evaluate and merge. Please understand I'm a little reluctant to merge when I can't put serious time into evaluating this. Some other stuff I merged without much testing has turned out to have multiple issues. The reverb object comes to mind....
Totally understand your reluctance on merging this, I'll go back test his code again, the tests I did on his code seemed that his values didn't match up exactly with the stock fft but that could be my testing procedure. The test I did was to use his code with Derek Rowell's fft interpolation code to get the fundamental and it did not give right fundamental while the stock cmsis fft did. I also wonder if he added some latency to the output by spreading out the fft calculation. I'll report back on this.
I definitely do want to update to a recent CMSIS-DSP library. The clearer Apache license is also a compelling reason to do the upgrade.
I'm glad this in the pipeline, I think you will be quite happy with the improvements they made. The nice thing is they seem to have all the legacy code still there so things should still "just work" as before.
On the CMSIS-DSP upgrade, I'm still debating whether adding this to the same release as a C++11 to C++14 change is "too much" at once.
I have been using C++14 since you released it and have not encountered any problems once you fixed the min max functions.
 
Cool, I'm interested in playing with the Pitch Shifter. Which algorithm did you use?
I hacked S. M. Bernsee's algorithm for pitch shifting here -> https://github.com/duff2013/pitchShifty

You can find the info and download the actual code I used here -> http://blogs.zynaptiq.com/bernsee/pitch-shifting-using-the-ft/

This example sketch is very basic, all it does is take a sin wave at certain frequency shifts it up some many semitones. Use Arduino's Serial Plotter to visualize the input and output's. While still light years away from using it in the current form in the Audio Library, it does work. I compiled his xcode c++ project which uses the exact same algorithm in which he supplies a vocal sound file which sounds pretty good compared to the pitch shifters I've used.
 
Cool. I'm very interested in pitch shifting algorithms. I play bass guitar in a band and provide a lot of the rhythm guitars by using pitch shifting to generate octave up and various intervals to create the chords. Obviously I mostly interested in real-time applications.

I currently use a nanoPog for octaves and it's pretty damn good, suffering almost none of the common vocoder issues. But can't do intervals and has no MIDI control so I use software VSTs for intervals/harmonies. I bought a Hog2 specifically to get the ability to use MIDI and generate multiple voices at any octave and it's a $500 piece of flaming dog dung. By that I mean it suffers from all the common phase vocoder issues. Considering the enormous price tag I was pretty darn umimpressed.

I'm hoping to use the Teensy 3.6 to do some basic prototyping with generating a single voice, but I think I'll need to go to an FPGA for the full blown, multi-voice generation I require.

Here's a short video demonstrating the use of pitch-shifting as described above. I'm using a bass to generate all the guitar tracks.
https://www.youtube.com/watch?v=DylOstql8n8
 
Last edited:
I'm hoping to use the Teensy 3.6 to do some basic prototyping with generating a single voice, but I think I'll need to go to an FPGA for the full blown, multi-voice generation I require.
I would like to see if Teensy 3.2 could handle a single vocoder channel Audio Library Object. What I posted is very far from that given it uses floating point and requires substantial ram. I have been thinking about how it could be done but nothing concrete as of yet, thats why posted it so other can maybe take wack at it. One limiting factor is going to be dealing with the oversample in context of how the Audio Library uses blocks of data. The best you could currently get is 8 with block sizes of 128 samples. Another would be quality of the output using q15 data. This might be pipe dream after all with the teensy and Audio Library.
 
Hi Duff,

this sounds very exciting and interesting!

I tried to set up a realtime system with Teensy 3.5 & audioboard with electret mic in order to pitch shift the incoming audio, but it does not work, because the code seems to hang itself up after the first two rounds of the pitch shifting algorithm.

Code:
// S. M. Bernsee pitch shift algorithm for teensy
// Use Serial Plotter
#include <Audio.h>

/*******************************************
 *           editable items
 ******************************************/
#define FFT_SIZE    256
#define OVER_SAMPLE 16

AudioInputI2S            i2s_in; 

AudioRecordQueue         IN;    

AudioPlayQueue           OUT; 

AudioOutputI2S           i2s_out;   

AudioConnection          patchCord1(i2s_in, 0, IN, 0);

AudioConnection          patchCord9(OUT, 0,  i2s_out, 0);
AudioConnection          patchCord10(OUT, 0, i2s_out, 1);

AudioControlSGTL5000     sgtl5000;  


//const int myInput = AUDIO_INPUT_LINEIN;
const int myInput = AUDIO_INPUT_MIC;
int8_t mic_gain = 40; //

// input waveform
const float frequency  = 440;
const float sampleRate = 44100;
const float amplitude  = 1; 
/******************************************/
// phase accumulator
float phi         = 0;
// phase increment per sample
const float delta = 2 * PI * frequency / sampleRate;

  
  // semitones to shift, 12 is one octive
  int semitones = 12;
  // length of the input buffer
  int bufferLengthFrames = FFT_SIZE * 8;
  // shift value for algorithm
  float pitchShift = 0.5; //powf(2., semitones / 12.);

float input_buffer[FFT_SIZE * 8];
float output_buffer[FFT_SIZE * 8];

void setup() {
  Serial.begin(115200);
  delay(100);
  AudioMemory(100);
  delay(200);
// Enable the audio shield. select input. and enable output
  sgtl5000.enable();
  sgtl5000.inputSelect(myInput);
  sgtl5000.volume(0.7);
  sgtl5000.micGain (mic_gain);
  sgtl5000.adcHighPassFilterDisable(); // does not help too much!

  IN.begin();

  /*
   * smbPitchShift params:
   * 1: "pitchShift"         -> semitones to shift up
   * 2: "bufferLengthFrames" -> number of samples in input buffer must be larger than FFT_SIZE
   * 3: "FFT_SIZE"           -> size of the FFT, needs to be a power of 2
   * 4: "OVER_SAMPLE"        -> fifo buffer overlap factor, more the better but slower, has to be divisable by FFT_SIZE
   * 5: "sampleRate"         -> sample rate for sin generation
   * 6: "input"              -> input buffer
   * 7: "output"             -> output buffer
   */
}

int16_t *in_pointer;
int frames = 16;

void loop() {
  // put your main code here, to run repeatedly:
    if (IN.available() > frames + 1)
    {
        //get input from I2S
        for (int i = 0; i < frames; i++)
        {  
            in_pointer = IN.readBuffer();
            arm_q15_to_float (in_pointer, &input_buffer[128 * i], 128); // convert int_buffer to float 32bit
            IN.freeBuffer();
        }
        smbPitchShift(pitchShift, bufferLengthFrames, FFT_SIZE, OVER_SAMPLE, sampleRate, input_buffer, output_buffer);

/*         // passthru
        for (int i = 0; i < frames * 128; i++)
        {  
            output_buffer[i] = input_buffer[i];
        }
  */    
        for (int i = 0; i < frames; i++)
        {
          in_pointer = OUT.getBuffer();
          arm_float_to_q15 (&output_buffer[128 * i], in_pointer, 128); 
          OUT.playBuffer(); // play it !
        }  
  
  } 
  
}
Code:
/****************************************************************************
*
* NAME: smbPitchShift.cpp
* VERSION: 1.2
* HOME URL: http://www.dspdimension.com
* KNOWN BUGS: none
*
* SYNOPSIS: Routine for doing pitch shifting while maintaining
* duration using the Short Time Fourier Transform.
*
* DESCRIPTION: The routine takes a pitchShift factor value which is between 0.5
* (one octave down) and 2. (one octave up). A value of exactly 1 does not change
* the pitch. numSampsToProcess tells the routine how many samples in indata[0...
* numSampsToProcess-1] should be pitch shifted and moved to outdata[0 ...
* numSampsToProcess-1]. The two buffers can be identical (ie. it can process the
* data in-place). fftFrameSize defines the FFT frame size used for the
* processing. Typical values are 1024, 2048 and 4096. It may be any value <=
* MAX_FRAME_LENGTH but it MUST be a power of 2. osamp is the STFT
* oversampling factor which also determines the overlap between adjacent STFT
* frames. It should at least be 4 for moderate scaling ratios. A value of 32 is
* recommended for best quality. sampleRate takes the sample rate for the signal 
* in unit Hz, ie. 44100 for 44.1 kHz audio. The data passed to the routine in 
* indata[] should be in the range [-1.0, 1.0), which is also the output range 
* for the data, make sure you scale the data accordingly (for 16bit signed integers
* you would have to divide (and multiply) by 32768). 
*
* COPYRIGHT 1999-2009 Stephan M. Bernsee <smb [AT] dspdimension [DOT] com>
*
*             The Wide Open License (WOL)
*
* Permission to use, copy, modify, distribute and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice and this license appear in all source copies. 
* THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF
* ANY KIND. See http://www.dspguru.com/wol.htm for more information.
*
*****************************************************************************/ 

#define MAX_FRAME_LENGTH FFT_SIZE

void smbFft(float *fftBuffer, long fftFrameSize, long sign);
double smbAtan2(double x, double y);

static float gLastPhasetmp = 0;
static int stepShift = 0;
// -----------------------------------------------------------------------------------------------------------------
void smbPitchShift(float pitchShift, long numSampsToProcess, long fftFrameSize, long osamp, float sampleRate, float *indata, float *outdata) {

  static float gInFIFO[MAX_FRAME_LENGTH];
  //static float gOutFIFO[MAX_FRAME_LENGTH];
  static float gFFTworksp[2 * MAX_FRAME_LENGTH];
  static float gLastPhase[MAX_FRAME_LENGTH / 2 + 1];
  static float gSumPhase[MAX_FRAME_LENGTH / 2 + 1];
  static float gOutputAccum[2 * MAX_FRAME_LENGTH];
  static float gAnaFreq[MAX_FRAME_LENGTH / 2 + 1];
  static float gAnaMagn[MAX_FRAME_LENGTH / 2 + 1];
  static float gSynFreq[MAX_FRAME_LENGTH / 2 + 1];
  static float gSynMagn[MAX_FRAME_LENGTH / 2 + 1];
  static long gRover = false, gInit = false;
  float magn, phase, tmp, window, real, imag;
  float freqPerBin, expct;
  long i, k, qpd, index, inFifoLatency, stepSize, fftFrameSize2;

  // set up some handy variables
  fftFrameSize2 = fftFrameSize / 2;
  stepSize = fftFrameSize / osamp;
  freqPerBin = sampleRate / (float)fftFrameSize;
  expct = 2.*M_PI * (float)stepSize / (float)fftFrameSize;
  inFifoLatency = fftFrameSize - stepSize; //Serial.println(inFifoLatency);
  if (gRover == false) gRover = inFifoLatency;

  // initialize our static arrays
  if (gInit == false) {
    memset(gInFIFO,      0, MAX_FRAME_LENGTH           * sizeof(float));
    memset(gFFTworksp,   0, 2 * MAX_FRAME_LENGTH       * sizeof(float));
    memset(gLastPhase,   0, (MAX_FRAME_LENGTH / 2 + 1) *sizeof(float));
    memset(gSumPhase,    0, (MAX_FRAME_LENGTH / 2 + 1) *sizeof(float));
    memset(gOutputAccum, 0, 2 * MAX_FRAME_LENGTH       * sizeof(float));
    memset(gAnaFreq,     0, (MAX_FRAME_LENGTH / 2 + 1) * sizeof(float));
    memset(gAnaMagn,     0, (MAX_FRAME_LENGTH / 2 + 1) * sizeof(float));
    gInit = true;
  }

  for (int b = 0; b < inFifoLatency; b++) {
    gInFIFO[b] = indata[b];
  }
  
  // main processing loop
  for ( i = inFifoLatency; i < numSampsToProcess; i++) {

    // As long as we have not yet collected enough data just read in
    gInFIFO[gRover] = indata[i];
    gRover++;

    // now we have enough data for processing
    if (gRover >= fftFrameSize) {
      gRover = inFifoLatency;
//Serial.println("here");
      // do windowing and re,im interleave
      for (k = 0; k < fftFrameSize; k++) {
        window = -.5 * cosf(2.*M_PI * (float)k / (float)fftFrameSize) + .5;
        gFFTworksp[2 * k] = gInFIFO[k] * window;
        gFFTworksp[2 * k + 1] = 0.;
      }

      // ***************** ANALYSIS *******************
      // do transform
      smbFft(gFFTworksp, fftFrameSize, -1);

      // this is the analysis step
      for (k = 0; k <= fftFrameSize2; k++) {

        // de-interlace FFT buffer
        real = gFFTworksp[2 * k];
        imag = gFFTworksp[2 * k + 1];

        // compute magnitude and phase
        magn = 2.*sqrtf(real * real + imag * imag);
        phase = atan2f(imag, real);

        // compute phase difference
        tmp = phase - gLastPhase[k];
        gLastPhase[k] = phase;

        // subtract expected phase difference
        tmp -= (float)k * expct;

        // map delta phase into +/- Pi interval
        qpd = tmp / M_PI;
        if (qpd >= 0) qpd += qpd & 1;
        else qpd -= qpd & 1;
        tmp -= M_PI * (float)qpd;

        // get deviation from bin frequency from the +/- Pi interval
        tmp = osamp * tmp / (2.*M_PI);

        // compute the k-th partials' true frequency
        tmp = (float)k * freqPerBin + tmp * freqPerBin;

        // store magnitude and true frequency in analysis arrays
        gAnaMagn[k] = magn;
        gAnaFreq[k] = tmp;
      }

      // ***************** PROCESSING *******************
      // this does the actual pitch shifting
      memset(gSynMagn, 0, fftFrameSize2 * sizeof(float));
      memset(gSynFreq, 0, fftFrameSize2 * sizeof(float));
      for (k = 0; k <= fftFrameSize2; k++) {
        index = k * pitchShift;

        if (index <= fftFrameSize2) {
          gSynMagn[index] += gAnaMagn[k];
          gSynFreq[index] = gAnaFreq[k] * pitchShift;
        }
      }

      // ***************** SYNTHESIS *******************
      // this is the synthesis step
      
      for (k = 0; k <= fftFrameSize2; k++) {

        // get magnitude and true frequency from synthesis arrays
        magn = gSynMagn[k];
        tmp = gSynFreq[k];

        // subtract bin mid frequency
        tmp -= (float)k * freqPerBin;

        // get bin deviation from freq deviation
        tmp /= freqPerBin;

        // take osamp into account
        tmp = 2.*M_PI * tmp / osamp;

        // add the overlap phase advance back in
        tmp += (float)k * expct;

        // accumulate delta phase to get bin phase
        gSumPhase[k] += tmp;
        phase = gSumPhase[k];

        // get real and imag part and re-interleave
        gFFTworksp[2 * k] = magn * cosf(phase);
        gFFTworksp[2 * k + 1] = magn * sinf(phase);
      }

      // zero negative frequencies
      for (k = fftFrameSize + 2; k < 2 * fftFrameSize; k++) gFFTworksp[k] = 0.;

      // do inverse transform
      smbFft(gFFTworksp, fftFrameSize, 1);

      // do windowing and add to output accumulator
      for (k = 0; k < fftFrameSize; k++) {
        window = -.5 * cosf(2.*M_PI * (float)k / (float)fftFrameSize) + .5;
        gOutputAccum[k] += 2.*window * gFFTworksp[2 * k] / (fftFrameSize2 * osamp);
      }

      // shift output data buffer
      for (k = 0; k < stepSize; k++) {
        outdata[k + (stepShift * stepSize)]   = gOutputAccum[k];
      }
      stepShift++;
      
      // shift accumulator
      memmove(gOutputAccum, gOutputAccum + stepSize, fftFrameSize * sizeof(float));

      // move input FIFO
      for (k = 0; k < inFifoLatency; k++) gInFIFO[k] = gInFIFO[k + stepSize];

    }
  }
//Serial.println("pitch shifting completed");
//Serial.println(gRover);Serial.println(gInit);
}

// -----------------------------------------------------------------------------------------------------------------


void smbFft(float *fftBuffer, long fftFrameSize, long sign)
{
  float wr, wi, arg, *p1, *p2, temp;
  float tr, ti, ur, ui, *p1r, *p1i, *p2r, *p2i;
  long i, bitm, j, le, le2, k;

  for (i = 2; i < 2 * fftFrameSize - 2; i += 2) {
    for (bitm = 2, j = 0; bitm < 2 * fftFrameSize; bitm <<= 1) {
      if (i & bitm) j++;
      j <<= 1;
    }
    if (i < j) {
      p1 = fftBuffer + i; p2 = fftBuffer + j;
      temp = *p1; *(p1++) = *p2;
      *(p2++) = temp; temp = *p1;
      *p1 = *p2; *p2 = temp;
    }
  }
  for (k = 0, le = 2; k < (long)(logf(fftFrameSize) / logf(2.) + .5); k++) {
    le <<= 1;
    le2 = le >> 1;
    ur = 1.0;
    ui = 0.0;
    arg = M_PI / (le2 >> 1);
    wr = cosf(arg);
    wi = sign * sinf(arg);
    for (j = 0; j < le2; j += 2) {
      p1r = fftBuffer + j; p1i = p1r + 1;
      p2r = p1r + le2; p2i = p2r + 1;
      for (i = j; i < 2 * fftFrameSize; i += le) {
        tr = *p2r * ur - *p2i * ui;
        ti = *p2r * ui + *p2i * ur;
        *p2r = *p1r - tr; *p2i = *p1i - ti;
        *p1r += tr; *p1i += ti;
        p1r += le; p1i += le;
        p2r += le; p2i += le;
      }
      tr = ur * wr - ui * wi;
      ui = ur * wi + ui * wr;
      ur = tr;
    }
  }
}

double smbAtan2(double x, double y)
{
  double signx;
  if (x > 0.) signx = 1.;
  else signx = -1.;

  if (x == 0.) return 0.;
  if (y == 0.) return signx * M_PI / 2.;

  return atan2(x, y);
}

I changed the double variables to float to speed up the whole thing.

Any idea why this is not working?

All the best,

Frank
 
Last edited:
@DD4WH, from what I can see I'm not sure why it's hanging but I do not have way to test your code currently since I'm traveling but hope you figure it out. I'll try it out once I get back.
 
Hey Duff, Just came across your pitch shift project. I haven't had a chance to sit down and take it apart yet. Looks super interesting!

You mentioned before that the thing you got working was not real time, is that the case for your github post pitchshifty?
 
Last edited:
Hey Duff, Just came across your pitch shift project. I haven't had a chance to sit down and take it apart yet. Looks super interesting!

You mentioned before that the thing you got working was not real time, is that the case for your github post pitchshifty?
What's on Github is far as I got on this. I haven't even tried to make Audio Object for this yet but if you want try please do i'll help where i can. The algorithms current form is to slow right now for realtime applications unfortunately.
 
Status
Not open for further replies.
Back
Top