I've been working on implementing a formant shifter based on the OpenAudio Formant shifter, but I've encountered some issues that I could use some help with. Here's a breakdown of my implementation and the challenges I'm facing:

#ifndef _AudioEffectFormantShiftFD_OA_F32_h

#define _AudioEffectFormantShiftFD_OA_F32_h

#include "AudioStream_F32.h"

#include <arm_math.h>

#include "FFT_Overlapped_OA_F32.h"

#include <cmath>

#include <memory>

#include <vector>

// Set frequency range constants (values in Hz)

const float minFreq = 100.0f;

const float maxFreq = 20000.0f;

// FIR filter setup: these macros size the coefficient, state and per-block buffers of the class below

#define BLOCK_SIZE 128 // Samples processed per audio block. Should really be taken from the audio settings' audio_block_samples to ensure it is always correct

#define NUM_TAPS 256 // Number of FIR coefficients (filter length)

#define CUTOFF_FREQ 100.0f // Hz

class AudioEffectFormantShiftFD_OA_F32 : public AudioStream_F32 {

public:

AudioEffectFormantShiftFD_OA_F32() : AudioStream_F32(1, inputQueueArray_f32) {

}

AudioEffectFormantShiftFD_OA_F32(const AudioSettings_F32 &settings) :

AudioStream_F32(1, inputQueueArray_f32), sample_rate_Hz(settings.sample_rate_Hz) { }

AudioEffectFormantShiftFD_OA_F32(const AudioSettings_F32 &settings, int _N_FFT) :

AudioStream_F32(1, inputQueueArray_f32) {

setup(settings, _N_FFT);

}

void setOversamplingFactor(int factor) { // Set the oversampling factor (currently unused)

oversampling_factor = factor; // Currently unused

}

int setup(const AudioSettings_F32 &settings, int _N_FFT) {

std::vector<float> firCoeffs;

calculateFIRCoefficients(firCoeffs, NUM_TAPS, CUTOFF_FREQ);

window_coeffs = std::make_unique<float[]>(BLOCK_SIZE);

computeHannWindow(window_coeffs.get(), BLOCK_SIZE);

sample_rate_Hz = settings.sample_rate_Hz;

int N_FFT = myFFT.setup(settings, _N_FFT);

if (N_FFT < 1) return N_FFT;

N_FFT = myIFFT.setup(settings, _N_FFT);

if (N_FFT < 1) return N_FFT;

//Set windowing function

bool useMyFFTWindow = true;

if(useMyFFTWindow){

myFFT.getFFTObject()->useHanningWindow();

if (myIFFT.getNBuffBlocks() > 3) {

myIFFT.getIFFTObject()->useHanningWindow();

}

}

// Print FFT parameters for debugging

printFFTParameters(settings, N_FFT);

// Allocate memory for frequency domain data

complex_2N_buffer = std::make_unique<float32_t[]>(2 * N_FFT);

enabled = 1;

return N_FFT;

}

float setScaleFactor(float scale_fac) {

shift_scale_fac = (scale_fac < 0.00001f) ? 0.00001f : scale_fac;

return shift_scale_fac;

}

float getScaleFactor() const {

return shift_scale_fac;

}

virtual void update();

private:

int enabled = 0;

int oversampling_factor = 1;

float sample_rate_Hz = 44117.0f;

float shift_scale_fac = 1.0f;

std::unique_ptr<float32_t[]> complex_2N_buffer;

audio_block_f32_t *inputQueueArray_f32[1];

FFT_Overlapped_OA_F32 myFFT;

IFFT_Overlapped_OA_F32 myIFFT;

float envelope_coeffs[NUM_TAPS] = { };

float32_t firStateF32[NUM_TAPS + BLOCK_SIZE - 1];

float32_t envelope_buffer[BLOCK_SIZE];

float32_t overlap_buffer[BLOCK_SIZE]; // Declaration of overlap buffer

void printFFTParameters(const AudioSettings_F32 &settings, int N_FFT) {

Serial.println("AudioEffectFormantShiftFD_OA_F32: FFT parameters...");

Serial.print(" : N_FFT = "); Serial.println(N_FFT);

Serial.print(" : audio_block_samples = "); Serial.println(settings.audio_block_samples);

Serial.print(" : FFT N_BUFF_BLOCKS = "); Serial.println(myFFT.getNBuffBlocks());

Serial.print(" : IFFT N_BUFF_BLOCKS = "); Serial.println(myIFFT.getNBuffBlocks());

Serial.print(" : FFT use window = "); Serial.println(myFFT.getFFTObject()->get_flagUseWindow());

Serial.print(" : IFFT use window = "); Serial.println(myIFFT.getIFFTObject()->get_flagUseWindow());

}

std::unique_ptr<float[]> window_coeffs;

void computeHannWindow(float *window_coeffs, int length) { // Compute a Hann window for the audio signal to be applied before the audio is proccesed

for (int i = 0; i < length; i++) {

window_coeffs= 0.5f * (1.0f - cosf(2.0f * M_PI * static_cast<float>(i) / static_cast<float>(length)));

Serial.println(window_coeffs);

}

}

void applyWindow(audio_block_f32_t *audio_block) { // Apply the window function to the envelope

for (int i = 0; i < BLOCK_SIZE; i++) {

audio_block->data*= window_coeffs;

}

}

void normalizeInputAudio(audio_block_f32_t *audio_block) {

const float noise_gate_threshold = 0.01f; // Adjust this value based on your needs

float max_val = -1.0f;

for (int i = 0; i < BLOCK_SIZE; i++) {

max_val = fmax(max_val, fabs(audio_block->data));

}

if (max_val > noise_gate_threshold) { // Normalize the audio signal

float normalization_factor = 1.0f / max_val;

for (int i = 0; i < BLOCK_SIZE; i++) {

audio_block->data*= normalization_factor;

}

Serial.print("Normalization factor: ");

Serial.println(normalization_factor);

}

}

void computeEnvelope(const audio_block_f32_t *audio_block) { // Compute the envelope of the audio signal using FIR filter

arm_fir_instance_f32 firInstance;

arm_fir_init_f32(&firInstance, NUM_TAPS, envelope_coeffs, firStateF32, BLOCK_SIZE);

arm_fir_f32(&firInstance, audio_block->data, envelope_buffer, BLOCK_SIZE);

}

void performFFT(audio_block_f32_t *audio_block) { // Perform an FFT on the audio signal and store in complex buffer

myFFT.execute(audio_block, complex_2N_buffer.get());

}

void performIFFT() { // Perform an IFFT on the complex buffer and rebuild the audio signal

audio_block_f32_t *out_audio_block = myIFFT.execute(complex_2N_buffer.get());

AudioStream_F32::transmit(out_audio_block);

}

void calculateIdealImpulseResponse(std::vector<float>& impulseResponse, int numTaps, float cutoffFreq) { // Calculate the ideal impluse response for the FIR Filter

int midPoint = numTaps / 2;

for (int i = 0; i < numTaps; i++) {

if (i == midPoint) {

impulseResponse= 2 * cutoffFreq;

} else {

impulseResponse= sin(2 * M_PI * cutoffFreq * (i - midPoint)) / (M_PI * (i - midPoint));

}

}

}

void applyHammingWindow(std::vector<float>& impulseResponse, int numTaps) { // Apply a Hamming window to the impulse response

for (int i = 0; i < numTaps; i++) {

impulseResponse*= 0.54 - 0.46 * cos(2 * M_PI * i / (numTaps - 1)); // Hamming Window

envelope_coeffs= impulseResponse;

}

}

// Function to calculate FIR filter coefficients dynamically

void calculateFIRCoefficients(std::vector<float>& coeffs, int numTaps, float cutoffFreq) {

coeffs.resize(numTaps);

// Calculate the ideal impulse response

calculateIdealImpulseResponse(coeffs, numTaps, cutoffFreq);

// Apply the Hamming window

applyHammingWindow(coeffs, numTaps);

}

void shiftFormants() { // Shift the formants

int fftSize = myFFT.getNFFT();

int N_2 = fftSize / 2 + 1;

float orig_mag[N_2];

arm_cmplx_mag_f32(complex_2N_buffer.get(), orig_mag, N_2); // Get the magnitude of the complez buffer

for (int dest_ind = 0; dest_ind < N_2; dest_ind++) {

float source_ind_float = static_cast<float>(dest_ind) / shift_scale_fac;

int mirrored_source_ind = mirrorIndex(source_ind_float, N_2);

float new_mag = interpolateMagnitude(orig_mag, mirrored_source_ind, source_ind_float, N_2);

float scale = new_mag / orig_mag[dest_ind];

scaleComplexBuffer(dest_ind, scale);

}

myFFT.rebuildNegativeFrequencySpace(complex_2N_buffer.get());

}

int mirrorIndex(float source_ind_float, int N_2) const { // mirror the index

if (source_ind_float < 1.0f) {

return 1 - static_cast<int>(source_ind_float);

} else if (source_ind_float >= N_2 - 1) {

return N_2 - 2 - static_cast<int>(source_ind_float - (N_2 - 1));

} else {

return static_cast<int>(source_ind_float);

}

}

float interpolateMagnitude(const float *orig_mag, int mirrored_source_ind, float source_ind_float, int N_2) const { // cubic interpolation

float y0, y1, y2, y3;

if (mirrored_source_ind < 0 || mirrored_source_ind >= N_2 - 1) {

y0 = y1 = y2 = y3 = 0.0f;

} else {

y0 = orig_mag[mirrored_source_ind - 1];

y1 = orig_mag[mirrored_source_ind];

y2 = orig_mag[mirrored_source_ind + 1];

y3 = orig_mag[mirrored_source_ind + 2];

}

float interp_fac = source_ind_float - static_cast<float>(mirrored_source_ind);

interp_fac = fmax(0.0f, fmin(interp_fac, 1.0f)); // clamp to [0, 1]

float a0 = -0.5f * y0 + 1.5f * y1 - 1.5f * y2 + y3;

float a1 = y0 - 2.5f * y1 + 2.0f * y2 - 0.5f * y3;

float a2 = -0.5f * y0 + 0.5f * y2;

float a3 = y1;

return ((a0 * interp_fac + a1) * interp_fac + a2) * interp_fac + a3;

}

void scaleComplexBuffer(int dest_ind, float scale) { // Scale the complex buffer

float real_part = complex_2N_buffer[2 * dest_ind];

float imag_part = complex_2N_buffer[2 * dest_ind + 1];

complex_2N_buffer[2 * dest_ind] = real_part * scale;

complex_2N_buffer[2 * dest_ind + 1] = imag_part * scale;

}

};

// Per-block processing: receive one input block, run the FIR stage and the FFT on it, shift the
// formants in the frequency domain and transmit the block rebuilt by the IFFT.
//
// Defined `inline` because this definition lives in a header: without it every translation unit
// that includes the header emits its own copy and the link fails with a multiple-definition error.
inline void AudioEffectFormantShiftFD_OA_F32::update() {
    audio_block_f32_t *in_audio_block = AudioStream_F32::receiveReadOnly_f32();
    if (!in_audio_block) return;

    if (!enabled) { // if the effect is not enabled just pass the audio through directly
        AudioStream_F32::transmit(in_audio_block);
        AudioStream_F32::release(in_audio_block);
        return;
    }

    // The per-sample denormal scan that used to sit here only gated the disabled normalisation
    // call below, so it did no useful work inside the audio callback and has been removed.
    // NOTE(review): the block is received read-only, so in-place steps such as
    // normalizeInputAudio() or applyWindow() would presumably need a writable copy before being
    // re-enabled; confirm against the audio library's block-sharing rules.

    //normalizeInputAudio(in_audio_block); -- Not ideal for vocals

    //applyWindow(in_audio_block); // apply the window function <--- This causes a bitcrush effect when enabled.

    computeEnvelope(in_audio_block); // compute the envelope of the audio using the fir filter

    performFFT(in_audio_block); // perform the fft

    AudioStream_F32::release(in_audio_block); // release the input block

    shiftFormants(); // shift the formants

    performIFFT(); // rebuild the audio signal
}

#endif

*I've implemented the formant-shifting logic using cubic interpolation and scaling of the complex buffer, and I've used an FIR filter to shape the envelope of the audio signal.*

The issues I have:

Ring Modulated Sound: My implementation produces an almost ring modulated sound, especially noticeable on sustained notes.

Bit Crushed Sound with Hann Window: Enabling the Hann window turns the sound into a bit-crushed effect.

I've tried increasing and decreasing the block size, the number of taps, etc., but that tends to make it worse. I'm sure it's something glaringly obvious, but any help would be greatly appreciated!

Thanks!

I've included a sample of the audio: Dry - Formant Shift without Hann Window - Formant shift with Hann Window enabled

The issues I have:

Ring Modulated Sound: My implementation produces an almost ring modulated sound, especially noticeable on sustained notes.

Bit Crushed Sound with Hann Window: Enabling the Hann window turns the sound into a bit-crushed effect.

I've tried increasing and decreasing the block size, the number of taps, etc., but that tends to make it worse. I'm sure it's something glaringly obvious, but any help would be greatly appreciated!

Thanks!

I've included a sample of the audio: Dry - Formant Shift without Hann Window - Formant shift with Hann Window enabled