Hi everyone,
I've been working on implementing a formant shifter based on the OpenAudio Formant shifter, but I've encountered some issues that I could use some help with. Here's a breakdown of my implementation and the challenges I'm facing:
I've implemented the formant-shifting logic using cubic interpolation and scaling of the complex buffer, and I've used an FIR filter to shape the envelope of the audio signal.
The issues I have:
Ring Modulated Sound: My implementation produces an almost ring modulated sound, especially noticeable on sustained notes.
Bit Crushed Sound with Hann Window: Enabling the Hann window turns the sound into a bit-crushed effect.
I've tried increasing and decreasing the block size, the number of taps, etc., but that tends to make it worse. I'm sure it's something glaringly obvious, but any help would be greatly appreciated!
Thanks!
I've included a sample of the audio: Dry - Formant Shift without Hann Window - Formant shift with Hann Window enabled
I've been working on implementing a formant shifter based on the OpenAudio Formant shifter, but I've encountered some issues that I could use some help with. Here's a breakdown of my implementation and the challenges I'm facing:
#ifndef _AudioEffectFormantShiftFD_OA_F32_h
#define _AudioEffectFormantShiftFD_OA_F32_h
#include "AudioStream_F32.h"
#include <arm_math.h>
#include "FFT_Overlapped_OA_F32.h"
#include <cmath>
#include <memory>
#include <vector>
// Frequency range constants, in Hz judging by the values.
// NOTE(review): neither constant is referenced anywhere in the visible part of this header.
const float minFreq = 100.0f;
const float maxFreq = 20000.0f;
// FIR filter setup
// BLOCK_SIZE is the number of samples processed per audio block. It sizes the window, envelope
// and FIR state buffers and bounds every per-sample loop in the class below.
// TODO: take this from AudioSettings_F32::audio_block_samples so it can never disagree with
// the block length the audio library is actually using.
#define BLOCK_SIZE 128
// NUM_TAPS is the length of the envelope FIR filter (sizes envelope_coeffs and the FIR state).
#define NUM_TAPS 256
// CUTOFF_FREQ is the cutoff frequency of the envelope low-pass filter, in Hz.
#define CUTOFF_FREQ 100.0f // Hz
class AudioEffectFormantShiftFD_OA_F32 : public AudioStream_F32 {
public:
AudioEffectFormantShiftFD_OA_F32() : AudioStream_F32(1, inputQueueArray_f32) {
}
AudioEffectFormantShiftFD_OA_F32(const AudioSettings_F32 &settings) :
AudioStream_F32(1, inputQueueArray_f32), sample_rate_Hz(settings.sample_rate_Hz) { }
AudioEffectFormantShiftFD_OA_F32(const AudioSettings_F32 &settings, int _N_FFT) :
AudioStream_F32(1, inputQueueArray_f32) {
setup(settings, _N_FFT);
}
void setOversamplingFactor(int factor) { // Set the oversampling factor (currently unused)
oversampling_factor = factor; // Currently unused
}
int setup(const AudioSettings_F32 &settings, int _N_FFT) {
std::vector<float> firCoeffs;
calculateFIRCoefficients(firCoeffs, NUM_TAPS, CUTOFF_FREQ);
window_coeffs = std::make_unique<float[]>(BLOCK_SIZE);
computeHannWindow(window_coeffs.get(), BLOCK_SIZE);
sample_rate_Hz = settings.sample_rate_Hz;
int N_FFT = myFFT.setup(settings, _N_FFT);
if (N_FFT < 1) return N_FFT;
N_FFT = myIFFT.setup(settings, _N_FFT);
if (N_FFT < 1) return N_FFT;
//Set windowing function
bool useMyFFTWindow = true;
if(useMyFFTWindow){
myFFT.getFFTObject()->useHanningWindow();
if (myIFFT.getNBuffBlocks() > 3) {
myIFFT.getIFFTObject()->useHanningWindow();
}
}
// Print FFT parameters for debugging
printFFTParameters(settings, N_FFT);
// Allocate memory for frequency domain data
complex_2N_buffer = std::make_unique<float32_t[]>(2 * N_FFT);
enabled = 1;
return N_FFT;
}
float setScaleFactor(float scale_fac) {
shift_scale_fac = (scale_fac < 0.00001f) ? 0.00001f : scale_fac;
return shift_scale_fac;
}
float getScaleFactor() const {
return shift_scale_fac;
}
virtual void update();
private:
int enabled = 0;
int oversampling_factor = 1;
float sample_rate_Hz = 44117.0f;
float shift_scale_fac = 1.0f;
std::unique_ptr<float32_t[]> complex_2N_buffer;
audio_block_f32_t *inputQueueArray_f32[1];
FFT_Overlapped_OA_F32 myFFT;
IFFT_Overlapped_OA_F32 myIFFT;
float envelope_coeffs[NUM_TAPS] = { };
float32_t firStateF32[NUM_TAPS + BLOCK_SIZE - 1];
float32_t envelope_buffer[BLOCK_SIZE];
float32_t overlap_buffer[BLOCK_SIZE]; // Declaration of overlap buffer
void printFFTParameters(const AudioSettings_F32 &settings, int N_FFT) {
Serial.println("AudioEffectFormantShiftFD_OA_F32: FFT parameters...");
Serial.print(" : N_FFT = "); Serial.println(N_FFT);
Serial.print(" : audio_block_samples = "); Serial.println(settings.audio_block_samples);
Serial.print(" : FFT N_BUFF_BLOCKS = "); Serial.println(myFFT.getNBuffBlocks());
Serial.print(" : IFFT N_BUFF_BLOCKS = "); Serial.println(myIFFT.getNBuffBlocks());
Serial.print(" : FFT use window = "); Serial.println(myFFT.getFFTObject()->get_flagUseWindow());
Serial.print(" : IFFT use window = "); Serial.println(myIFFT.getIFFTObject()->get_flagUseWindow());
}
std::unique_ptr<float[]> window_coeffs;
void computeHannWindow(float *window_coeffs, int length) { // Compute a Hann window for the audio signal to be applied before the audio is proccesed
for (int i = 0; i < length; i++) {
window_coeffs = 0.5f * (1.0f - cosf(2.0f * M_PI * static_cast<float>(i) / static_cast<float>(length)));
Serial.println(window_coeffs);
}
}
void applyWindow(audio_block_f32_t *audio_block) { // Apply the window function to the envelope
for (int i = 0; i < BLOCK_SIZE; i++) {
audio_block->data *= window_coeffs;
}
}
void normalizeInputAudio(audio_block_f32_t *audio_block) {
const float noise_gate_threshold = 0.01f; // Adjust this value based on your needs
float max_val = -1.0f;
for (int i = 0; i < BLOCK_SIZE; i++) {
max_val = fmax(max_val, fabs(audio_block->data));
}
if (max_val > noise_gate_threshold) { // Normalize the audio signal
float normalization_factor = 1.0f / max_val;
for (int i = 0; i < BLOCK_SIZE; i++) {
audio_block->data *= normalization_factor;
}
Serial.print("Normalization factor: ");
Serial.println(normalization_factor);
}
}
void computeEnvelope(const audio_block_f32_t *audio_block) { // Compute the envelope of the audio signal using FIR filter
arm_fir_instance_f32 firInstance;
arm_fir_init_f32(&firInstance, NUM_TAPS, envelope_coeffs, firStateF32, BLOCK_SIZE);
arm_fir_f32(&firInstance, audio_block->data, envelope_buffer, BLOCK_SIZE);
}
void performFFT(audio_block_f32_t *audio_block) { // Perform an FFT on the audio signal and store in complex buffer
myFFT.execute(audio_block, complex_2N_buffer.get());
}
void performIFFT() { // Perform an IFFT on the complex buffer and rebuild the audio signal
audio_block_f32_t *out_audio_block = myIFFT.execute(complex_2N_buffer.get());
AudioStream_F32::transmit(out_audio_block);
}
void calculateIdealImpulseResponse(std::vector<float>& impulseResponse, int numTaps, float cutoffFreq) { // Calculate the ideal impluse response for the FIR Filter
int midPoint = numTaps / 2;
for (int i = 0; i < numTaps; i++) {
if (i == midPoint) {
impulseResponse = 2 * cutoffFreq;
} else {
impulseResponse = sin(2 * M_PI * cutoffFreq * (i - midPoint)) / (M_PI * (i - midPoint));
}
}
}
void applyHammingWindow(std::vector<float>& impulseResponse, int numTaps) { // Apply a Hamming window to the impulse response
for (int i = 0; i < numTaps; i++) {
impulseResponse *= 0.54 - 0.46 * cos(2 * M_PI * i / (numTaps - 1)); // Hamming Window
envelope_coeffs = impulseResponse;
}
}
// Function to calculate FIR filter coefficients dynamically
void calculateFIRCoefficients(std::vector<float>& coeffs, int numTaps, float cutoffFreq) {
coeffs.resize(numTaps);
// Calculate the ideal impulse response
calculateIdealImpulseResponse(coeffs, numTaps, cutoffFreq);
// Apply the Hamming window
applyHammingWindow(coeffs, numTaps);
}
void shiftFormants() { // Shift the formants
int fftSize = myFFT.getNFFT();
int N_2 = fftSize / 2 + 1;
float orig_mag[N_2];
arm_cmplx_mag_f32(complex_2N_buffer.get(), orig_mag, N_2); // Get the magnitude of the complez buffer
for (int dest_ind = 0; dest_ind < N_2; dest_ind++) {
float source_ind_float = static_cast<float>(dest_ind) / shift_scale_fac;
int mirrored_source_ind = mirrorIndex(source_ind_float, N_2);
float new_mag = interpolateMagnitude(orig_mag, mirrored_source_ind, source_ind_float, N_2);
float scale = new_mag / orig_mag[dest_ind];
scaleComplexBuffer(dest_ind, scale);
}
myFFT.rebuildNegativeFrequencySpace(complex_2N_buffer.get());
}
int mirrorIndex(float source_ind_float, int N_2) const { // mirror the index
if (source_ind_float < 1.0f) {
return 1 - static_cast<int>(source_ind_float);
} else if (source_ind_float >= N_2 - 1) {
return N_2 - 2 - static_cast<int>(source_ind_float - (N_2 - 1));
} else {
return static_cast<int>(source_ind_float);
}
}
float interpolateMagnitude(const float *orig_mag, int mirrored_source_ind, float source_ind_float, int N_2) const { // cubic interpolation
float y0, y1, y2, y3;
if (mirrored_source_ind < 0 || mirrored_source_ind >= N_2 - 1) {
y0 = y1 = y2 = y3 = 0.0f;
} else {
y0 = orig_mag[mirrored_source_ind - 1];
y1 = orig_mag[mirrored_source_ind];
y2 = orig_mag[mirrored_source_ind + 1];
y3 = orig_mag[mirrored_source_ind + 2];
}
float interp_fac = source_ind_float - static_cast<float>(mirrored_source_ind);
interp_fac = fmax(0.0f, fmin(interp_fac, 1.0f)); // clamp to [0, 1]
float a0 = -0.5f * y0 + 1.5f * y1 - 1.5f * y2 + y3;
float a1 = y0 - 2.5f * y1 + 2.0f * y2 - 0.5f * y3;
float a2 = -0.5f * y0 + 0.5f * y2;
float a3 = y1;
return ((a0 * interp_fac + a1) * interp_fac + a2) * interp_fac + a3;
}
void scaleComplexBuffer(int dest_ind, float scale) { // Scale the complex buffer
float real_part = complex_2N_buffer[2 * dest_ind];
float imag_part = complex_2N_buffer[2 * dest_ind + 1];
complex_2N_buffer[2 * dest_ind] = real_part * scale;
complex_2N_buffer[2 * dest_ind + 1] = imag_part * scale;
}
};
// Per-block processing: receive one input block, run the envelope filter, take the
// overlapped FFT, shift the formants in the frequency domain and transmit the IFFT output.
// When the effect is not enabled the input block is passed through unchanged.
//
// Declared inline because this definition lives in a header: without it, including the
// header from more than one translation unit gives a multiple-definition link error.
inline void AudioEffectFormantShiftFD_OA_F32::update() {
  audio_block_f32_t *in_audio_block = AudioStream_F32::receiveReadOnly_f32();
  if (!in_audio_block) return;

  if (!enabled) { // if the effect is not enabled just pass the audio through directly
    AudioStream_F32::transmit(in_audio_block);
    AudioStream_F32::release(in_audio_block);
    return;
  }

  // Input normalisation is intentionally left out (not ideal for vocals). The block is
  // also received read-only, so it must not be modified in place here:
  //   normalizeInputAudio(in_audio_block);
  //
  // Per-block Hann windowing is left out too; enabling it causes a bitcrush-like effect:
  //   applyWindow(in_audio_block);

  computeEnvelope(in_audio_block);          // low-pass FIR into envelope_buffer (result not used further here)
  performFFT(in_audio_block);               // perform the fft
  AudioStream_F32::release(in_audio_block); // release the input block
  shiftFormants();                          // shift the formants
  performIFFT();                            // rebuild the audio signal and transmit it
}
#endif
I've implemented the formant-shifting logic using cubic interpolation and scaling of the complex buffer, and I've used an FIR filter to shape the envelope of the audio signal.
The issues I have:
Ring Modulated Sound: My implementation produces an almost ring modulated sound, especially noticeable on sustained notes.
Bit Crushed Sound with Hann Window: Enabling the Hann window turns the sound into a bit-crushed effect.
I've tried increasing and decreasing the block size, the number of taps, etc., but that tends to make it worse. I'm sure it's something glaringly obvious, but any help would be greatly appreciated!
Thanks!
I've included a sample of the audio: Dry - Formant Shift without Hann Window - Formant shift with Hann Window enabled