Hi all,
I'm sharing this library modification with everyone, in the hope that some may find it useful. While developing a little project of mine, I noted that the freeverb effect was limited to the single-channel option when running on a Teensy 3.2. I needed two independent channels and I only had a Teensy 3.2 available, which was also running TFT code and a small GUI.
Running the Teensy at 120 MHz, I noted the following:
The standard freeverb library was consuming ~48% of CPU time (per channel). If I ran two instances simultaneously then, as suggested by the documentation, the total CPU consumption was over 96%, resulting in my TFT having the performance characteristics of a tectonic plate migration.
I resolved to take a closer look at the code base. It was quickly clear that there had been no attempt to optimise the code to make use of the SIMD capabilities of the M4. I began to make incremental additions using inline assembler, finally resulting in an almost complete replacement of the code. It looks messy with so many small snippets of asm, but in fact it allowed me to test each element to ensure that the computational results of each section were 100% identical to the original code (for good or bad).
The changes included using:
- Packing instructions to align input data and coefficients.
- Quad 16bit multiply with accumulation for state variable calculations.
- Replacement of the C based saturation and round to zero routines with intrinsic M4 SSAT instructions and skipping of round to zero when no truncation was used.
- Saturating additions to the combfilter summations.
Final result was very helpful for my project. I reduced the CPU usage for each instantiation of freeverb to ~32%, thereby allowing me to operate 2 instances on the Teensy 3.2 and still having well over 30% CPU remaining.
Hope someone else may find it helpful:
Here is the code. This is just a single mono instance; if stereo is needed, the same code must be duplicated and the two instances worked together.
All the best
Aidan
I'm sharing this library modification with everyone, in the hope that some may find it useful. While developing a little project of mine, I noted that the freeverb effect was limited to the single-channel option when running on a Teensy 3.2. I needed two independent channels and I only had a Teensy 3.2 available, which was also running TFT code and a small GUI.
Running the Teensy at 120 MHz, I noted the following:
The standard freeverb library was consuming ~48% of CPU time (per channel). If I ran two instances simultaneously then, as suggested by the documentation, the total CPU consumption was over 96%, resulting in my TFT having the performance characteristics of a tectonic plate migration.
I resolved to take a closer look at the code base. It was quickly clear that there had been no attempt to optimise the code to make use of the SIMD capabilities of the M4. I began to make incremental additions using inline assembler, finally resulting in an almost complete replacement of the code. It looks messy with so many small snippets of asm, but in fact it allowed me to test each element to ensure that the computational results of each section were 100% identical to the original code (for good or bad).
The changes included using:
- Packing instructions to align input data and coefficients.
- Quad 16bit multiply with accumulation for state variable calculations.
- Replacement of the C based saturation and round to zero routines with intrinsic M4 SSAT instructions and skipping of round to zero when no truncation was used.
- Saturating additions to the combfilter summations.
Final result was very helpful for my project. I reduced the CPU usage for each instantiation of freeverb to ~32%, thereby allowing me to operate 2 instances on the Teensy 3.2 and still having well over 30% CPU remaining.
Hope someone else may find it helpful:
Here is the code. This is just a single mono instance; if stereo is needed, the same code must be duplicated and the two instances worked together.
All the best
Aidan
Code:
#if defined(__ARM_ARCH_7EM__)
// ---------------------------------------------------------------------------
// Private DSP helpers for AudioEffectFreeverb::update() (ARMv7E-M builds only).
//
// "Scaling with round-to-zero" below means dividing by 2^SHIFT while rounding
// toward zero. A plain arithmetic shift rounds negative values toward minus
// infinity, so (2^SHIFT - 1) is added to negative values before shifting.
//
// Every asm statement that executes a flag-setting instruction (CMN, SUBS)
// lists "cc" as a clobber so the compiler never keeps a live condition across
// it. None of the statements touch memory, so no "memory" clobber is used.
// ---------------------------------------------------------------------------

// Returns (a * b) / 2^SHIFT, rounded toward zero and saturated to 16 bits.
// The 32-bit product wraps on overflow (MUL semantics).
template <int SHIFT>
static inline __attribute__((always_inline)) int32_t freeverb_mul_scale_sat16(int32_t a, int32_t b)
{
	int32_t result;
	asm volatile(
		"MUL %[RESULT], %[A], %[B]\n"
		// CMN Rn, #0 sets N when the product is negative.
		"CMN %[RESULT], #0\n"
		"IT MI\n"
		"ADDMI %[RESULT], %[RESULT], %[R_RND]\n"
		"SSAT %[RESULT], #16, %[RESULT], ASR %[I_SHIFT]\n"
		// Early-clobber: RESULT is written before R_RND is read.
		: [RESULT] "=&r"(result)
		: [A] "r"(a), [B] "r"(b), [R_RND] "r"((1 << SHIFT) - 1), [I_SHIFT] "i"(SHIFT)
		: "cc");
	return result;
}

// Returns a * b saturated to 16 bits (no scaling, so no rounding is needed).
static inline __attribute__((always_inline)) int32_t freeverb_mul_sat16(int32_t a, int32_t b)
{
	int32_t result;
	asm volatile(
		"MUL %[RESULT], %[A], %[B]\n"
		"SSAT %[RESULT], #16, %[RESULT]\n"
		: [RESULT] "=&r"(result)
		: [A] "r"(a), [B] "r"(b));
	return result;
}

// Packs two 16-bit values into one word: `bottom` in bits 15..0 and `top` in
// bits 31..16. Only the low halfword of each argument is used.
static inline __attribute__((always_inline)) uint32_t freeverb_pack16(int32_t bottom, int32_t top)
{
	uint32_t packed;
	asm volatile(
		"PKHBT %[PACKED], %[INBOT], %[INTOP], LSL #16\n"
		: [PACKED] "=&r"(packed)
		: [INBOT] "r"(bottom), [INTOP] "r"(top));
	return packed;
}

// Comb damping stage: with taps = {filter, delayed} and coeffs = {damp1, damp2}
// (bottom, top halfwords) returns
//     (filter * damp1 + delayed * damp2) / 2^15
// rounded toward zero and saturated to 16 bits.
// SMUAD is used because it has no accumulator input; an accumulating form
// would need a register that is explicitly zeroed first.
static inline __attribute__((always_inline)) int32_t freeverb_comb_damp(uint32_t taps, uint32_t coeffs)
{
	int32_t result;
	asm volatile(
		"SMUAD %[RESULT], %[PACKA], %[PACKB]\n"
		"CMN %[RESULT], #0\n"
		"IT MI\n"
		"ADDMI %[RESULT], %[RESULT], %[R_RND]\n"
		"SSAT %[RESULT], #16, %[RESULT], ASR %[I_SHIFT]\n"
		: [RESULT] "=&r"(result)
		: [PACKA] "r"(taps), [PACKB] "r"(coeffs), [R_RND] "r"(0x7FFF), [I_SHIFT] "i"(15)
		: "cc");
	return result;
}

// Comb feedback stage: returns sat16(input + sat16((filter * feedback) / 2^15)),
// with the division rounded toward zero.
static inline __attribute__((always_inline)) int32_t freeverb_comb_feedback(int32_t filter, int32_t feedback, int32_t input)
{
	int32_t result;
	asm volatile(
		"MUL %[RESULT], %[FIL], %[FB]\n"
		"CMN %[RESULT], #0\n"
		"IT MI\n"
		"ADDMI %[RESULT], %[RESULT], %[R_RND]\n"
		"SSAT %[RESULT], #16, %[RESULT], ASR %[I_SHIFT]\n"
		"ADD %[RESULT], %[RESULT], %[IN]\n"
		"SSAT %[RESULT], #16, %[RESULT]\n"
		: [RESULT] "=&r"(result)
		: [FIL] "r"(filter), [FB] "r"(feedback), [IN] "r"(input), [R_RND] "r"(0x7FFF), [I_SHIFT] "i"(15)
		: "cc");
	return result;
}

// Returns (a - b) / 2, rounded toward zero and saturated to 16 bits.
static inline __attribute__((always_inline)) int32_t freeverb_sub_halve_sat16(int32_t a, int32_t b)
{
	int32_t result;
	asm volatile(
		// SUBS sets N directly, so no separate compare is required.
		"SUBS %[RESULT], %[BUF], %[OUT]\n"
		"IT MI\n"
		"ADDMI %[RESULT], %[RESULT], %[R_RND]\n"
		"SSAT %[RESULT], #16, %[RESULT], ASR %[I_SHIFT]\n"
		: [RESULT] "=&r"(result)
		: [BUF] "r"(a), [OUT] "r"(b), [R_RND] "r"(0x1), [I_SHIFT] "i"(1)
		: "cc");
	return result;
}

// Advances one comb filter by one sample.
//   slot      - delay-line element at the current index (read, then overwritten)
//   filter    - the comb's damping filter state (updated in place)
//   input     - scaled input sample
//   damp_pair - freeverb_pack16(combdamp1, combdamp2)
//   feedback  - comb feedback coefficient
template <typename Slot, typename State>
static inline __attribute__((always_inline)) void freeverb_comb_step(
	Slot &slot, State &filter, int32_t input, uint32_t damp_pair, int32_t feedback)
{
	// Both results are already saturated to the 16-bit range by SSAT.
	filter = freeverb_comb_damp(freeverb_pack16(filter, slot), damp_pair);
	slot = freeverb_comb_feedback(filter, feedback, input);
}

// Advances one allpass stage by one sample and returns its output.
//   slot - delay-line element at the current index (read, then overwritten)
//   in   - the stage input
template <typename Slot>
static inline __attribute__((always_inline)) int16_t freeverb_allpass_step(Slot &slot, int16_t in)
{
	const int16_t delayed = slot;
	// Stored without saturation (narrowing assignment), as the code always did.
	slot = in + (delayed >> 1);
	return freeverb_sub_halve_sat16(delayed, in);
}
#endif // __ARM_ARCH_7EM__

// Processes one audio block through a mono reverb made of eight comb filters
// (summed) followed by four allpass stages in series, then transmits the
// result. On KINETISL builds the input block is received and released without
// producing output.
void AudioEffectFreeverb::update()
{
#if defined(__ARM_ARCH_7EM__)
	const audio_block_t *block;
	audio_block_t *outblock;

	outblock = allocate();
	if (!outblock) {
		// No output block available: still receive and release the input
		// block so it is not left pending.
		audio_block_t *tmp = receiveReadOnly(0);
		if (tmp) release(tmp);
		return;
	}
	block = receiveReadOnly(0);
	if (!block) block = &zeroblock;

	// The coefficients are only read in this function, so they are fetched
	// (and the damping pair packed) once per block instead of per comb/sample.
	// NOTE(review): assumes nothing that can preempt update() rewrites
	// combdamp1/combdamp2/combfeeback mid-block - confirm against the setters.
	const uint32_t damp_pair = freeverb_pack16(combdamp1, combdamp2);
	const int32_t feedback = combfeeback;

	for (int i = 0; i < AUDIO_BLOCK_SAMPLES; i++) {
		// TODO: scale numerical range depending on roomsize & damping
		const int32_t input = freeverb_mul_scale_sat16<17>(block->data[i], 8738);

		// Sum the delayed output of ALL eight combs, read before the delay
		// lines are overwritten below. Eight 16-bit values cannot overflow
		// 32 bits, so ordinary additions are exact.
		const int32_t sum =
			static_cast<int32_t>(comb1buf[comb1index]) +
			static_cast<int32_t>(comb2buf[comb2index]) +
			static_cast<int32_t>(comb3buf[comb3index]) +
			static_cast<int32_t>(comb4buf[comb4index]) +
			static_cast<int32_t>(comb5buf[comb5index]) +
			static_cast<int32_t>(comb6buf[comb6index]) +
			static_cast<int32_t>(comb7buf[comb7index]) +
			static_cast<int32_t>(comb8buf[comb8index]);

		int16_t output = static_cast<int16_t>(freeverb_mul_scale_sat16<17>(sum, 31457));

		// Parallel comb filters.
		freeverb_comb_step(comb1buf[comb1index], comb1filter, input, damp_pair, feedback);
		freeverb_comb_step(comb2buf[comb2index], comb2filter, input, damp_pair, feedback);
		freeverb_comb_step(comb3buf[comb3index], comb3filter, input, damp_pair, feedback);
		freeverb_comb_step(comb4buf[comb4index], comb4filter, input, damp_pair, feedback);
		freeverb_comb_step(comb5buf[comb5index], comb5filter, input, damp_pair, feedback);
		freeverb_comb_step(comb6buf[comb6index], comb6filter, input, damp_pair, feedback);
		freeverb_comb_step(comb7buf[comb7index], comb7filter, input, damp_pair, feedback);
		freeverb_comb_step(comb8buf[comb8index], comb8filter, input, damp_pair, feedback);

		// Series allpass stages.
		output = freeverb_allpass_step(allpass1buf[allpass1index], output);
		output = freeverb_allpass_step(allpass2buf[allpass2index], output);
		output = freeverb_allpass_step(allpass3buf[allpass3index], output);
		output = freeverb_allpass_step(allpass4buf[allpass4index], output);

		// Output gain of 30, saturated to 16 bits.
		outblock->data[i] = freeverb_mul_sat16(output, 30);

		// Advance every circular delay-line index, wrapping at the buffer end.
		if (++allpass1index >= sizeof(allpass1buf)/sizeof(int16_t)) allpass1index = 0;
		if (++allpass2index >= sizeof(allpass2buf)/sizeof(int16_t)) allpass2index = 0;
		if (++allpass3index >= sizeof(allpass3buf)/sizeof(int16_t)) allpass3index = 0;
		if (++allpass4index >= sizeof(allpass4buf)/sizeof(int16_t)) allpass4index = 0;
		if (++comb1index >= sizeof(comb1buf)/sizeof(int16_t)) comb1index = 0;
		if (++comb2index >= sizeof(comb2buf)/sizeof(int16_t)) comb2index = 0;
		if (++comb3index >= sizeof(comb3buf)/sizeof(int16_t)) comb3index = 0;
		if (++comb4index >= sizeof(comb4buf)/sizeof(int16_t)) comb4index = 0;
		if (++comb5index >= sizeof(comb5buf)/sizeof(int16_t)) comb5index = 0;
		if (++comb6index >= sizeof(comb6buf)/sizeof(int16_t)) comb6index = 0;
		if (++comb7index >= sizeof(comb7buf)/sizeof(int16_t)) comb7index = 0;
		if (++comb8index >= sizeof(comb8buf)/sizeof(int16_t)) comb8index = 0;
	}
	transmit(outblock);
	release(outblock);
	// zeroblock is a stand-in for a missing input and must not be released.
	if (block != &zeroblock) release((audio_block_t *)block);
#elif defined(KINETISL)
	audio_block_t *block;
	block = receiveReadOnly(0);
	if (block) release(block);
#endif
}
Last edited: