Note: It looks like the state has to be an exact-width type (so uint*N*_t and not uint_fast*N*_t), as any extra bits will mess up the modulo arithmetic.

The 16-bit state is intended for input samples within an *N*-bit range and *Q*-bit shift, where *N*+*Q* ≤ 16. On 8-bit microcontrollers, this yields much, much faster code than using 32-bit state.

On 32-bit architectures, the version with 32-bit state compiles to compact code:

Code:

#include <stdint.h>
/* Q: number of bits the accumulator is shifted right to produce the output. */
static const uint_fast8_t filter32_shift = 4;

/*
 * Filter accumulator.  It must be exactly 32 bits wide: the arithmetic below
 * relies on wrap-around modulo 2**32, so any extra bits would corrupt it.
 */
uint32_t filter32_state;

/*
 * Initialise the filter so that it starts out settled at `value`.
 *
 * All arithmetic is done in uint32_t.  A plain `1 << 31` is an `int` shift,
 * which overflows a 32-bit int and cannot be represented at all where int is
 * 16 bits; likewise a negative `value` must not be left-shifted as a signed
 * quantity.  Unsigned arithmetic gives the intended modulo-2**32 result.
 */
void filter32_init(const int32_t value)
{
    const uint32_t v = (uint32_t)value;

    filter32_state = ((uint32_t)1 << 31) - v + (v << filter32_shift);
}

/*
 * Feed one sample into the filter and return the new filtered value.
 *
 * The accumulator carries a bias of 1 << 31, which becomes
 * 1 << (31 - filter32_shift) after the shift and is subtracted again to
 * recover the signed result.
 */
int32_t filter32(const int32_t sample)
{
    const uint32_t half = (uint32_t)1 << (filter32_shift - 1);    /* rounding term */
    const uint32_t bias = (uint32_t)1 << (31 - filter32_shift);   /* shifted offset */
    uint32_t result;

    filter32_state += (uint32_t)sample;
    result = ((filter32_state + half) >> filter32_shift) - bias;
    filter32_state -= result;

    /* Reinterpret the wrapped 32-bit value as signed. */
    return (int32_t)result;
}

for example arm-gcc-7.3 -O2 produces

Code:

@ filter32_init (ARM, compiler output): r0 = value on entry.
@ Stores (value << 4) - value + 0x80000000 into filter32_state.
filter32_init:
rsb r0, r0, r0, lsl #4        @ r0 = (r0 << 4) - r0
ldr r3, .L3                   @ r3 = address of filter32_state (loaded from .L3)
add r0, r0, #-2147483648      @ add the 0x80000000 offset
str r0, [r3]                  @ filter32_state = r0
bx lr
.L3:
.word filter32_state          @ literal holding the address of filter32_state
@ filter32 (ARM, compiler output): r0 = sample on entry, r0 = result on return.
filter32:
ldr r2, .L6                   @ r2 = address of filter32_state (loaded from .L6)
ldr r3, [r2]                  @ r3 = filter32_state
add r3, r0, r3                @ r3 = state + sample
add r0, r3, #8                @ add the rounding term (half of 1 << 4)
lsr r0, r0, #4                @ logical shift right by the filter shift
add r0, r0, #-134217728       @ remove the shifted offset (0x08000000)
sub r3, r3, r0                @ state -= result
str r3, [r2]                  @ write the updated state back
bx lr
.L6:
.word filter32_state          @ literal holding the address of filter32_state

and marking the functions static inline should mean the per-sample filtering overhead is minimal, just a handful of cycles.

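On the other hand, on avr-gcc-5.4.0, we get the listing below. (Note that it was generated with the offset terms written as plain `int` shifts, `1 << 31` and `1 << 27`, which overflow AVR's 16-bit `int`; that is why no 0x80000000 / 0x08000000 constants appear in it.)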

Code:

; filter32_init(long) (AVR, compiler output).
; Reads a 32-bit value from r25:r22, computes (value << 4) - value and stores
; the four result bytes to filter32_state.  No offset constant is added here.
filter32_init(long):
push r12                      ; save r12..r17; they are restored before ret
push r13
push r14
push r15
push r16
push r17
mov r16,r22                   ; r19:r16 = copy of the 32-bit input in r25:r22
mov r17,r23
mov r18,r24
mov r19,r25
lsl r16                       ; 32-bit left shift by one bit (1 of 4)
rol r17
rol r18
rol r19
lsl r16                       ; (2 of 4)
rol r17
rol r18
rol r19
lsl r16                       ; (3 of 4)
rol r17
rol r18
rol r19
lsl r16                       ; (4 of 4): r19:r16 = input << 4
rol r17
rol r18
rol r19
mov r12,r16                   ; r15:r12 = input << 4
mov r13,r17
mov r14,r18
mov r15,r19
sub r12,r22                   ; r15:r12 -= input (32-bit subtract with borrow)
sbc r13,r23
sbc r14,r24
sbc r15,r25
sts filter32_state,r12        ; store the result, low byte first
sts filter32_state+1,r13
sts filter32_state+2,r14
sts filter32_state+3,r15
pop r17                       ; restore saved registers in reverse order
pop r16
pop r15
pop r14
pop r13
pop r12
ret
; filter32(long) (AVR, compiler output).
; Reads a 32-bit sample from r25:r22, adds it to filter32_state, computes
; (state + 8) >> 4, subtracts that from the state and stores the state back.
; r25:r22 holds the shifted value at ret (presumably the return value under
; avr-gcc's calling convention).  No offset constant is subtracted here.
filter32(long):
push r16                      ; save r16/r17; restored before ret
push r17
lds r16,filter32_state        ; r19:r16 = filter32_state
lds r17,filter32_state+1
lds r18,filter32_state+2
lds r19,filter32_state+3
add r16,r22                   ; r19:r16 += sample (32-bit add with carry)
adc r17,r23
adc r18,r24
adc r19,r25
mov r23,r19                   ; r23:r20 = copy of the updated state
mov r22,r18
mov r21,r17
mov r20,r16
subi r20,-8                   ; subtracting -8 across four bytes adds 8 (rounding term)
sbci r21,-1
sbci r22,-1
sbci r23,-1
mov r25,r23                   ; move the rounded sum into r25:r22
mov r24,r22
mov r23,r21
mov r22,r20
ldi r20,4                     ; loop counter: shift by 4 bits
1:
lsr r25                       ; 32-bit logical right shift by one bit
ror r24
ror r23
ror r22
dec r20
brne 1b                       ; repeat until the counter reaches zero
sub r16,r22                   ; state -= shifted value
sbc r17,r23
sbc r18,r24
sbc r19,r25
sts filter32_state,r16        ; store the updated state, low byte first
sts filter32_state+1,r17
sts filter32_state+2,r18
sts filter32_state+3,r19
pop r17
pop r16
ret
; Storage for filter32_state: four zero-filled bytes.
filter32_state:
.zero 4

which is not really something I'd like to use.

The equivalent with 16-bit state is limited to a (16 − *filter16_shift*)-bit sample range, with minimum sample value −filter16_offset (you can set that to whatever 16-bit value you need before initializing the filter). It is

Code:

#include <stdint.h>
/* Q: number of bits the accumulator is shifted right to produce the output. */
static const uint_fast8_t filter16_shift = 4;

/* Bias added to samples so the biased value is non-negative; the smallest
 * representable sample is -filter16_offset. */
static const int16_t filter16_offset = 1 << (15 - filter16_shift);

/* Rounding term: half of one output step. */
static const uint16_t filter16_half = 1 << (filter16_shift - 1);

/*
 * Filter accumulator.  It must be exactly 16 bits wide: the arithmetic below
 * relies on wrap-around modulo 2**16.
 */
uint16_t filter16_state;

/*
 * Initialise the filter so that it starts out settled at `value`.
 *
 * The biased value is shifted as an unsigned 16-bit quantity.  Shifting it as
 * a signed int overflows a 16-bit int once value + filter16_offset reaches
 * 2048, and is undefined for a negative operand.
 */
void filter16_init(const int16_t value)
{
    const uint16_t v = (uint16_t)value;
    const uint16_t biased = (uint16_t)(v + (uint16_t)filter16_offset);

    filter16_state = (uint16_t)((uint16_t)(biased << filter16_shift) - v);
}

/*
 * Feed one sample into the filter and return the new filtered value.
 *
 * Every intermediate is truncated to uint16_t so that the result is the same
 * whether int is 16 bits or wider (integer promotion would otherwise keep
 * carry bits beyond bit 15 on wider targets).
 */
int16_t filter16(const int16_t sample)
{
    uint16_t rounded;
    uint16_t result;

    filter16_state = (uint16_t)(filter16_state + (uint16_t)sample);
    rounded = (uint16_t)(filter16_state + filter16_half);
    result = (uint16_t)((rounded >> filter16_shift) - (uint16_t)filter16_offset);
    filter16_state = (uint16_t)(filter16_state - result);

    /* Reinterpret the wrapped 16-bit value as signed. */
    return (int16_t)result;
}

which compiles on avr-gcc-5.4 -O2 to a much nicer

Code:

; filter16_init(int) (AVR, compiler output).
; Reads a 16-bit value from r25:r24, computes ((value + 0x0800) << 4) - value
; and stores it to filter16_state.
filter16_init(int):
mov r18,r24                   ; r19:r18 = copy of the input in r25:r24
mov r19,r25
subi r19,-8                   ; high byte += 8, i.e. add 0x0800 (the offset)
swap r18                      ; swap/andi/eor sequence: 16-bit left shift by 4
swap r19
andi r19,0xf0
eor r19,r18
andi r18,0xf0
eor r19,r18                   ; r19:r18 = (input + 0x0800) << 4
mov r20,r18
mov r21,r19
sub r20,r24                   ; r21:r20 = shifted value - input
sbc r21,r25
sts filter16_state+1,r21      ; store the result
sts filter16_state,r20
ret
; filter16(int) (AVR, compiler output).
; Reads a 16-bit sample from r25:r24, adds it to filter16_state, computes
; ((state + 8) >> 4) - 0x0800 in r25:r24, subtracts that from the state and
; stores the state back.  r25:r24 holds the computed value at ret.
filter16(int):
lds r18,filter16_state        ; r19:r18 = filter16_state
lds r19,filter16_state+1
add r18,r24                   ; r19:r18 += sample
adc r19,r25
mov r24,r18                   ; r25:r24 = copy of the updated state
mov r25,r19
adiw r24,8                    ; add the rounding term
swap r25                      ; swap/andi/eor sequence: 16-bit logical right shift by 4
swap r24
andi r24,0x0f
eor r24,r25
andi r25,0x0f
eor r24,r25
subi r25,8                    ; high byte -= 8, i.e. subtract 0x0800 (the offset)
sub r18,r24                   ; state -= result
sbc r19,r25
sts filter16_state+1,r19      ; store the updated state
sts filter16_state,r18
ret
; Storage for filter16_state: two zero-filled bytes.
filter16_state:
.zero 2

which is much, much preferable.

I did verify the above produce the correct results, but since this is just idle exploration, brainfarts/typos may have slipped in.