Biquad results using multiply_accumulate_32x32_rshift32_rounded

Status
Not open for further replies.
D

DeletedUser

Guest
I was interesting in finding out what the differences are between using the standard Teensy biquad library and using a direct form 1 32x32 MAC for a 6th order LPF at fc=1000Hz.
Enclosed, find the results.

In Conclusion, I found that:

1. There is an improvement to the frequency response (except for a -90 dB DC component).
2. There are 86.30 cycles/sample or 5.75 cycles/MAC

My question is:

1. How can I remove the DC component (which I believe is due to the fact that we loose 1 bit of information due to the rshift32 instead of shifting by 31)?
2. I would like to know what the number of cycles per sample is for the standard Teensy biquad library. (I don't have access to probe the ARM_DWT_CYCCNT at its input in my sketch).


Code:
#include <Audio.h>
#include <Wire.h>
#include <SPI.h>
#include <SD.h>
#include <SerialFlash.h>
#include "AudioSampleSig1and3kHz.h"
#include "AudioSampleSig1and3kHz.h"
//#define DEBUG
AudioPlayMemory          sound0;
AudioOutputUSB           usb_out;
AudioFilterBiquad        biquad1;        
AudioOutputI2S           i2s2;     
AudioRecordQueue         queue1;
AudioPlayQueue           queue2;
AudioConnection          patchCord1(sound0, 0, biquad1, 0);
AudioConnection          patchCord2(sound0, 0, queue1, 0);
AudioConnection          patchCord3(biquad1, 0, i2s2, 0);
AudioConnection          patchCord4(queue2, 0, i2s2, 1);
AudioConnection          patchCord5(biquad1, 0, usb_out, 0);
AudioConnection          patchCord6(queue2, 0, usb_out, 1);
AudioControlSGTL5000     sgtl5000_1;     //xy=523.2000122070312,327


//const int myInput = AUDIO_INPUT_LINEIN;
//const int myInput = AUDIO_INPUT_MIC;

//------------------------IIR parameters ---------------
//6th order iir lpfilter at 1100Hz (option 2 of calcfilt.m)

//-------- for standard biquad library ----------------
double coefs0[]=
{ 
#include "iircoefs0.h"
};
double coefs1[]=
{ 
#include "iircoefs1.h"
};
double coefs2[]=
{ 
#include "iircoefs2.h"
};

//--------- for myTeensyBiq ------------------
int32_t numi[]=
{
#include "numi.h"
};
int32_t deni[]=
{
#include "deni.h"
};
#define NUMCELLS 3
int32_t Dni[2],Ddi[2*NUMCELLS];
#ifdef DEBUG
unsigned long last_time = millis();
int cyc=0,nMACs;
#endif
//------------------------------------------------------

void setup() {
#ifdef DEBUG
  Serial.begin(115200);
  delay(500);
#endif
  AudioMemory(12);
  sgtl5000_1.enable();  // Enable the audio shield
//  sgtl5000_1.inputSelect(myInput);
  sgtl5000_1.volume(0.5);

  biquad1.setCoefficients(0,coefs0);
  biquad1.setCoefficients(1,coefs1);
  biquad1.setCoefficients(2,coefs2);
  // Start the record queue
  queue1.begin();
}


void loop() {
  if (sound0.isPlaying() == false)
  {
    sound0.play(AudioSampleSig1and3khz);
  }
  if (queue1.available() >= 1) //Left Channel
  {
#ifdef DEBUG
    cyc = ARM_DWT_CYCCNT;
    myTeensyBiq(queue1.readBuffer(),queue2.getBuffer(),numi,deni,Dni,Ddi,NUMCELLS);
    cyc = ARM_DWT_CYCCNT - cyc;
#else
    myTeensyBiq(queue1.readBuffer(),queue2.getBuffer(),numi,deni,Dni,Ddi,NUMCELLS);
#endif
    // Free the input audio buffer
    queue1.freeBuffer();
    // and play it back into the audio queue
    queue2.playBuffer();
  } //if (queue1.available() >= 1)
#ifdef DEBUG
  if (millis() - last_time >= 2500) 
  {
    nMACs=AUDIO_BLOCK_SAMPLES*NUMCELLS*5;
    Serial.print(" myTeensyBiq cycles/sample=");
    Serial.print((float)cyc/(float)AUDIO_BLOCK_SAMPLES);
    Serial.print(" cycles/MAC=");
    Serial.println((float)cyc/(float)nMACs);
    last_time=millis();
  }
#endif
}

void myTeensyBiq(int16_t *inbuf,int16_t *outbuf,int32_t *numi,int32_t *deni,int32_t *Dni,int32_t *Ddi,int numcells)
{
  int coefnum,datanum,i,j;
  int32_t input,res,accu;
  int16_t *bp1,*bp2;
  bp1=inbuf;
  bp2=outbuf; 
  // Copy from input to output buffer
  for( j = 0;j < AUDIO_BLOCK_SAMPLES;j++) 
  {
    input=((int32_t)*bp1++)<<16;
    coefnum=0;
    datanum=0;
    accu=multiply_32x32_rshift32_rounded(input,numi[coefnum]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum],numi[coefnum+1]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum],numi[coefnum+1]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum+1],numi[coefnum+2]);
    Dni[datanum+1]=Dni[datanum];
    Dni[datanum]=input;
    for (i=1;i<numcells;i++)
    { 
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],-deni[coefnum+2]);
      res=accu<<1;
      coefnum+=3;
      accu=multiply_32x32_rshift32_rounded(res,numi[coefnum]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],numi[coefnum+1]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],numi[coefnum+1]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],numi[coefnum+2]);
      Ddi[datanum+1]=Ddi[datanum];
      Ddi[datanum]=res;
      datanum+=2;
    }
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],-deni[coefnum+2]);
    Ddi[datanum+1]=Ddi[datanum];
    Ddi[datanum]=accu<<1;
    *bp2++=(int16_t)(accu>>15); //round result to 16 bits
  }
}

BiqInput2.jpg
BiqTemplate2.jpg
OutStdTeensyLibrary2.jpg
OutMyTeensyBiq2.jpg
myTeensyBiqStructure2.jpg
myTeensyBiqCycles2.jpg
 
Code:
    *bp2++=(int16_t)(accu>>15); //round result to 16 bits
floor() is not the same as round() - you are using the fixpoint equivalent of floor(accu / 32768.0) here.
This is the fair rounding way:
Code:
    *bp2++=(int16_t)((accu + 0x4000) >>15); //round result to 16 bits without bias
That's likely where the DC bias comes from.
 
floor() is not the same as round() - you are using the fixpoint equivalent of floor(accu / 32768.0) here.
This is the fair rounding way:
Code:
    *bp2++=(int16_t)((accu + 0x4000) >>15); //round result to 16 bits without bias
That's likely where the DC bias comes from.

Yes I forgot to do that. DC offset is gone now. I also made a mistake on cycles/mac because I was counting only 5 mac's per biquad when it's really 7 (2nd term of num & den polynomial coefs can be > 1 so I scaled them by .5 and did the mac twice. I get 4.11 cycles per mac now. What are the cycles for the library biquad or how can I measure them? Why is the 32x32 multiply shifted by 32 instead of 31? Thanks.
 
I have recorded biquad outputs to SDCard instead of thru the USB interface to avoid any additional processing to them.
Here is my code and results (cycles corrected for calculation error).

What are the cycles needed for the std library biquad and why is the 32x32 multiply right shifted by 32 instead of 31?

myTeensyBiqCycles2.jpg OutMyTeensyBiqToSD2.jpg OutStdTeensyLibraryToSD2.jpg

Code:
 // Record sound as raw data to an SD card
//
// Requires the audio shield:
//   http://www.pjrc.com/store/teensy3_audio.html
//
// 1 pushbutton needs to be connected:
//   Record Button: pin 0 to GND
//
// This example code is in the public domain.

#include <Bounce.h>
#include <Audio.h>
#include <Wire.h>
#include <SPI.h>
#include <SD.h>
#include <SerialFlash.h>
#include "AudioSampleSig1and3kHz.h"
#define RECORD_PIN 0
#define DEBUG

#define stdBIQinput 0 //which mixer inputs?
#define myBIQinput 1
#define MAXITER 1722 // 1722x256(sample buffers)=440832 samples (approx. 10sec) stored in SDCARD before stopRecording

// GUItool: begin automatically generated code
AudioPlayMemory          sound0;       //xy=222.20001220703125,471.20001220703125
AudioRecordQueue         queue1;         //xy=430.20001220703125,551.2000122070312
AudioFilterBiquad        biquad1;        //xy=498.20001220703125,423.20001220703125
AudioPlayQueue           queue2;         //xy=566.2000122070312,552.2000122070312
AudioOutputI2S           i2s1;           //xy=743.2000122070312,431.2000427246094
AudioMixer4              mixer1;         //xy=745.2000122070312,533.2000122070312
AudioRecordQueue         queue3;         //xy=892.2000122070312,532.2000122070312
AudioConnection          patchCord1(sound0, queue1);
AudioConnection          patchCord2(sound0, biquad1);
AudioConnection          patchCord3(biquad1, 0, i2s1, 0);
AudioConnection          patchCord4(biquad1, 0, mixer1, stdBIQinput);
AudioConnection          patchCord5(queue2, 0, i2s1, 1);
AudioConnection          patchCord6(queue2, 0, mixer1, myBIQinput);
AudioConnection          patchCord7(mixer1, queue3);
AudioControlSGTL5000     sgtl5000_1;     //xy=506.20001220703125,665.2000122070312
// GUItool: end automatically generated code

// Bounce objects to easily and reliably read the buttons
Bounce buttonRecord = Bounce(RECORD_PIN, 8);

// Use these with the Teensy Audio Shield
#define SDCARD_CS_PIN    10
#define SDCARD_MOSI_PIN  7
#define SDCARD_SCK_PIN   14

//------------------------IIR parameters ---------------
//6th order iir lpfilter at 1100Hz (option 2 of calcfilt.m)
//for myTeensyBiq 
int32_t numi[]=
{
#include "numi.h"
};
int32_t deni[]=
{
#include "deni.h"
};
#define NUMCELLS 3
int32_t Dni[2]={0,0};
int32_t Ddi[2*NUMCELLS]={0,0,0,0,0,0};
#ifdef DEBUG
unsigned long last_time = millis();
int cyc=0,nMACs;
#endif
//for standard biquad library
double coefs0[]=
{ 
#include "iircoefs0.h"
};
double coefs1[]=
{ 
#include "iircoefs1.h"
};
double coefs2[]=
{ 
#include "iircoefs2.h"
};

//------------------------------------------------------

// Remember which mode we're doing
int mode = 0;  // 0=stopped, 1=recording, 2=stopRecording

// Choose which biquad output to be recorded -> standard Teensy library biquad or myTeensyBiquad with 32x32 Mac's
int MYBIQ=1;

// The file where data is recorded
File frec;

// counter for number of 256 sample buffers to be recorded on SDCard
int nbiter=0;

void setup() 
{
  // Configure the pushbutton pins
  pinMode(RECORD_PIN, INPUT_PULLUP);

#ifdef DEBUG
  Serial.begin(115200);
  delay(500);
#endif
  // Audio connections require memory, and the record queue
  // uses this memory to buffer incoming audio.
  AudioMemory(60);

  // Enable the audio shield, select input, and enable output
  sgtl5000_1.enable();
  sgtl5000_1.volume(0.5);

  biquad1.setCoefficients(0,coefs0);
  biquad1.setCoefficients(1,coefs1);
  biquad1.setCoefficients(2,coefs2);

  // Initialize the SD card
  SPI.setMOSI(SDCARD_MOSI_PIN);
  SPI.setSCK(SDCARD_SCK_PIN);
  if (!(SD.begin(SDCARD_CS_PIN))) {
    // stop here if no SD card, but print a message
    while (1) {
#ifdef DEBUG
      Serial.println("Unable to access the SD card");
#endif
      delay(500);
    }
  }
  queue1.begin();
} //end setup()


void loop() 
{
  //keep playing sound0 forever
  if (sound0.isPlaying() == false)
  {
#ifdef DEBUG
    Serial.println("sound0.play");
#endif
    sound0.play(AudioSampleSig1and3khz);
  }
  
  // First, read the button
  buttonRecord.update();
  // Respond to button presses
  if (buttonRecord.fallingEdge()) 
  {
#ifdef DEBUG
    Serial.println("Record Button Press");
#endif
    if (mode == 0) startRecording();
  }
  else
  {
    if (mode == 1) 
    {
      // If we'rerecording, carry on...
      continueRecording();
    }
    else
    { 
      if (mode == 2)
      // Stop Recording 
      {
        stopRecording();
      }
    }
  }
  if (queue1.available() >= 1) //myTeensyBiquad
  {
#ifdef DEBUG
    cyc = ARM_DWT_CYCCNT;
    myTeensyBiq(queue1.readBuffer(),queue2.getBuffer(),numi,deni,Dni,Ddi,NUMCELLS);
    cyc = ARM_DWT_CYCCNT - cyc;
#else
    myTeensyBiq(queue1.readBuffer(),queue2.getBuffer(),numi,deni,Dni,Ddi,NUMCELLS);
#endif
    // Free the input audio buffer
    queue1.freeBuffer();
    // and play it back into the audio queue
    queue2.playBuffer();
  } //if (queue1.available() >= 1)
#ifdef DEBUG
  if (millis() - last_time >= 2500) 
  {
    nMACs=AUDIO_BLOCK_SAMPLES*NUMCELLS*7;
    Serial.print(" myTeensyBiq cycles/sample=");
    Serial.print((float)cyc/(float)AUDIO_BLOCK_SAMPLES);
    Serial.print(" cycles/MAC=");
    Serial.println((float)cyc/(float)nMACs);
    last_time=millis();
  }
#endif
} //end loop()


void startRecording() 
{
#ifdef DEBUG
  Serial.print("startRecording, MYBIQ=");
  Serial.println(MYBIQ);
#endif
  //select which biquad output to be recorded
  selectBIQ(MYBIQ);
  
  // The SD library writes new data to the end of the
  // file, so to start a new recording, the old file
  // must be deleted before new data is written.
  if (MYBIQ)
  {
    if (SD.exists("mybiq.raw")) 
    {
      SD.remove("mybiq.raw");
    }
    frec = SD.open("mybiq.raw", FILE_WRITE);
  }
  else
  {
    if (SD.exists("stdbiq.raw")) 
    {
      SD.remove("stdbiq.raw");
    }
    frec = SD.open("stdbiq.raw", FILE_WRITE);
  }
  if (frec) 
  {
    queue3.begin();
    mode = 1;
  }
}

void continueRecording() 
{
  if (queue3.available() >= 2) 
  {
    byte buffer[512];
    // Fetch 2 blocks from the audio library and copy
    // into a 512 byte buffer.  The Arduino SD library
    // is most efficient when full 512 byte sector size
    // writes are used.
    memcpy(buffer, queue3.readBuffer(), 256);
    queue3.freeBuffer();
    memcpy(buffer+256, queue3.readBuffer(), 256);
    queue3.freeBuffer();
    // write all 512 bytes to the SD card
    elapsedMicros usec = 0;
    frec.write(buffer, 512);
    // Uncomment these lines to see how long SD writes
    // are taking.  A pair of audio blocks arrives every
    // 5802 microseconds, so hopefully most of the writes
    // take well under 5802 us.  Some will take more, as
    // the SD library also must write to the FAT tables
    // and the SD card controller manages media erase and
    // wear leveling.  The queue object can buffer
    // approximately 301700 us of audio, to allow time
    // for occasional high SD card latency, as long as
    // the average write time is under 5802 us.
    Serial.print("SD write, us=");
    Serial.println(usec);
    if (nbiter<MAXITER) nbiter++; //440832 samples (approx. 10sec) stored in SDCARD before stopRecording
    else mode=2; //stop Recording
  }
}

void stopRecording() 
{
#ifdef DEBUG
  Serial.print("stopRecording of MYBIQ=");
  Serial.println(MYBIQ);
#endif
  queue3.end();
  while (queue3.available() > 0) 
  {
    frec.write((byte*)queue3.readBuffer(), 256);
    queue3.freeBuffer();
  }
  frec.close();
  nbiter=0;
  mode=0;
  if (MYBIQ) MYBIQ=0;
  else MYBIQ=1;
}

void myTeensyBiq(int16_t *inbuf,int16_t *outbuf,int32_t *numi,int32_t *deni,int32_t *Dni,int32_t *Ddi,int numcells)
{
  int coefnum,datanum,i,j;
  int32_t input,res,accu;
  int16_t *bp1,*bp2;
  bp1=inbuf;
  bp2=outbuf; 
  // Copy from input to output buffer
  for( j = 0;j < AUDIO_BLOCK_SAMPLES;j++) 
  {
    input=((int32_t)*bp1++)<<16;
    coefnum=0;
    datanum=0;
    accu=multiply_32x32_rshift32_rounded(input,numi[coefnum]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum],numi[coefnum+1]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum],numi[coefnum+1]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum+1],numi[coefnum+2]);
    Dni[datanum+1]=Dni[datanum];
    Dni[datanum]=input;
    for (i=1;i<numcells;i++)
    { 
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],-deni[coefnum+2]);
      res=accu<<1;
      coefnum+=3;
      accu=multiply_32x32_rshift32_rounded(res,numi[coefnum]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],numi[coefnum+1]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],numi[coefnum+1]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],numi[coefnum+2]);
      Ddi[datanum+1]=Ddi[datanum];
      Ddi[datanum]=res;
      datanum+=2;
    }
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],-deni[coefnum+2]);
    Ddi[datanum+1]=Ddi[datanum];
    Ddi[datanum]=accu<<1;
    *bp2++=(int16_t)((accu+0x4000)>>15); //round result to 16 bits
  }
}

void selectBIQ(int myBIQ) //select myBIQ or stdBIQ
{
  if (myBIQ)
  {
    mixer1.gain(myBIQinput,1.0); //connect myBIQ to output
    mixer1.gain(stdBIQinput,0.0);
  }
  else
  {
    mixer1.gain(myBIQinput,0.0); //connect stdBIQ to output
    mixer1.gain(stdBIQinput,1.0);
  }
}
 
What are the cycles needed for the std library biquad and why is the 32x32 multiply right shifted by 32 instead of 31?

I'm not sure which multiply you mean, the standard biquad does 32x16 multiples which internally shift right 16,
and then shifts these results right by a further 14.

The coefficients are in fix2.30 format and the samples in fix1.15, so the multiplies give fix3.29 which and then
shifted by 14 to fix17.15 format, then truncated to 1.15 for the final result samples.

annotated filter_biquad.cpp snippet as I understand it:
Code:
void AudioFilterBiquad::update(void)
{
	audio_block_t *block;
	int32_t b0, b1, b2, a1, a2, sum;   // fix2.30  coefficients
	uint32_t in2, out2, bprev, aprev, flag;  // packed 16+16 variables for samples, delayed samlples and outputs
	uint32_t *data, *end;
	int32_t *state;
	block = receiveWritable();
	if (!block) return;
	end = (uint32_t *)(block->data) + AUDIO_BLOCK_SAMPLES/2;
	state = (int32_t *)definition;
	do {    // loop over stages
		b0 = *state++;   // read in coefficients for this stage in the filter
		b1 = *state++;
		b2 = *state++;
		a1 = *state++;
		a2 = *state++;
		bprev = *state++;
		aprev = *state++;
		sum = *state & 0x3FFF;
		data = end - AUDIO_BLOCK_SAMPLES/2;
		do {   // loop over samples in block
			in2 = *data;   //   32 bit load of 2 samples packed
			sum = signed_multiply_accumulate_32x16b(sum, b0, in2);    // the 32x16 multiply-accum on packed operands
			sum = signed_multiply_accumulate_32x16t(sum, b1, bprev); // using SMLAWT and SMLAWB instructions
			sum = signed_multiply_accumulate_32x16b(sum, b2, bprev);
			sum = signed_multiply_accumulate_32x16t(sum, a1, aprev);
			sum = signed_multiply_accumulate_32x16b(sum, a2, aprev);
			out2 = signed_saturate_rshift(sum, 16, 14);             // final shift 14 right, SSAT instruction 
			sum &= 0x3FFF;     // noise shaping 
 			sum = signed_multiply_accumulate_32x16t(sum, b0, in2);    // repeating for the other sample
			sum = signed_multiply_accumulate_32x16b(sum, b1, in2);
			sum = signed_multiply_accumulate_32x16t(sum, b2, bprev);
			sum = signed_multiply_accumulate_32x16b(sum, a1, out2);
			sum = signed_multiply_accumulate_32x16t(sum, a2, aprev);
			bprev = in2;
			aprev = pack_16b_16b(
				signed_saturate_rshift(sum, 16, 14), out2);   // packing the 2 result samples
			// retaining part of the sum is meant to implement the
			// "first order noise shaping" described in this article:
			// http://www.earlevel.com/main/2003/02/28/biquads/
			//  TODO: is logical AND really correct, or maybe it
			//        should really be signed_saturate_rshift() ???
			sum &= 0x3FFF;
			bprev = in2;
			*data++ = aprev;
		} while (data < end);
....
....
 
See myTeensyBiq() in the code which I just sent using multiply_accumulate_32x32_rshift32_rounded() & described in dspinst.h. I get better results than filter_biquad.cpp (THD=90dB instead of 72), which is normal because it's multiplying 32x32. I wanted to know the penalty in cycles (mine takes 86/sample or 29 cycles/biquad/sample). I also wanted to know why this instruction shifts the results by 32 instead of 31 so we're, nevertheless, losing 1 bit of precision. Maybe you know who I can ask about all that?

In any case, for future projects where I need better IIR performance (eg. hifi music), I will use the myTeensyBiq(). Anyway, for now, I'm just evaluating things.
 
Ah, in your code - I suspect the reason is to keep signed and unsigned version of instructions in step with each other,
and the shifting versions consistent with the 64 bit result versions.
Its possible to perform unsigned multiply using signed multiply and a couple of conditional additions, and vice-versa,
and this maybe how the processor does things internally so that the signed and unsigned operations need to be consistent.
 
Ah, in your code - I suspect the reason is to keep signed and unsigned version of instructions in step with each other,
and the shifting versions consistent with the 64 bit result versions.
Its possible to perform unsigned multiply using signed multiply and a couple of conditional additions, and vice-versa,
and this maybe how the processor does things internally so that the signed and unsigned operations need to be consistent.

1.OK got it.

2.Last question of the (next) day. I was able to measure the execution cycles/sample wit myTeensyBiq() because I have access to the beginning and end execution times but not so with the filter_biquad.cpp (at least not in my sketch that you see above). Maybe I'll have to plunge into the library code to place?:

cyc = ARM_DWT_CYCCNT;

filter_biquad.cpp();

cyc = ARM_DWT_CYCCNT - cyc;

What do you think?

Thanx!
 
OK, I have an idea on how I can compare processor usage of myTeensyBiq vs. the standard audio library biquad.

1. I know that I can use the command, biquad.processorUsage() to have the the % usage of the CPU total maximum available cycles. I get 0.26% for the 3-stage cascaded audio library biquad above.

2. I get 86.34 cycles/sample for myTeensyBiq by using ARM_DWT_CYCCNT. Knowing that the CPU runs at 600MHz, it has 13605.4 cycles available for each 44100Hz sample.
Then I should consume 86.34/13605.4 or 0.63% of the total CPU available.

Conclusion: Using a 32x32 multiplier for each MAC instead of a 32x16 as in the standard audio library Biquad, I have a usage of 0.63% instead of 0.26%. Sounds logical to me. The price to pay for a 20dB improvement in Total Harmonic Distortion for low frequency, highly selective cascaded biquads. biquad1Usage.jpg myTeensyBiqUsage.jpg
 
Final version:

Code:
// Record sound as raw data to a SD card
//
// Requires the audio shield:
//   http://www.pjrc.com/store/teensy3_audio.html
//
// 1 pushbutton needs to be connected:
//   Record Button: pin 0 to GND
//
// This example code is in the public domain.

#include <Bounce.h>
#include <Audio.h>
#include <Wire.h>
#include <SPI.h>
#include <SD.h>
#include <SerialFlash.h>
#include "AudioSampleSig1and3kHz.h"
#define RECORD_PIN 0
#define DEBUG
#define MYBIQ //instead of using standard Teensy library biquad, use myTeensyBiquad with 32x32 Mac's

AudioPlayMemory          sound0; 
AudioOutputI2S           i2s1; 
AudioRecordQueue         queue3;
AudioControlSGTL5000     sgtl5000_1;
#ifdef MYBIQ
AudioRecordQueue         queue1;
AudioPlayQueue           queue2;
AudioConnection          patchCord1(sound0, queue1);
AudioConnection          patchCord2(queue2, queue3);
AudioConnection          patchCord3(queue2, 0, i2s1, 0);
AudioConnection          patchCord4(queue2, 0, i2s1, 1);
#else
AudioFilterBiquad        biquad1;
AudioConnection          patchCord1(sound0, biquad1);
AudioConnection          patchCord2(biquad1, queue3);
AudioConnection          patchCord3(biquad1, 0, i2s1, 0);
AudioConnection          patchCord4(biquad1, 0, i2s1, 1);
#endif

// Bounce objects to easily and reliably read the buttons
Bounce buttonRecord = Bounce(RECORD_PIN, 8);

// Use these with the Teensy Audio Shield
#define SDCARD_CS_PIN    10
#define SDCARD_MOSI_PIN  7
#define SDCARD_SCK_PIN   14

//------------------------IIR parameters ---------------
//6th order iir lpfilter at 1100Hz (option 2 of calcfilt.m)
#ifdef MYBIQ
//for myTeensyBiq 
int32_t numi[]=
{
#include "numi.h"
};
int32_t deni[]=
{
#include "deni.h"
};
#define NUMCELLS 3
int32_t Dni[2]={0,0};
int32_t Ddi[2*NUMCELLS]={0,0,0,0,0,0};
#ifdef DEBUG
int cyc=0,nMACs;
#endif
#else //#ifdef MYBIQ
//for standard biquad library
double coefs0[]=
{ 
#include "iircoefs0.h"
};
double coefs1[]=
{ 
#include "iircoefs1.h"
};
double coefs2[]=
{ 
#include "iircoefs2.h"
};
#endif //#ifdef MYBIQ
//------------------------------------------------------

// Remember which mode we're doing
int mode = 0;  // 0=stopped, 1=recording, 2=stopRecording

// The file where data is recorded
File frec;
int nbiter=0;

#ifdef DEBUG
unsigned long last_time = millis();
#endif

void setup() {
  // Configure the pushbutton pins
  pinMode(RECORD_PIN, INPUT_PULLUP);

#ifdef DEBUG
  Serial.begin(115200);
  delay(500);
#endif
  // Audio connections require memory, and the record queue
  // uses this memory to buffer incoming audio.
  AudioMemory(60);

  // Enable the audio shield, select input, and enable output
  sgtl5000_1.enable();
  sgtl5000_1.volume(0.5);

#ifndef MYBIQ
  biquad1.setCoefficients(0,coefs0);
  biquad1.setCoefficients(1,coefs1);
  biquad1.setCoefficients(2,coefs2);
#endif

  // Initialize the SD card
  SPI.setMOSI(SDCARD_MOSI_PIN);
  SPI.setSCK(SDCARD_SCK_PIN);
  if (!(SD.begin(SDCARD_CS_PIN))) {
    // stop here if no SD card, but print a message
    while (1) {
#ifdef DEBUG
      Serial.println("Unable to access the SD card");
#endif
      delay(500);
    }
  }
#ifdef MYBIQ
  queue1.begin();
#endif
}


void loop() 
{
  if (sound0.isPlaying() == false) //play sound0 forever
  {
    sound0.play(AudioSampleSig1and3khz);
  }
  
  // First, read the button
  buttonRecord.update();
  
  // Respond to button presses
  if (buttonRecord.fallingEdge()) 
  {
#ifdef DEBUG
    Serial.println("-------------------");
    Serial.println("Record Button Press");
    Serial.println("-------------------");
#endif
    if (mode == 0) startRecording();
  }
  else
  { 
    if (mode == 1) 
    {
      // If we'rerecording, carry on...
      continueRecording();
    }
    else
    {
      if (mode==2)
      {
        // Stop Recording
        stopRecording();
      }
    }
  }

#ifdef MYBIQ
  //execute myBIQ
  if (queue1.available() >= 1) //Left Channel
  {
#ifdef DEBUG
    cyc = ARM_DWT_CYCCNT;
    myTeensyBiq(queue1.readBuffer(),queue2.getBuffer(),numi,deni,Dni,Ddi,NUMCELLS);
    cyc = ARM_DWT_CYCCNT - cyc;
#else
    myTeensyBiq(queue1.readBuffer(),queue2.getBuffer(),numi,deni,Dni,Ddi,NUMCELLS);
#endif
    // Free the input audio buffer
    queue1.freeBuffer();
    // and play it back into the audio queue
    queue2.playBuffer();
  } //if (queue1.available() >= 1)
#ifdef DEBUG
  if (millis() - last_time >= 2500) 
  {
    nMACs=AUDIO_BLOCK_SAMPLES*NUMCELLS*7;
    Serial.print("myTeensyBiq cycles/sample=");
    Serial.print((float)cyc/(float)AUDIO_BLOCK_SAMPLES);
    Serial.print(" cycles/MAC=");
    Serial.print((float)cyc/(float)nMACs);
    Serial.print(" ");
    Serial.print(100.*((float)cyc/(float)AUDIO_BLOCK_SAMPLES)/13605.4); //There are 13605.4 processor cycles available per sample @ 44.1kHz 
    Serial.println(" %");
    
    last_time=millis();
  }
#endif
#else  //#ifdef MYBIQ
  if (millis() - last_time >= 2500) 
  {
    Serial.print("biquad1: ");
    Serial.print(biquad1.processorUsage());
    Serial.println(" %");
    last_time=millis();
  } 
#endif //#ifdef MYBIQ  
}

void startRecording() 
{
#ifdef DEBUG
  Serial.println("--------------");
  Serial.println("startRecording");
  Serial.println("--------------");
#endif
  // The SD library writes new data to the end of the
  // file, so to start a new recording, the old file
  // must be deleted before new data is written.
#ifdef MYBIQ
  if (SD.exists("mybiq.raw")) 
  {
    SD.remove("mybiq.raw");
  }
  frec = SD.open("mybiq.raw", FILE_WRITE);
#else
  if (SD.exists("stdbiq.raw")) 
  {
    SD.remove("stdbiq.raw");
  }
  frec = SD.open("stdbiq.raw", FILE_WRITE);
#endif
  if (frec) 
  {
    queue3.begin();
    mode = 1;
  }
}

void continueRecording() 
{
  if (queue3.available() >= 2) 
  {
    byte buffer[512];
    // Fetch 2 blocks from the audio library and copy
    // into a 512 byte buffer.  The Arduino SD library
    // is most efficient when full 512 byte sector size
    // writes are used.
    memcpy(buffer, queue3.readBuffer(), 256);
    queue3.freeBuffer();
    memcpy(buffer+256, queue3.readBuffer(), 256);
    queue3.freeBuffer();
    // write all 512 bytes to the SD card
#ifdef DEBUG1
    elapsedMicros usec = 0;
#endif
    frec.write(buffer, 512);
    // Uncomment these lines to see how long SD writes
    // are taking.  A pair of audio blocks arrives every
    // 5802 microseconds, so hopefully most of the writes
    // take well under 5802 us.  Some will take more, as
    // the SD library also must write to the FAT tables
    // and the SD card controller manages media erase and
    // wear leveling.  The queue object can buffer
    // approximately 301700 us of audio, to allow time
    // for occasional high SD card latency, as long as
    // the average write time is under 5802 us.
#ifdef DEBUG1
    Serial.print("SD write, us=");
    Serial.println(usec);
#endif
    if (nbiter<1722) nbiter++; //440832 samples (approx. 10sec) stored in SDCARD before stopRecording
    else mode=2; //stopRecording
  }
}

void stopRecording() 
{
#ifdef DEBUG
  Serial.println("-------------");
  Serial.println("stopRecording");
  Serial.println("-------------");
#endif
  queue3.end();  
  while (queue3.available() > 0) 
  {
    frec.write((byte*)queue3.readBuffer(), 256);
    queue3.freeBuffer();
  }
  frec.close();
  nbiter=0;
  mode = 0;
}

#ifdef MYBIQ
void myTeensyBiq(int16_t *inbuf,int16_t *outbuf,int32_t *numi,int32_t *deni,int32_t *Dni,int32_t *Ddi,int numcells)
{
  int coefnum,datanum,i,j;
  int32_t input,res,accu;
  int16_t *bp1,*bp2;
  bp1=inbuf;
  bp2=outbuf; 
  // Copy from input to output buffer
  for( j = 0;j < AUDIO_BLOCK_SAMPLES;j++) 
  {
    input=((int32_t)*bp1++)<<16;
    coefnum=0;
    datanum=0;
    accu=multiply_32x32_rshift32_rounded(input,numi[coefnum]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum],numi[coefnum+1]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum],numi[coefnum+1]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum+1],numi[coefnum+2]);
    Dni[datanum+1]=Dni[datanum];
    Dni[datanum]=input;
    for (i=1;i<numcells;i++)
    { 
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],-deni[coefnum+2]);
      res=accu<<1;
      coefnum+=3;
      accu=multiply_32x32_rshift32_rounded(res,numi[coefnum]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],numi[coefnum+1]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],numi[coefnum+1]);
      accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],numi[coefnum+2]);
      Ddi[datanum+1]=Ddi[datanum];
      Ddi[datanum]=res;
      datanum+=2;
    }
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
    accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],-deni[coefnum+2]);
    Ddi[datanum+1]=Ddi[datanum];
    Ddi[datanum]=accu<<1;
    *bp2++=(int16_t)((accu+0x4000)>>15); //round result to 16 bits
  }
}
#endif


BiqInput2.jpg BiqTemplate2.jpg Biquad1Usage.jpg OutStdTeensyLibraryToSD2.jpg myTeensyBiqUsage.jpg OutMyTeensyBiqToSD2.jpg
 
dspinst.h problem with multiply_accumulate_32x32_rshift32_rounded(int32_t sum, int32_t a, int32_t b) and others

Seems to be a missing 0 for return sum + ((((int64_t)a * (int64_t)b) + 0x8000000) >> 32);
I do believe it should be return sum + ((((int64_t)a * (int64_t)b) + 0x80000000) >> 32);
Or maybe I've got the wrong version?
I develop my Teensy programs offline using mingw64 before integrating them into Teensy environment.


Here's my version:

Code:
 /* Audio Library for Teensy 3.X
 * Copyright (c) 2014, Paul Stoffregen, paul@pjrc.com
 *
 * Development of this audio library was funded by PJRC.COM, LLC by sales of
 * Teensy and Audio Adaptor boards.  Please support PJRC's efforts to develop
 * open source software by purchasing Teensy or other PJRC products.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice, development funding notice, and this permission
 * notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#ifndef dspinst_h_
#define dspinst_h_

#include <stdint.h>

// computes limit((val >> rshift), 2**bits)
static inline int32_t signed_saturate_rshift(int32_t val, int bits, int rshift) __attribute__((always_inline, unused));
static inline int32_t signed_saturate_rshift(int32_t val, int bits, int rshift)
{
#if defined (__ARM_ARCH_7EM__)
	int32_t out;
	asm volatile("ssat %0, %1, %2, asr %3" : "=r" (out) : "I" (bits), "r" (val), "I" (rshift));
	return out;
#elif defined(KINETISL)
	int32_t out, max;
	out = val >> rshift;
	max = 1 << (bits - 1);
	if (out >= 0) {
		if (out > max - 1) out = max - 1;
	} else {
		if (out < -max) out = -max;
	}
	return out;
#endif
}

// computes limit(val, 2**bits)
static inline int16_t saturate16(int32_t val) __attribute__((always_inline, unused));
static inline int16_t saturate16(int32_t val)
{
#if defined (__ARM_ARCH_7EM__)
	int16_t out;
	int32_t tmp;
	asm volatile("ssat %0, %1, %2" : "=r" (tmp) : "I" (16), "r" (val) );
	out = (int16_t) (tmp);
	return out;
#else
    if (val > 32767) val = 32767;
    else if (val < -32768) val = -32768;
    return val;
#endif
}

// computes ((a[31:0] * b[15:0]) >> 16)
static inline int32_t signed_multiply_32x16b(int32_t a, uint32_t b) __attribute__((always_inline, unused));
static inline int32_t signed_multiply_32x16b(int32_t a, uint32_t b)
{
#if defined (__ARM_ARCH_7EM__)
	int32_t out;
	asm volatile("smulwb %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
#elif defined(KINETISL)
	return ((int64_t)a * (int16_t)(b & 0xFFFF)) >> 16;
#endif
}

// computes ((a[31:0] * b[31:16]) >> 16)
static inline int32_t signed_multiply_32x16t(int32_t a, uint32_t b) __attribute__((always_inline, unused));
static inline int32_t signed_multiply_32x16t(int32_t a, uint32_t b)
{
#if defined (__ARM_ARCH_7EM__)
	int32_t out;
	asm volatile("smulwt %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
#elif defined(KINETISL)
	return ((int64_t)a * (int16_t)(b >> 16)) >> 16;
#endif
}

// computes (((int64_t)a[31:0] * (int64_t)b[31:0]) >> 32)
static inline int32_t multiply_32x32_rshift32(int32_t a, int32_t b) __attribute__((always_inline, unused));
static inline int32_t multiply_32x32_rshift32(int32_t a, int32_t b)
{
#if defined (__ARM_ARCH_7EM__)
	int32_t out;
	asm volatile("smmul %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
#elif defined(KINETISL)
	return ((int64_t)a * (int64_t)b) >> 32;
#endif
}

// computes (((int64_t)a[31:0] * (int64_t)b[31:0] + 0x8000000) >> 32)
static inline int32_t multiply_32x32_rshift32_rounded(int32_t a, int32_t b) __attribute__((always_inline, unused));
static inline int32_t multiply_32x32_rshift32_rounded(int32_t a, int32_t b)
{
#if defined (__ARM_ARCH_7EM__)
	int32_t out;
	asm volatile("smmulr %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
#elif defined(KINETISL)
	return (((int64_t)a * (int64_t)b) + 0x8000000) >> 32;
#endif
}

// computes sum + (((int64_t)a[31:0] * (int64_t)b[31:0] + 0x8000000) >> 32)
static inline int32_t multiply_accumulate_32x32_rshift32_rounded(int32_t sum, int32_t a, int32_t b) __attribute__((always_inline, unused));
static inline int32_t multiply_accumulate_32x32_rshift32_rounded(int32_t sum, int32_t a, int32_t b)
{
#if defined (__ARM_ARCH_7EM__)
	int32_t out;
	asm volatile("smmlar %0, %2, %3, %1" : "=r" (out) : "r" (sum), "r" (a), "r" (b));
	return out;
#elif defined(KINETISL)
	return sum + ((((int64_t)a * (int64_t)b) + 0x8000000) >> 32);
#endif
}

// computes sum - (((int64_t)a[31:0] * (int64_t)b[31:0] + 0x8000000) >> 32)
static inline int32_t multiply_subtract_32x32_rshift32_rounded(int32_t sum, int32_t a, int32_t b) __attribute__((always_inline, unused));
static inline int32_t multiply_subtract_32x32_rshift32_rounded(int32_t sum, int32_t a, int32_t b)
{
#if defined (__ARM_ARCH_7EM__)
	int32_t out;
	asm volatile("smmlsr %0, %2, %3, %1" : "=r" (out) : "r" (sum), "r" (a), "r" (b));
	return out;
#elif defined(KINETISL)
	return sum - ((((int64_t)a * (int64_t)b) + 0x8000000) >> 32);
#endif
}


// computes (a[31:16] | (b[31:16] >> 16))
static inline uint32_t pack_16t_16t(int32_t a, int32_t b) __attribute__((always_inline, unused));
static inline uint32_t pack_16t_16t(int32_t a, int32_t b)
{
#if defined (__ARM_ARCH_7EM__)
	int32_t out;
	asm volatile("pkhtb %0, %1, %2, asr #16" : "=r" (out) : "r" (a), "r" (b));
	return out;
#elif defined(KINETISL)
	return (a & 0xFFFF0000) | ((uint32_t)b >> 16);
#endif
}

// computes (a[31:16] | b[15:0])
static inline uint32_t pack_16t_16b(int32_t a, int32_t b) __attribute__((always_inline, unused));
static inline uint32_t pack_16t_16b(int32_t a, int32_t b)
{
#if defined (__ARM_ARCH_7EM__)
	int32_t out;
	asm volatile("pkhtb %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
#elif defined(KINETISL)
	return (a & 0xFFFF0000) | (b & 0x0000FFFF);
#endif
}

// computes ((a[15:0] << 16) | b[15:0])
static inline uint32_t pack_16b_16b(int32_t a, int32_t b) __attribute__((always_inline, unused));
static inline uint32_t pack_16b_16b(int32_t a, int32_t b)
{
#if defined (__ARM_ARCH_7EM__)
	int32_t out;
	asm volatile("pkhbt %0, %1, %2, lsl #16" : "=r" (out) : "r" (b), "r" (a));
	return out;
#elif defined(KINETISL)
	return (a << 16) | (b & 0x0000FFFF);
#endif
}

// computes ((a[15:0] << 16) | b[15:0])
/*
static inline uint32_t pack_16x16(int32_t a, int32_t b) __attribute__((always_inline, unused));
static inline uint32_t pack_16x16(int32_t a, int32_t b)
{
	int32_t out;
	asm volatile("pkhbt %0, %1, %2, lsl #16" : "=r" (out) : "r" (b), "r" (a));
	return out;
}
*/
#if defined (__ARM_ARCH_7EM__)
// computes (((a[31:16] + b[31:16]) << 16) | (a[15:0 + b[15:0]))  (saturates)
static inline uint32_t signed_add_16_and_16(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
static inline uint32_t signed_add_16_and_16(uint32_t a, uint32_t b)
{
	int32_t out;
	asm volatile("qadd16 %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
}

// computes (((a[31:16] - b[31:16]) << 16) | (a[15:0 - b[15:0]))  (saturates)
static inline int32_t signed_subtract_16_and_16(int32_t a, int32_t b) __attribute__((always_inline, unused));
static inline int32_t signed_subtract_16_and_16(int32_t a, int32_t b)
{
	int32_t out;
	asm volatile("qsub16 %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
}

// computes out = (((a[31:16]+b[31:16])/2) <<16) | ((a[15:0]+b[15:0])/2)
static inline int32_t signed_halving_add_16_and_16(int32_t a, int32_t b) __attribute__((always_inline, unused));
static inline int32_t signed_halving_add_16_and_16(int32_t a, int32_t b)
{
	int32_t out;
	asm volatile("shadd16 %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
}

// computes out = (((a[31:16]-b[31:16])/2) <<16) | ((a[15:0]-b[15:0])/2)
static inline int32_t signed_halving_subtract_16_and_16(int32_t a, int32_t b) __attribute__((always_inline, unused));
static inline int32_t signed_halving_subtract_16_and_16(int32_t a, int32_t b)
{
	int32_t out;
	asm volatile("shsub16 %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
}

// computes (sum + ((a[31:0] * b[15:0]) >> 16))
static inline int32_t signed_multiply_accumulate_32x16b(int32_t sum, int32_t a, uint32_t b) __attribute__((always_inline, unused));
static inline int32_t signed_multiply_accumulate_32x16b(int32_t sum, int32_t a, uint32_t b)
{
	int32_t out;
	asm volatile("smlawb %0, %2, %3, %1" : "=r" (out) : "r" (sum), "r" (a), "r" (b));
	return out;
}

// computes (sum + ((a[31:0] * b[31:16]) >> 16))
static inline int32_t signed_multiply_accumulate_32x16t(int32_t sum, int32_t a, uint32_t b) __attribute__((always_inline, unused));
static inline int32_t signed_multiply_accumulate_32x16t(int32_t sum, int32_t a, uint32_t b)
{
	int32_t out;
	asm volatile("smlawt %0, %2, %3, %1" : "=r" (out) : "r" (sum), "r" (a), "r" (b));
	return out;
}

// computes logical and, forces compiler to allocate register and use single cycle instruction
static inline uint32_t logical_and(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
static inline uint32_t logical_and(uint32_t a, uint32_t b)
{
	asm volatile("and %0, %1" : "+r" (a) : "r" (b));
	return a;
}

// computes ((a[15:0] * b[15:0]) + (a[31:16] * b[31:16]))
static inline int32_t multiply_16tx16t_add_16bx16b(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
static inline int32_t multiply_16tx16t_add_16bx16b(uint32_t a, uint32_t b)
{
	int32_t out;
	asm volatile("smuad %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
}

// computes ((a[15:0] * b[31:16]) + (a[31:16] * b[15:0]))
static inline int32_t multiply_16tx16b_add_16bx16t(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
static inline int32_t multiply_16tx16b_add_16bx16t(uint32_t a, uint32_t b)
{
	int32_t out;
	asm volatile("smuadx %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
}

// // computes sum += ((a[15:0] * b[15:0]) + (a[31:16] * b[31:16]))
static inline int64_t multiply_accumulate_16tx16t_add_16bx16b(int64_t sum, uint32_t a, uint32_t b)
{
	asm volatile("smlald %Q0, %R0, %1, %2" : "+r" (sum) : "r" (a), "r" (b));
	return sum;
}

// // computes sum += ((a[15:0] * b[31:16]) + (a[31:16] * b[15:0]))
static inline int64_t multiply_accumulate_16tx16b_add_16bx16t(int64_t sum, uint32_t a, uint32_t b)
{
	asm volatile("smlaldx %Q0, %R0, %1, %2" : "+r" (sum) : "r" (a), "r" (b));
	return sum;
}

// computes ((a[15:0] * b[15:0])
static inline int32_t multiply_16bx16b(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
static inline int32_t multiply_16bx16b(uint32_t a, uint32_t b)
{
	int32_t out;
	asm volatile("smulbb %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
}

// computes ((a[15:0] * b[31:16])
static inline int32_t multiply_16bx16t(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
static inline int32_t multiply_16bx16t(uint32_t a, uint32_t b)
{
	int32_t out;
	asm volatile("smulbt %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
}

// computes ((a[31:16] * b[15:0])
static inline int32_t multiply_16tx16b(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
static inline int32_t multiply_16tx16b(uint32_t a, uint32_t b)
{
	int32_t out;
	asm volatile("smultb %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
}

// computes ((a[31:16] * b[31:16])
static inline int32_t multiply_16tx16t(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
static inline int32_t multiply_16tx16t(uint32_t a, uint32_t b)
{
	int32_t out;
	asm volatile("smultt %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
}

// computes (a - b), result saturated to 32 bit integer range
static inline int32_t substract_32_saturate(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
static inline int32_t substract_32_saturate(uint32_t a, uint32_t b)
{
	int32_t out;
	asm volatile("qsub %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
	return out;
}

// Multiply two S.31 fractional integers, and return the 32 most significant
// bits after a shift left by the constant z.
// This comes from rockbox.org

static inline int32_t FRACMUL_SHL(int32_t x, int32_t y, int z)
{
    int32_t t, t2;
    asm ("smull    %[t], %[t2], %[a], %[b]\n\t"
         "mov      %[t2], %[t2], asl %[c]\n\t"
         "orr      %[t], %[t2], %[t], lsr %[d]\n\t"
         : [t] "=&r" (t), [t2] "=&r" (t2)
         : [a] "r" (x), [b] "r" (y),
           [c] "Mr" ((z) + 1), [d] "Mr" (31 - (z)));
    return t;
}

#endif

//get Q from PSR
static inline uint32_t get_q_psr(void) __attribute__((always_inline, unused));
static inline uint32_t get_q_psr(void)
{
  uint32_t out;
  asm ("mrs %0, APSR" : "=r" (out));
  return (out & 0x8000000)>>27;
}

//clear Q BIT in PSR
static inline void clr_q_psr(void) __attribute__((always_inline, unused));
static inline void clr_q_psr(void)
{
  uint32_t t;
  asm ("mov %[t],#0\n"
       "msr APSR_nzcvq,%0\n" : [t] "=&r" (t)::"cc");
}


#endif
 
Whoops... All but one of the occurrences of 0x8000000 seems incorrect in that file. The one in get_q_psr() is valid,
don't break that when fixing this.
 
Whoops... All but one of the occurrences of 0x8000000 seems incorrect in that file. The one in get_q_psr() is valid,
don't break that when fixing this.

No problem. I have no intention of touching any libraries. I am a simple customer who reports bugs to those who manage them.

Apparently there are few people who develop offline. I find that it's much harder to develop new algorithms in the Teensy environment. No file system, and 16-bit samples. Besides, it's cumbersome to evaluate results via an SDcard or thru degraded USB audio due to resampling.
 
Status
Not open for further replies.
Back
Top