Forum Rule: Always post complete source code & details to reproduce any issue!
Results 1 to 15 of 15

Thread: Biquad results using multiply_accumulate_32x32_rshift32_rounded

  1. #1
    Member
    Join Date
    Sep 2021
    Location
    American living in France
    Posts
    95

    Biquad results using multiply_accumulate_32x32_rshift32_rounded

    I was interesting in finding out what the differences are between using the standard Teensy biquad library and using a direct form 1 32x32 MAC for a 6th order LPF at fc=1000Hz.
    Enclosed, find the results.

    In Conclusion, I found that:

    1. There is an improvement to the frequency response (except for a -90 dB DC component).
    2. There are 86.30 cycles/sample or 5.75 cycles/MAC

    My question is:

    1. How can I remove the DC component (which I believe is due to the fact that we loose 1 bit of information due to the rshift32 instead of shifting by 31)?
    2. I would like to know what the number of cycles per sample is for the standard Teensy biquad library. (I don't have access to probe the ARM_DWT_CYCCNT at its input in my sketch).


    Code:
     
    
    #include <Audio.h>
    #include <Wire.h>
    #include <SPI.h>
    #include <SD.h>
    #include <SerialFlash.h>
    #include "AudioSampleSig1and3kHz.h"
    #include "AudioSampleSig1and3kHz.h"
    //#define DEBUG
    AudioPlayMemory          sound0;
    AudioOutputUSB           usb_out;
    AudioFilterBiquad        biquad1;        
    AudioOutputI2S           i2s2;     
    AudioRecordQueue         queue1;
    AudioPlayQueue           queue2;
    AudioConnection          patchCord1(sound0, 0, biquad1, 0);
    AudioConnection          patchCord2(sound0, 0, queue1, 0);
    AudioConnection          patchCord3(biquad1, 0, i2s2, 0);
    AudioConnection          patchCord4(queue2, 0, i2s2, 1);
    AudioConnection          patchCord5(biquad1, 0, usb_out, 0);
    AudioConnection          patchCord6(queue2, 0, usb_out, 1);
    AudioControlSGTL5000     sgtl5000_1;     //xy=523.2000122070312,327
    
    
    //const int myInput = AUDIO_INPUT_LINEIN;
    //const int myInput = AUDIO_INPUT_MIC;
    
    //------------------------IIR parameters ---------------
    //6th order iir lpfilter at 1100Hz (option 2 of calcfilt.m)
    
    //-------- for standard biquad library ----------------
    double coefs0[]=
    { 
    #include "iircoefs0.h"
    };
    double coefs1[]=
    { 
    #include "iircoefs1.h"
    };
    double coefs2[]=
    { 
    #include "iircoefs2.h"
    };
    
    //--------- for myTeensyBiq ------------------
    int32_t numi[]=
    {
    #include "numi.h"
    };
    int32_t deni[]=
    {
    #include "deni.h"
    };
    #define NUMCELLS 3
    int32_t Dni[2],Ddi[2*NUMCELLS];
    #ifdef DEBUG
    unsigned long last_time = millis();
    int cyc=0,nMACs;
    #endif
    //------------------------------------------------------
    
    void setup() {
    #ifdef DEBUG
      Serial.begin(115200);
      delay(500);
    #endif
      AudioMemory(12);
      sgtl5000_1.enable();  // Enable the audio shield
    //  sgtl5000_1.inputSelect(myInput);
      sgtl5000_1.volume(0.5);
    
      biquad1.setCoefficients(0,coefs0);
      biquad1.setCoefficients(1,coefs1);
      biquad1.setCoefficients(2,coefs2);
      // Start the record queue
      queue1.begin();
    }
    
    
    void loop() {
      if (sound0.isPlaying() == false)
      {
        sound0.play(AudioSampleSig1and3khz);
      }
      if (queue1.available() >= 1) //Left Channel
      {
    #ifdef DEBUG
        cyc = ARM_DWT_CYCCNT;
        myTeensyBiq(queue1.readBuffer(),queue2.getBuffer(),numi,deni,Dni,Ddi,NUMCELLS);
        cyc = ARM_DWT_CYCCNT - cyc;
    #else
        myTeensyBiq(queue1.readBuffer(),queue2.getBuffer(),numi,deni,Dni,Ddi,NUMCELLS);
    #endif
        // Free the input audio buffer
        queue1.freeBuffer();
        // and play it back into the audio queue
        queue2.playBuffer();
      } //if (queue1.available() >= 1)
    #ifdef DEBUG
      if (millis() - last_time >= 2500) 
      {
        nMACs=AUDIO_BLOCK_SAMPLES*NUMCELLS*5;
        Serial.print(" myTeensyBiq cycles/sample=");
        Serial.print((float)cyc/(float)AUDIO_BLOCK_SAMPLES);
        Serial.print(" cycles/MAC=");
        Serial.println((float)cyc/(float)nMACs);
        last_time=millis();
      }
    #endif
    }
    
    void myTeensyBiq(int16_t *inbuf,int16_t *outbuf,int32_t *numi,int32_t *deni,int32_t *Dni,int32_t *Ddi,int numcells)
    {
      int coefnum,datanum,i,j;
      int32_t input,res,accu;
      int16_t *bp1,*bp2;
      bp1=inbuf;
      bp2=outbuf; 
      // Copy from input to output buffer
      for( j = 0;j < AUDIO_BLOCK_SAMPLES;j++) 
      {
        input=((int32_t)*bp1++)<<16;
        coefnum=0;
        datanum=0;
        accu=multiply_32x32_rshift32_rounded(input,numi[coefnum]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum],numi[coefnum+1]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum],numi[coefnum+1]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum+1],numi[coefnum+2]);
        Dni[datanum+1]=Dni[datanum];
        Dni[datanum]=input;
        for (i=1;i<numcells;i++)
        { 
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],-deni[coefnum+2]);
          res=accu<<1;
          coefnum+=3;
          accu=multiply_32x32_rshift32_rounded(res,numi[coefnum]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],numi[coefnum+1]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],numi[coefnum+1]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],numi[coefnum+2]);
          Ddi[datanum+1]=Ddi[datanum];
          Ddi[datanum]=res;
          datanum+=2;
        }
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],-deni[coefnum+2]);
        Ddi[datanum+1]=Ddi[datanum];
        Ddi[datanum]=accu<<1;
        *bp2++=(int16_t)(accu>>15); //round result to 16 bits
      }
    }
    Click image for larger version. 

Name:	BiqInput2.jpg 
Views:	28 
Size:	42.5 KB 
ID:	26448
    Click image for larger version. 

Name:	BiqTemplate2.jpg 
Views:	29 
Size:	31.6 KB 
ID:	26449
    Click image for larger version. 

Name:	OutStdTeensyLibrary2.jpg 
Views:	31 
Size:	46.5 KB 
ID:	26450
    Click image for larger version. 

Name:	OutMyTeensyBiq2.jpg 
Views:	31 
Size:	42.7 KB 
ID:	26451
    Click image for larger version. 

Name:	myTeensyBiqStructure2.jpg 
Views:	29 
Size:	128.6 KB 
ID:	26452
    Click image for larger version. 

Name:	myTeensyBiqCycles2.jpg 
Views:	28 
Size:	99.9 KB 
ID:	26453

  2. #2
    Senior Member
    Join Date
    Jul 2020
    Posts
    1,456
    Code:
        *bp2++=(int16_t)(accu>>15); //round result to 16 bits
    floor() is not the same as round() - you are using the fixpoint equivalent of floor(accu / 32768.0) here.
    This is the fair rounding way:
    Code:
        *bp2++=(int16_t)((accu + 0x4000) >>15); //round result to 16 bits without bias
    That's likely where the DC bias comes from.

  3. #3
    Member
    Join Date
    Sep 2021
    Location
    American living in France
    Posts
    95
    Quote Originally Posted by MarkT View Post
    floor() is not the same as round() - you are using the fixpoint equivalent of floor(accu / 32768.0) here.
    This is the fair rounding way:
    Code:
        *bp2++=(int16_t)((accu + 0x4000) >>15); //round result to 16 bits without bias
    That's likely where the DC bias comes from.
    Yes I forgot to do that. DC offset is gone now. I also made a mistake on cycles/mac because I was counting only 5 mac's per biquad when it's really 7 (2nd term of num & den polynomial coefs can be > 1 so I scaled them by .5 and did the mac twice. I get 4.11 cycles per mac now. What are the cycles for the library biquad or how can I measure them? Why is the 32x32 multiply shifted by 32 instead of 31? Thanks.

  4. #4
    Member
    Join Date
    Sep 2021
    Location
    American living in France
    Posts
    95
    I have recorded biquad outputs to SDCard instead of thru the USB interface to avoid any additional processing to them.
    Here is my code and results (cycles corrected for calculation error).

    What are the cycles needed for the std library biquad and why is the 32x32 multiply right shifted by 32 instead of 31?

    Click image for larger version. 

Name:	myTeensyBiqCycles2.jpg 
Views:	21 
Size:	99.1 KB 
ID:	26565 Click image for larger version. 

Name:	OutMyTeensyBiqToSD2.jpg 
Views:	22 
Size:	56.4 KB 
ID:	26566 Click image for larger version. 

Name:	OutStdTeensyLibraryToSD2.jpg 
Views:	25 
Size:	56.3 KB 
ID:	26567

    Code:
     // Record sound as raw data to an SD card
    //
    // Requires the audio shield:
    //   http://www.pjrc.com/store/teensy3_audio.html
    //
    // 1 pushbutton needs to be connected:
    //   Record Button: pin 0 to GND
    //
    // This example code is in the public domain.
    
    #include <Bounce.h>
    #include <Audio.h>
    #include <Wire.h>
    #include <SPI.h>
    #include <SD.h>
    #include <SerialFlash.h>
    #include "AudioSampleSig1and3kHz.h"
    #define RECORD_PIN 0
    #define DEBUG
    
    #define stdBIQinput 0 //which mixer inputs?
    #define myBIQinput 1
    #define MAXITER 1722 // 1722x256(sample buffers)=440832 samples (approx. 10sec) stored in SDCARD before stopRecording
    
    // GUItool: begin automatically generated code
    AudioPlayMemory          sound0;       //xy=222.20001220703125,471.20001220703125
    AudioRecordQueue         queue1;         //xy=430.20001220703125,551.2000122070312
    AudioFilterBiquad        biquad1;        //xy=498.20001220703125,423.20001220703125
    AudioPlayQueue           queue2;         //xy=566.2000122070312,552.2000122070312
    AudioOutputI2S           i2s1;           //xy=743.2000122070312,431.2000427246094
    AudioMixer4              mixer1;         //xy=745.2000122070312,533.2000122070312
    AudioRecordQueue         queue3;         //xy=892.2000122070312,532.2000122070312
    AudioConnection          patchCord1(sound0, queue1);
    AudioConnection          patchCord2(sound0, biquad1);
    AudioConnection          patchCord3(biquad1, 0, i2s1, 0);
    AudioConnection          patchCord4(biquad1, 0, mixer1, stdBIQinput);
    AudioConnection          patchCord5(queue2, 0, i2s1, 1);
    AudioConnection          patchCord6(queue2, 0, mixer1, myBIQinput);
    AudioConnection          patchCord7(mixer1, queue3);
    AudioControlSGTL5000     sgtl5000_1;     //xy=506.20001220703125,665.2000122070312
    // GUItool: end automatically generated code
    
    // Bounce objects to easily and reliably read the buttons
    Bounce buttonRecord = Bounce(RECORD_PIN, 8);
    
    // Use these with the Teensy Audio Shield
    #define SDCARD_CS_PIN    10
    #define SDCARD_MOSI_PIN  7
    #define SDCARD_SCK_PIN   14
    
    //------------------------IIR parameters ---------------
    //6th order iir lpfilter at 1100Hz (option 2 of calcfilt.m)
    //for myTeensyBiq 
    int32_t numi[]=
    {
    #include "numi.h"
    };
    int32_t deni[]=
    {
    #include "deni.h"
    };
    #define NUMCELLS 3
    int32_t Dni[2]={0,0};
    int32_t Ddi[2*NUMCELLS]={0,0,0,0,0,0};
    #ifdef DEBUG
    unsigned long last_time = millis();
    int cyc=0,nMACs;
    #endif
    //for standard biquad library
    double coefs0[]=
    { 
    #include "iircoefs0.h"
    };
    double coefs1[]=
    { 
    #include "iircoefs1.h"
    };
    double coefs2[]=
    { 
    #include "iircoefs2.h"
    };
    
    //------------------------------------------------------
    
    // Remember which mode we're doing
    int mode = 0;  // 0=stopped, 1=recording, 2=stopRecording
    
    // Choose which biquad output to be recorded -> standard Teensy library biquad or myTeensyBiquad with 32x32 Mac's
    int MYBIQ=1;
    
    // The file where data is recorded
    File frec;
    
    // counter for number of 256 sample buffers to be recorded on SDCard
    int nbiter=0;
    
    void setup() 
    {
      // Configure the pushbutton pins
      pinMode(RECORD_PIN, INPUT_PULLUP);
    
    #ifdef DEBUG
      Serial.begin(115200);
      delay(500);
    #endif
      // Audio connections require memory, and the record queue
      // uses this memory to buffer incoming audio.
      AudioMemory(60);
    
      // Enable the audio shield, select input, and enable output
      sgtl5000_1.enable();
      sgtl5000_1.volume(0.5);
    
      biquad1.setCoefficients(0,coefs0);
      biquad1.setCoefficients(1,coefs1);
      biquad1.setCoefficients(2,coefs2);
    
      // Initialize the SD card
      SPI.setMOSI(SDCARD_MOSI_PIN);
      SPI.setSCK(SDCARD_SCK_PIN);
      if (!(SD.begin(SDCARD_CS_PIN))) {
        // stop here if no SD card, but print a message
        while (1) {
    #ifdef DEBUG
          Serial.println("Unable to access the SD card");
    #endif
          delay(500);
        }
      }
      queue1.begin();
    } //end setup()
    
    
    void loop() 
    {
      //keep playing sound0 forever
      if (sound0.isPlaying() == false)
      {
    #ifdef DEBUG
        Serial.println("sound0.play");
    #endif
        sound0.play(AudioSampleSig1and3khz);
      }
      
      // First, read the button
      buttonRecord.update();
      // Respond to button presses
      if (buttonRecord.fallingEdge()) 
      {
    #ifdef DEBUG
        Serial.println("Record Button Press");
    #endif
        if (mode == 0) startRecording();
      }
      else
      {
        if (mode == 1) 
        {
          // If we'rerecording, carry on...
          continueRecording();
        }
        else
        { 
          if (mode == 2)
          // Stop Recording 
          {
            stopRecording();
          }
        }
      }
      if (queue1.available() >= 1) //myTeensyBiquad
      {
    #ifdef DEBUG
        cyc = ARM_DWT_CYCCNT;
        myTeensyBiq(queue1.readBuffer(),queue2.getBuffer(),numi,deni,Dni,Ddi,NUMCELLS);
        cyc = ARM_DWT_CYCCNT - cyc;
    #else
        myTeensyBiq(queue1.readBuffer(),queue2.getBuffer(),numi,deni,Dni,Ddi,NUMCELLS);
    #endif
        // Free the input audio buffer
        queue1.freeBuffer();
        // and play it back into the audio queue
        queue2.playBuffer();
      } //if (queue1.available() >= 1)
    #ifdef DEBUG
      if (millis() - last_time >= 2500) 
      {
        nMACs=AUDIO_BLOCK_SAMPLES*NUMCELLS*7;
        Serial.print(" myTeensyBiq cycles/sample=");
        Serial.print((float)cyc/(float)AUDIO_BLOCK_SAMPLES);
        Serial.print(" cycles/MAC=");
        Serial.println((float)cyc/(float)nMACs);
        last_time=millis();
      }
    #endif
    } //end loop()
    
    
    void startRecording() 
    {
    #ifdef DEBUG
      Serial.print("startRecording, MYBIQ=");
      Serial.println(MYBIQ);
    #endif
      //select which biquad output to be recorded
      selectBIQ(MYBIQ);
      
      // The SD library writes new data to the end of the
      // file, so to start a new recording, the old file
      // must be deleted before new data is written.
      if (MYBIQ)
      {
        if (SD.exists("mybiq.raw")) 
        {
          SD.remove("mybiq.raw");
        }
        frec = SD.open("mybiq.raw", FILE_WRITE);
      }
      else
      {
        if (SD.exists("stdbiq.raw")) 
        {
          SD.remove("stdbiq.raw");
        }
        frec = SD.open("stdbiq.raw", FILE_WRITE);
      }
      if (frec) 
      {
        queue3.begin();
        mode = 1;
      }
    }
    
    void continueRecording() 
    {
      if (queue3.available() >= 2) 
      {
        byte buffer[512];
        // Fetch 2 blocks from the audio library and copy
        // into a 512 byte buffer.  The Arduino SD library
        // is most efficient when full 512 byte sector size
        // writes are used.
        memcpy(buffer, queue3.readBuffer(), 256);
        queue3.freeBuffer();
        memcpy(buffer+256, queue3.readBuffer(), 256);
        queue3.freeBuffer();
        // write all 512 bytes to the SD card
        elapsedMicros usec = 0;
        frec.write(buffer, 512);
        // Uncomment these lines to see how long SD writes
        // are taking.  A pair of audio blocks arrives every
        // 5802 microseconds, so hopefully most of the writes
        // take well under 5802 us.  Some will take more, as
        // the SD library also must write to the FAT tables
        // and the SD card controller manages media erase and
        // wear leveling.  The queue object can buffer
        // approximately 301700 us of audio, to allow time
        // for occasional high SD card latency, as long as
        // the average write time is under 5802 us.
        Serial.print("SD write, us=");
        Serial.println(usec);
        if (nbiter<MAXITER) nbiter++; //440832 samples (approx. 10sec) stored in SDCARD before stopRecording
        else mode=2; //stop Recording
      }
    }
    
    void stopRecording() 
    {
    #ifdef DEBUG
      Serial.print("stopRecording of MYBIQ=");
      Serial.println(MYBIQ);
    #endif
      queue3.end();
      while (queue3.available() > 0) 
      {
        frec.write((byte*)queue3.readBuffer(), 256);
        queue3.freeBuffer();
      }
      frec.close();
      nbiter=0;
      mode=0;
      if (MYBIQ) MYBIQ=0;
      else MYBIQ=1;
    }
    
    void myTeensyBiq(int16_t *inbuf,int16_t *outbuf,int32_t *numi,int32_t *deni,int32_t *Dni,int32_t *Ddi,int numcells)
    {
      int coefnum,datanum,i,j;
      int32_t input,res,accu;
      int16_t *bp1,*bp2;
      bp1=inbuf;
      bp2=outbuf; 
      // Copy from input to output buffer
      for( j = 0;j < AUDIO_BLOCK_SAMPLES;j++) 
      {
        input=((int32_t)*bp1++)<<16;
        coefnum=0;
        datanum=0;
        accu=multiply_32x32_rshift32_rounded(input,numi[coefnum]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum],numi[coefnum+1]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum],numi[coefnum+1]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum+1],numi[coefnum+2]);
        Dni[datanum+1]=Dni[datanum];
        Dni[datanum]=input;
        for (i=1;i<numcells;i++)
        { 
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],-deni[coefnum+2]);
          res=accu<<1;
          coefnum+=3;
          accu=multiply_32x32_rshift32_rounded(res,numi[coefnum]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],numi[coefnum+1]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],numi[coefnum+1]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],numi[coefnum+2]);
          Ddi[datanum+1]=Ddi[datanum];
          Ddi[datanum]=res;
          datanum+=2;
        }
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],-deni[coefnum+2]);
        Ddi[datanum+1]=Ddi[datanum];
        Ddi[datanum]=accu<<1;
        *bp2++=(int16_t)((accu+0x4000)>>15); //round result to 16 bits
      }
    }
    
    void selectBIQ(int myBIQ) //select myBIQ or stdBIQ
    {
      if (myBIQ)
      {
        mixer1.gain(myBIQinput,1.0); //connect myBIQ to output
        mixer1.gain(stdBIQinput,0.0);
      }
      else
      {
        mixer1.gain(myBIQinput,0.0); //connect stdBIQ to output
        mixer1.gain(stdBIQinput,1.0);
      }
    }

  5. #5
    Senior Member
    Join Date
    Jul 2020
    Posts
    1,456
    Quote Originally Posted by Bill Glass View Post
    What are the cycles needed for the std library biquad and why is the 32x32 multiply right shifted by 32 instead of 31?
    I'm not sure which multiply you mean, the standard biquad does 32x16 multiples which internally shift right 16,
    and then shifts these results right by a further 14.

    The coefficients are in fix2.30 format and the samples in fix1.15, so the multiplies give fix3.29 which and then
    shifted by 14 to fix17.15 format, then truncated to 1.15 for the final result samples.

    annotated filter_biquad.cpp snippet as I understand it:
    Code:
    void AudioFilterBiquad::update(void)
    {
    	audio_block_t *block;
    	int32_t b0, b1, b2, a1, a2, sum;   // fix2.30  coefficients
    	uint32_t in2, out2, bprev, aprev, flag;  // packed 16+16 variables for samples, delayed samlples and outputs
    	uint32_t *data, *end;
    	int32_t *state;
    	block = receiveWritable();
    	if (!block) return;
    	end = (uint32_t *)(block->data) + AUDIO_BLOCK_SAMPLES/2;
    	state = (int32_t *)definition;
    	do {    // loop over stages
    		b0 = *state++;   // read in coefficients for this stage in the filter
    		b1 = *state++;
    		b2 = *state++;
    		a1 = *state++;
    		a2 = *state++;
    		bprev = *state++;
    		aprev = *state++;
    		sum = *state & 0x3FFF;
    		data = end - AUDIO_BLOCK_SAMPLES/2;
    		do {   // loop over samples in block
    			in2 = *data;   //   32 bit load of 2 samples packed
    			sum = signed_multiply_accumulate_32x16b(sum, b0, in2);    // the 32x16 multiply-accum on packed operands
    			sum = signed_multiply_accumulate_32x16t(sum, b1, bprev); // using SMLAWT and SMLAWB instructions
    			sum = signed_multiply_accumulate_32x16b(sum, b2, bprev);
    			sum = signed_multiply_accumulate_32x16t(sum, a1, aprev);
    			sum = signed_multiply_accumulate_32x16b(sum, a2, aprev);
    			out2 = signed_saturate_rshift(sum, 16, 14);             // final shift 14 right, SSAT instruction 
    			sum &= 0x3FFF;     // noise shaping 
     			sum = signed_multiply_accumulate_32x16t(sum, b0, in2);    // repeating for the other sample
    			sum = signed_multiply_accumulate_32x16b(sum, b1, in2);
    			sum = signed_multiply_accumulate_32x16t(sum, b2, bprev);
    			sum = signed_multiply_accumulate_32x16b(sum, a1, out2);
    			sum = signed_multiply_accumulate_32x16t(sum, a2, aprev);
    			bprev = in2;
    			aprev = pack_16b_16b(
    				signed_saturate_rshift(sum, 16, 14), out2);   // packing the 2 result samples
    			// retaining part of the sum is meant to implement the
    			// "first order noise shaping" described in this article:
    			// http://www.earlevel.com/main/2003/02/28/biquads/
    			//  TODO: is logical AND really correct, or maybe it
    			//        should really be signed_saturate_rshift() ???
    			sum &= 0x3FFF;
    			bprev = in2;
    			*data++ = aprev;
    		} while (data < end);
    ....
    ....

  6. #6
    Member
    Join Date
    Sep 2021
    Location
    American living in France
    Posts
    95
    See myTeensyBiq() in the code which I just sent using multiply_accumulate_32x32_rshift32_rounded() & described in dspinst.h. I get better results than filter_biquad.cpp (THD=90dB instead of 72), which is normal because it's multiplying 32x32. I wanted to know the penalty in cycles (mine takes 86/sample or 29 cycles/biquad/sample). I also wanted to know why this instruction shifts the results by 32 instead of 31 so we're, nevertheless, losing 1 bit of precision. Maybe you know who I can ask about all that?

    In any case, for future projects where I need better IIR performance (eg. hifi music), I will use the myTeensyBiq(). Anyway, for now, I'm just evaluating things.

  7. #7
    Senior Member
    Join Date
    Jul 2020
    Posts
    1,456
    Ah, in your code - I suspect the reason is to keep signed and unsigned version of instructions in step with each other,
    and the shifting versions consistent with the 64 bit result versions.
    Its possible to perform unsigned multiply using signed multiply and a couple of conditional additions, and vice-versa,
    and this maybe how the processor does things internally so that the signed and unsigned operations need to be consistent.

  8. #8
    Member
    Join Date
    Sep 2021
    Location
    American living in France
    Posts
    95
    Quote Originally Posted by MarkT View Post
    Ah, in your code - I suspect the reason is to keep signed and unsigned version of instructions in step with each other,
    and the shifting versions consistent with the 64 bit result versions.
    Its possible to perform unsigned multiply using signed multiply and a couple of conditional additions, and vice-versa,
    and this maybe how the processor does things internally so that the signed and unsigned operations need to be consistent.
    1.OK got it.

    2.Last question of the (next) day. I was able to measure the execution cycles/sample wit myTeensyBiq() because I have access to the beginning and end execution times but not so with the filter_biquad.cpp (at least not in my sketch that you see above). Maybe I'll have to plunge into the library code to place?:

    cyc = ARM_DWT_CYCCNT;

    filter_biquad.cpp();

    cyc = ARM_DWT_CYCCNT - cyc;

    What do you think?

    Thanx!

  9. #9
    Member
    Join Date
    Sep 2021
    Location
    American living in France
    Posts
    95
    OK, I have an idea on how I can compare processor usage of myTeensyBiq vs. the standard audio library biquad.

    1. I know that I can use the command, biquad.processorUsage() to have the the % usage of the CPU total maximum available cycles. I get 0.26% for the 3-stage cascaded audio library biquad above.

    2. I get 86.34 cycles/sample for myTeensyBiq by using ARM_DWT_CYCCNT. Knowing that the CPU runs at 600MHz, it has 13605.4 cycles available for each 44100Hz sample.
    Then I should consume 86.34/13605.4 or 0.63% of the total CPU available.

    Conclusion: Using a 32x32 multiplier for each MAC instead of a 32x16 as in the standard audio library Biquad, I have a usage of 0.63% instead of 0.26%. Sounds logical to me. The price to pay for a 20dB improvement in Total Harmonic Distortion for low frequency, highly selective cascaded biquads. Click image for larger version. 

Name:	biquad1Usage.jpg 
Views:	18 
Size:	47.4 KB 
ID:	26590 Click image for larger version. 

Name:	myTeensyBiqUsage.jpg 
Views:	17 
Size:	95.3 KB 
ID:	26591

  10. #10
    Member
    Join Date
    Sep 2021
    Location
    American living in France
    Posts
    95
    Final version:

    Code:
    // Record sound as raw data to a SD card
    //
    // Requires the audio shield:
    //   http://www.pjrc.com/store/teensy3_audio.html
    //
    // 1 pushbutton needs to be connected:
    //   Record Button: pin 0 to GND
    //
    // This example code is in the public domain.
    
    #include <Bounce.h>
    #include <Audio.h>
    #include <Wire.h>
    #include <SPI.h>
    #include <SD.h>
    #include <SerialFlash.h>
    #include "AudioSampleSig1and3kHz.h"
    #define RECORD_PIN 0
    #define DEBUG
    #define MYBIQ //instead of using standard Teensy library biquad, use myTeensyBiquad with 32x32 Mac's
    
    AudioPlayMemory          sound0; 
    AudioOutputI2S           i2s1; 
    AudioRecordQueue         queue3;
    AudioControlSGTL5000     sgtl5000_1;
    #ifdef MYBIQ
    AudioRecordQueue         queue1;
    AudioPlayQueue           queue2;
    AudioConnection          patchCord1(sound0, queue1);
    AudioConnection          patchCord2(queue2, queue3);
    AudioConnection          patchCord3(queue2, 0, i2s1, 0);
    AudioConnection          patchCord4(queue2, 0, i2s1, 1);
    #else
    AudioFilterBiquad        biquad1;
    AudioConnection          patchCord1(sound0, biquad1);
    AudioConnection          patchCord2(biquad1, queue3);
    AudioConnection          patchCord3(biquad1, 0, i2s1, 0);
    AudioConnection          patchCord4(biquad1, 0, i2s1, 1);
    #endif
    
    // Bounce objects to easily and reliably read the buttons
    Bounce buttonRecord = Bounce(RECORD_PIN, 8);
    
    // Use these with the Teensy Audio Shield
    #define SDCARD_CS_PIN    10
    #define SDCARD_MOSI_PIN  7
    #define SDCARD_SCK_PIN   14
    
    //------------------------IIR parameters ---------------
    //6th order iir lpfilter at 1100Hz (option 2 of calcfilt.m)
    #ifdef MYBIQ
    //for myTeensyBiq 
    int32_t numi[]=
    {
    #include "numi.h"
    };
    int32_t deni[]=
    {
    #include "deni.h"
    };
    #define NUMCELLS 3
    int32_t Dni[2]={0,0};
    int32_t Ddi[2*NUMCELLS]={0,0,0,0,0,0};
    #ifdef DEBUG
    int cyc=0,nMACs;
    #endif
    #else //#ifdef MYBIQ
    //for standard biquad library
    double coefs0[]=
    { 
    #include "iircoefs0.h"
    };
    double coefs1[]=
    { 
    #include "iircoefs1.h"
    };
    double coefs2[]=
    { 
    #include "iircoefs2.h"
    };
    #endif //#ifdef MYBIQ
    //------------------------------------------------------
    
    // Remember which mode we're doing
    int mode = 0;  // 0=stopped, 1=recording, 2=stopRecording
    
    // The file where data is recorded
    File frec;
    int nbiter=0;
    
    #ifdef DEBUG
    unsigned long last_time = millis();
    #endif
    
    void setup() {
      // Configure the pushbutton pins
      pinMode(RECORD_PIN, INPUT_PULLUP);
    
    #ifdef DEBUG
      Serial.begin(115200);
      delay(500);
    #endif
      // Audio connections require memory, and the record queue
      // uses this memory to buffer incoming audio.
      AudioMemory(60);
    
      // Enable the audio shield, select input, and enable output
      sgtl5000_1.enable();
      sgtl5000_1.volume(0.5);
    
    #ifndef MYBIQ
      biquad1.setCoefficients(0,coefs0);
      biquad1.setCoefficients(1,coefs1);
      biquad1.setCoefficients(2,coefs2);
    #endif
    
      // Initialize the SD card
      SPI.setMOSI(SDCARD_MOSI_PIN);
      SPI.setSCK(SDCARD_SCK_PIN);
      if (!(SD.begin(SDCARD_CS_PIN))) {
        // stop here if no SD card, but print a message
        while (1) {
    #ifdef DEBUG
          Serial.println("Unable to access the SD card");
    #endif
          delay(500);
        }
      }
    #ifdef MYBIQ
      queue1.begin();
    #endif
    }
    
    
    void loop() 
    {
      if (sound0.isPlaying() == false) //play sound0 forever
      {
        sound0.play(AudioSampleSig1and3khz);
      }
      
      // First, read the button
      buttonRecord.update();
      
      // Respond to button presses
      if (buttonRecord.fallingEdge()) 
      {
    #ifdef DEBUG
        Serial.println("-------------------");
        Serial.println("Record Button Press");
        Serial.println("-------------------");
    #endif
        if (mode == 0) startRecording();
      }
      else
      { 
        if (mode == 1) 
        {
          // If we'rerecording, carry on...
          continueRecording();
        }
        else
        {
          if (mode==2)
          {
            // Stop Recording
            stopRecording();
          }
        }
      }
    
    #ifdef MYBIQ
      //execute myBIQ
      if (queue1.available() >= 1) //Left Channel
      {
    #ifdef DEBUG
        cyc = ARM_DWT_CYCCNT;
        myTeensyBiq(queue1.readBuffer(),queue2.getBuffer(),numi,deni,Dni,Ddi,NUMCELLS);
        cyc = ARM_DWT_CYCCNT - cyc;
    #else
        myTeensyBiq(queue1.readBuffer(),queue2.getBuffer(),numi,deni,Dni,Ddi,NUMCELLS);
    #endif
        // Free the input audio buffer
        queue1.freeBuffer();
        // and play it back into the audio queue
        queue2.playBuffer();
      } //if (queue1.available() >= 1)
    #ifdef DEBUG
      if (millis() - last_time >= 2500) 
      {
        nMACs=AUDIO_BLOCK_SAMPLES*NUMCELLS*7;
        Serial.print("myTeensyBiq cycles/sample=");
        Serial.print((float)cyc/(float)AUDIO_BLOCK_SAMPLES);
        Serial.print(" cycles/MAC=");
        Serial.print((float)cyc/(float)nMACs);
        Serial.print(" ");
        Serial.print(100.*((float)cyc/(float)AUDIO_BLOCK_SAMPLES)/13605.4); //There are 13605.4 processor cycles available per sample @ 44.1kHz 
        Serial.println(" %");
        
        last_time=millis();
      }
    #endif
    #else  //#ifdef MYBIQ
      if (millis() - last_time >= 2500) 
      {
        Serial.print("biquad1: ");
        Serial.print(biquad1.processorUsage());
        Serial.println(" %");
        last_time=millis();
      } 
    #endif //#ifdef MYBIQ  
    }
    
    void startRecording() 
    {
    #ifdef DEBUG
      Serial.println("--------------");
      Serial.println("startRecording");
      Serial.println("--------------");
    #endif
      // The SD library writes new data to the end of the
      // file, so to start a new recording, the old file
      // must be deleted before new data is written.
    #ifdef MYBIQ
      if (SD.exists("mybiq.raw")) 
      {
        SD.remove("mybiq.raw");
      }
      frec = SD.open("mybiq.raw", FILE_WRITE);
    #else
      if (SD.exists("stdbiq.raw")) 
      {
        SD.remove("stdbiq.raw");
      }
      frec = SD.open("stdbiq.raw", FILE_WRITE);
    #endif
      if (frec) 
      {
        queue3.begin();
        mode = 1;
      }
    }
    
    void continueRecording() 
    {
      if (queue3.available() >= 2) 
      {
        byte buffer[512];
        // Fetch 2 blocks from the audio library and copy
        // into a 512 byte buffer.  The Arduino SD library
        // is most efficient when full 512 byte sector size
        // writes are used.
        memcpy(buffer, queue3.readBuffer(), 256);
        queue3.freeBuffer();
        memcpy(buffer+256, queue3.readBuffer(), 256);
        queue3.freeBuffer();
        // write all 512 bytes to the SD card
    #ifdef DEBUG1
        elapsedMicros usec = 0;
    #endif
        frec.write(buffer, 512);
        // Uncomment these lines to see how long SD writes
        // are taking.  A pair of audio blocks arrives every
        // 5802 microseconds, so hopefully most of the writes
        // take well under 5802 us.  Some will take more, as
        // the SD library also must write to the FAT tables
        // and the SD card controller manages media erase and
        // wear leveling.  The queue object can buffer
        // approximately 301700 us of audio, to allow time
        // for occasional high SD card latency, as long as
        // the average write time is under 5802 us.
    #ifdef DEBUG1
        Serial.print("SD write, us=");
        Serial.println(usec);
    #endif
        if (nbiter<1722) nbiter++; //440832 samples (approx. 10sec) stored in SDCARD before stopRecording
        else mode=2; //stopRecording
      }
    }
    
    void stopRecording() 
    {
    #ifdef DEBUG
      Serial.println("-------------");
      Serial.println("stopRecording");
      Serial.println("-------------");
    #endif
      queue3.end();  
      while (queue3.available() > 0) 
      {
        frec.write((byte*)queue3.readBuffer(), 256);
        queue3.freeBuffer();
      }
      frec.close();
      nbiter=0;
      mode = 0;
    }
    
    #ifdef MYBIQ
    void myTeensyBiq(int16_t *inbuf,int16_t *outbuf,int32_t *numi,int32_t *deni,int32_t *Dni,int32_t *Ddi,int numcells)
    {
      int coefnum,datanum,i,j;
      int32_t input,res,accu;
      int16_t *bp1,*bp2;
      bp1=inbuf;
      bp2=outbuf; 
      // Copy from input to output buffer
      for( j = 0;j < AUDIO_BLOCK_SAMPLES;j++) 
      {
        input=((int32_t)*bp1++)<<16;
        coefnum=0;
        datanum=0;
        accu=multiply_32x32_rshift32_rounded(input,numi[coefnum]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum],numi[coefnum+1]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum],numi[coefnum+1]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Dni[datanum+1],numi[coefnum+2]);
        Dni[datanum+1]=Dni[datanum];
        Dni[datanum]=input;
        for (i=1;i<numcells;i++)
        { 
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],-deni[coefnum+2]);
          res=accu<<1;
          coefnum+=3;
          accu=multiply_32x32_rshift32_rounded(res,numi[coefnum]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],numi[coefnum+1]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],numi[coefnum+1]);
          accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],numi[coefnum+2]);
          Ddi[datanum+1]=Ddi[datanum];
          Ddi[datanum]=res;
          datanum+=2;
        }
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum],-deni[coefnum+1]);
        accu=multiply_accumulate_32x32_rshift32_rounded(accu,Ddi[datanum+1],-deni[coefnum+2]);
        Ddi[datanum+1]=Ddi[datanum];
        Ddi[datanum]=accu<<1;
        *bp2++=(int16_t)((accu+0x4000)>>15); //round result to 16 bits
      }
    }
    #endif

    Click image for larger version. 

Name:	BiqInput2.jpg 
Views:	21 
Size:	42.5 KB 
ID:	26602 Click image for larger version. 

Name:	BiqTemplate2.jpg 
Views:	20 
Size:	31.6 KB 
ID:	26603 Click image for larger version. 

Name:	Biquad1Usage.jpg 
Views:	21 
Size:	47.4 KB 
ID:	26604 Click image for larger version. 

Name:	OutStdTeensyLibraryToSD2.jpg 
Views:	16 
Size:	56.3 KB 
ID:	26599 Click image for larger version. 

Name:	myTeensyBiqUsage.jpg 
Views:	20 
Size:	94.7 KB 
ID:	26601 Click image for larger version. 

Name:	OutMyTeensyBiqToSD2.jpg 
Views:	18 
Size:	56.4 KB 
ID:	26600

  11. #11
    Member
    Join Date
    Sep 2021
    Location
    American living in France
    Posts
    95
    dspinst.h problem with multiply_accumulate_32x32_rshift32_rounded(int32_t sum, int32_t a, int32_t b) and others

    Seems to be a missing 0 for return sum + ((((int64_t)a * (int64_t)b) + 0x8000000) >> 32);
    I do believe it should be return sum + ((((int64_t)a * (int64_t)b) + 0x80000000) >> 32);
    Or maybe I've got the wrong version?
    I develop my Teensy programs offline using mingw64 before integrating them into Teensy environment.


    Here's my version:

    Code:
     /* Audio Library for Teensy 3.X
     * Copyright (c) 2014, Paul Stoffregen, paul@pjrc.com
     *
     * Development of this audio library was funded by PJRC.COM, LLC by sales of
     * Teensy and Audio Adaptor boards.  Please support PJRC's efforts to develop
     * open source software by purchasing Teensy or other PJRC products.
     *
     * Permission is hereby granted, free of charge, to any person obtaining a copy
     * of this software and associated documentation files (the "Software"), to deal
     * in the Software without restriction, including without limitation the rights
     * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     * copies of the Software, and to permit persons to whom the Software is
     * furnished to do so, subject to the following conditions:
     *
     * The above copyright notice, development funding notice, and this permission
     * notice shall be included in all copies or substantial portions of the Software.
     *
     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     * THE SOFTWARE.
     */
    
    #ifndef dspinst_h_
    #define dspinst_h_
    
    #include <stdint.h>
    
    // computes limit((val >> rshift), 2**bits)
    static inline int32_t signed_saturate_rshift(int32_t val, int bits, int rshift) __attribute__((always_inline, unused));
    static inline int32_t signed_saturate_rshift(int32_t val, int bits, int rshift)
    {
    #if defined (__ARM_ARCH_7EM__)
    	int32_t out;
    	asm volatile("ssat %0, %1, %2, asr %3" : "=r" (out) : "I" (bits), "r" (val), "I" (rshift));
    	return out;
    #elif defined(KINETISL)
    	int32_t out, max;
    	out = val >> rshift;
    	max = 1 << (bits - 1);
    	if (out >= 0) {
    		if (out > max - 1) out = max - 1;
    	} else {
    		if (out < -max) out = -max;
    	}
    	return out;
    #endif
    }
    
    // computes limit(val, 2**bits)
    static inline int16_t saturate16(int32_t val) __attribute__((always_inline, unused));
    static inline int16_t saturate16(int32_t val)
    {
    #if defined (__ARM_ARCH_7EM__)
    	int16_t out;
    	int32_t tmp;
    	asm volatile("ssat %0, %1, %2" : "=r" (tmp) : "I" (16), "r" (val) );
    	out = (int16_t) (tmp);
    	return out;
    #else
        if (val > 32767) val = 32767;
        else if (val < -32768) val = -32768;
        return val;
    #endif
    }
    
    // computes ((a[31:0] * b[15:0]) >> 16)
    static inline int32_t signed_multiply_32x16b(int32_t a, uint32_t b) __attribute__((always_inline, unused));
    static inline int32_t signed_multiply_32x16b(int32_t a, uint32_t b)
    {
    #if defined (__ARM_ARCH_7EM__)
    	int32_t out;
    	asm volatile("smulwb %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    #elif defined(KINETISL)
    	return ((int64_t)a * (int16_t)(b & 0xFFFF)) >> 16;
    #endif
    }
    
    // computes ((a[31:0] * b[31:16]) >> 16)
    static inline int32_t signed_multiply_32x16t(int32_t a, uint32_t b) __attribute__((always_inline, unused));
    static inline int32_t signed_multiply_32x16t(int32_t a, uint32_t b)
    {
    #if defined (__ARM_ARCH_7EM__)
    	int32_t out;
    	asm volatile("smulwt %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    #elif defined(KINETISL)
    	return ((int64_t)a * (int16_t)(b >> 16)) >> 16;
    #endif
    }
    
    // computes (((int64_t)a[31:0] * (int64_t)b[31:0]) >> 32)
    static inline int32_t multiply_32x32_rshift32(int32_t a, int32_t b) __attribute__((always_inline, unused));
    static inline int32_t multiply_32x32_rshift32(int32_t a, int32_t b)
    {
    #if defined (__ARM_ARCH_7EM__)
    	int32_t out;
    	asm volatile("smmul %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    #elif defined(KINETISL)
    	return ((int64_t)a * (int64_t)b) >> 32;
    #endif
    }
    
    // computes (((int64_t)a[31:0] * (int64_t)b[31:0] + 0x8000000) >> 32)
    static inline int32_t multiply_32x32_rshift32_rounded(int32_t a, int32_t b) __attribute__((always_inline, unused));
    static inline int32_t multiply_32x32_rshift32_rounded(int32_t a, int32_t b)
    {
    #if defined (__ARM_ARCH_7EM__)
    	int32_t out;
    	asm volatile("smmulr %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    #elif defined(KINETISL)
    	return (((int64_t)a * (int64_t)b) + 0x8000000) >> 32;
    #endif
    }
    
    // computes sum + (((int64_t)a[31:0] * (int64_t)b[31:0] + 0x8000000) >> 32)
    static inline int32_t multiply_accumulate_32x32_rshift32_rounded(int32_t sum, int32_t a, int32_t b) __attribute__((always_inline, unused));
    static inline int32_t multiply_accumulate_32x32_rshift32_rounded(int32_t sum, int32_t a, int32_t b)
    {
    #if defined (__ARM_ARCH_7EM__)
    	int32_t out;
    	asm volatile("smmlar %0, %2, %3, %1" : "=r" (out) : "r" (sum), "r" (a), "r" (b));
    	return out;
    #elif defined(KINETISL)
    	return sum + ((((int64_t)a * (int64_t)b) + 0x8000000) >> 32);
    #endif
    }
    
    // computes sum - (((int64_t)a[31:0] * (int64_t)b[31:0] + 0x8000000) >> 32)
    static inline int32_t multiply_subtract_32x32_rshift32_rounded(int32_t sum, int32_t a, int32_t b) __attribute__((always_inline, unused));
    static inline int32_t multiply_subtract_32x32_rshift32_rounded(int32_t sum, int32_t a, int32_t b)
    {
    #if defined (__ARM_ARCH_7EM__)
    	int32_t out;
    	asm volatile("smmlsr %0, %2, %3, %1" : "=r" (out) : "r" (sum), "r" (a), "r" (b));
    	return out;
    #elif defined(KINETISL)
    	return sum - ((((int64_t)a * (int64_t)b) + 0x8000000) >> 32);
    #endif
    }
    
    
    // computes (a[31:16] | (b[31:16] >> 16))
    static inline uint32_t pack_16t_16t(int32_t a, int32_t b) __attribute__((always_inline, unused));
    static inline uint32_t pack_16t_16t(int32_t a, int32_t b)
    {
    #if defined (__ARM_ARCH_7EM__)
    	int32_t out;
    	asm volatile("pkhtb %0, %1, %2, asr #16" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    #elif defined(KINETISL)
    	return (a & 0xFFFF0000) | ((uint32_t)b >> 16);
    #endif
    }
    
    // computes (a[31:16] | b[15:0])
    static inline uint32_t pack_16t_16b(int32_t a, int32_t b) __attribute__((always_inline, unused));
    static inline uint32_t pack_16t_16b(int32_t a, int32_t b)
    {
    #if defined (__ARM_ARCH_7EM__)
    	int32_t out;
    	asm volatile("pkhtb %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    #elif defined(KINETISL)
    	return (a & 0xFFFF0000) | (b & 0x0000FFFF);
    #endif
    }
    
    // computes ((a[15:0] << 16) | b[15:0])
    static inline uint32_t pack_16b_16b(int32_t a, int32_t b) __attribute__((always_inline, unused));
    static inline uint32_t pack_16b_16b(int32_t a, int32_t b)
    {
    #if defined (__ARM_ARCH_7EM__)
    	int32_t out;
    	asm volatile("pkhbt %0, %1, %2, lsl #16" : "=r" (out) : "r" (b), "r" (a));
    	return out;
    #elif defined(KINETISL)
    	return (a << 16) | (b & 0x0000FFFF);
    #endif
    }
    
    // computes ((a[15:0] << 16) | b[15:0])
    /*
    static inline uint32_t pack_16x16(int32_t a, int32_t b) __attribute__((always_inline, unused));
    static inline uint32_t pack_16x16(int32_t a, int32_t b)
    {
    	int32_t out;
    	asm volatile("pkhbt %0, %1, %2, lsl #16" : "=r" (out) : "r" (b), "r" (a));
    	return out;
    }
    */
    #if defined (__ARM_ARCH_7EM__)
    // computes (((a[31:16] + b[31:16]) << 16) | (a[15:0 + b[15:0]))  (saturates)
    static inline uint32_t signed_add_16_and_16(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
    static inline uint32_t signed_add_16_and_16(uint32_t a, uint32_t b)
    {
    	int32_t out;
    	asm volatile("qadd16 %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    }
    
    // computes (((a[31:16] - b[31:16]) << 16) | (a[15:0 - b[15:0]))  (saturates)
    static inline int32_t signed_subtract_16_and_16(int32_t a, int32_t b) __attribute__((always_inline, unused));
    static inline int32_t signed_subtract_16_and_16(int32_t a, int32_t b)
    {
    	int32_t out;
    	asm volatile("qsub16 %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    }
    
    // computes out = (((a[31:16]+b[31:16])/2) <<16) | ((a[15:0]+b[15:0])/2)
    static inline int32_t signed_halving_add_16_and_16(int32_t a, int32_t b) __attribute__((always_inline, unused));
    static inline int32_t signed_halving_add_16_and_16(int32_t a, int32_t b)
    {
    	int32_t out;
    	asm volatile("shadd16 %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    }
    
    // computes out = (((a[31:16]-b[31:16])/2) <<16) | ((a[15:0]-b[15:0])/2)
    static inline int32_t signed_halving_subtract_16_and_16(int32_t a, int32_t b) __attribute__((always_inline, unused));
    static inline int32_t signed_halving_subtract_16_and_16(int32_t a, int32_t b)
    {
    	int32_t out;
    	asm volatile("shsub16 %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    }
    
    // computes (sum + ((a[31:0] * b[15:0]) >> 16))
    static inline int32_t signed_multiply_accumulate_32x16b(int32_t sum, int32_t a, uint32_t b) __attribute__((always_inline, unused));
    static inline int32_t signed_multiply_accumulate_32x16b(int32_t sum, int32_t a, uint32_t b)
    {
    	int32_t out;
    	asm volatile("smlawb %0, %2, %3, %1" : "=r" (out) : "r" (sum), "r" (a), "r" (b));
    	return out;
    }
    
    // computes (sum + ((a[31:0] * b[31:16]) >> 16))
    static inline int32_t signed_multiply_accumulate_32x16t(int32_t sum, int32_t a, uint32_t b) __attribute__((always_inline, unused));
    static inline int32_t signed_multiply_accumulate_32x16t(int32_t sum, int32_t a, uint32_t b)
    {
    	int32_t out;
    	asm volatile("smlawt %0, %2, %3, %1" : "=r" (out) : "r" (sum), "r" (a), "r" (b));
    	return out;
    }
    
    // computes logical and, forces compiler to allocate register and use single cycle instruction
    static inline uint32_t logical_and(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
    static inline uint32_t logical_and(uint32_t a, uint32_t b)
    {
    	asm volatile("and %0, %1" : "+r" (a) : "r" (b));
    	return a;
    }
    
    // computes ((a[15:0] * b[15:0]) + (a[31:16] * b[31:16]))
    static inline int32_t multiply_16tx16t_add_16bx16b(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
    static inline int32_t multiply_16tx16t_add_16bx16b(uint32_t a, uint32_t b)
    {
    	int32_t out;
    	asm volatile("smuad %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    }
    
    // computes ((a[15:0] * b[31:16]) + (a[31:16] * b[15:0]))
    static inline int32_t multiply_16tx16b_add_16bx16t(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
    static inline int32_t multiply_16tx16b_add_16bx16t(uint32_t a, uint32_t b)
    {
    	int32_t out;
    	asm volatile("smuadx %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    }
    
    // // computes sum += ((a[15:0] * b[15:0]) + (a[31:16] * b[31:16]))
    static inline int64_t multiply_accumulate_16tx16t_add_16bx16b(int64_t sum, uint32_t a, uint32_t b)
    {
    	asm volatile("smlald %Q0, %R0, %1, %2" : "+r" (sum) : "r" (a), "r" (b));
    	return sum;
    }
    
    // // computes sum += ((a[15:0] * b[31:16]) + (a[31:16] * b[15:0]))
    static inline int64_t multiply_accumulate_16tx16b_add_16bx16t(int64_t sum, uint32_t a, uint32_t b)
    {
    	asm volatile("smlaldx %Q0, %R0, %1, %2" : "+r" (sum) : "r" (a), "r" (b));
    	return sum;
    }
    
    // computes ((a[15:0] * b[15:0])
    static inline int32_t multiply_16bx16b(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
    static inline int32_t multiply_16bx16b(uint32_t a, uint32_t b)
    {
    	int32_t out;
    	asm volatile("smulbb %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    }
    
    // computes ((a[15:0] * b[31:16])
    static inline int32_t multiply_16bx16t(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
    static inline int32_t multiply_16bx16t(uint32_t a, uint32_t b)
    {
    	int32_t out;
    	asm volatile("smulbt %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    }
    
    // computes ((a[31:16] * b[15:0])
    static inline int32_t multiply_16tx16b(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
    static inline int32_t multiply_16tx16b(uint32_t a, uint32_t b)
    {
    	int32_t out;
    	asm volatile("smultb %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    }
    
    // computes ((a[31:16] * b[31:16])
    static inline int32_t multiply_16tx16t(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
    static inline int32_t multiply_16tx16t(uint32_t a, uint32_t b)
    {
    	int32_t out;
    	asm volatile("smultt %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    }
    
    // computes (a - b), result saturated to 32 bit integer range
    static inline int32_t substract_32_saturate(uint32_t a, uint32_t b) __attribute__((always_inline, unused));
    static inline int32_t substract_32_saturate(uint32_t a, uint32_t b)
    {
    	int32_t out;
    	asm volatile("qsub %0, %1, %2" : "=r" (out) : "r" (a), "r" (b));
    	return out;
    }
    
    // Multiply two S.31 fractional integers, and return the 32 most significant
    // bits after a shift left by the constant z.
    // This comes from rockbox.org
    
    static inline int32_t FRACMUL_SHL(int32_t x, int32_t y, int z)
    {
        int32_t t, t2;
        asm ("smull    %[t], %[t2], %[a], %[b]\n\t"
             "mov      %[t2], %[t2], asl %[c]\n\t"
             "orr      %[t], %[t2], %[t], lsr %[d]\n\t"
             : [t] "=&r" (t), [t2] "=&r" (t2)
             : [a] "r" (x), [b] "r" (y),
               [c] "Mr" ((z) + 1), [d] "Mr" (31 - (z)));
        return t;
    }
    
    #endif
    
    //get Q from PSR
    static inline uint32_t get_q_psr(void) __attribute__((always_inline, unused));
    static inline uint32_t get_q_psr(void)
    {
      uint32_t out;
      asm ("mrs %0, APSR" : "=r" (out));
      return (out & 0x8000000)>>27;
    }
    
    //clear Q BIT in PSR
    static inline void clr_q_psr(void) __attribute__((always_inline, unused));
    static inline void clr_q_psr(void)
    {
      uint32_t t;
      asm ("mov %[t],#0\n"
           "msr APSR_nzcvq,%0\n" : [t] "=&r" (t)::"cc");
    }
    
    
    #endif

  12. #12
    Senior Member
    Join Date
    Jul 2020
    Posts
    1,456
    Whoops... All but one of the occurrences of 0x8000000 seems incorrect in that file. The one in get_q_psr() is valid,
    don't break that when fixing this.

  13. #13
    Member
    Join Date
    Sep 2021
    Location
    American living in France
    Posts
    95
    Quote Originally Posted by MarkT View Post
    Whoops... All but one of the occurrences of 0x8000000 seems incorrect in that file. The one in get_q_psr() is valid,
    don't break that when fixing this.
    No problem. I have no intention of touching any libraries. I am a simple customer who reports bugs to those who manage them.

    Apparently there are few people who develop offline. I find that it's much harder to develop new algorithms in the Teensy environment. No file system, and 16-bit samples. Besides, it's cumbersome to evaluate results via an SDcard or thru degraded USB audio due to resampling.

  14. #14
    Senior Member
    Join Date
    Jul 2020
    Posts
    1,456
    Pull request for this created:
    https://github.com/PaulStoffregen/Audio/pull/426

  15. #15
    Member
    Join Date
    Sep 2021
    Location
    American living in France
    Posts
    95
    Quote Originally Posted by MarkT View Post
    Thanx for the info

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •