SPI, DMA, EventResponder?

cinhcet

Active member
Hi,

In the SPI code https://github.com/PaulStoffregen/SPI/blob/master/SPI.cpp#L766 one can find an SPI transfer function that takes an EventResponderRef as one of its arguments.
Is there any documentation about this feature? I found this thread https://forum.pjrc.com/threads/60289-Large-SPI-Transfers-on-Teensy?p=234623&viewfull=1#post234623 where it is said that this is "experimental".

Does this work with any SPI port on a teensy 3.6 or only on the first one? Does this have something to do with the FIFO capability of the first SPI port?

Background: I want to drive 40 OLED displays with a Teensy 3.6. To achieve reasonable update rates, my plan is to split them into two groups of 20, each group on its own SPI port, so that I can compute the new content for one display while the transfer to another is in progress.

Thanks!
 
Sorry, there is not much documentation on the EventResponder, other than the material in that thread and the examples I have posted in various places.

It should work on any SPI port on the T3.6, it did the last time I tried it. It was more problematic with T3.5.

The only caveat is with DMA: it often appears to work better if the buffer you are DMA-ing into or out of is aligned on a 32-byte boundary. (Although that may matter more on the T4...)
 
Thanks for your reply!!

I will test it and report here.

I was looking in more detail into the EventResponder class and its relation to SPI.
For what I want to achieve, I would simply need an async transfer function and then a blocking "waitForFinished" function.
With the current EventResponder API this seems unnecessarily complex to implement.
As far as I can tell, there is already a state variable in the SPI class "_dma_state" which could be exposed in order to check if a transfer is ongoing or is completed.
Any opinions?

What about this function waitForEvent here: https://github.com/PaulStoffregen/cores/blob/master/teensy3/EventResponder.h#L173
 
For what it is worth, here is one of the test programs I was using when I did some of the development and testing of SPI using Async...

Code:
#include <SPI.h>
#include <EventResponder.h>
// Chip-select pin; the sketch drives it by hand around each transfer.
#define CS_PIN 10
// Set to true by asyncEventResponder(); polled by the wait loops in loop().
volatile bool event_happened = false;

// Event object passed to SPI.transfer() to request an asynchronous transfer.
EventResponder event;
// Test pattern to transmit. NOTE: sizeof(buffer) also counts the string's
// terminating NUL byte, so that byte is part of every transfer.
static const uint8_t buffer[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

// Callback attached to `event` in loop(): raises chip select again and sets
// the flag that loop() polls while waiting for the transfer to finish.
// The event_responder argument is not used.
void asyncEventResponder(EventResponderRef event_responder)
{
  digitalWriteFast(CS_PIN, HIGH);  // deselect the device
  event_happened = true;           // release the wait loop in loop()
  //Serial.println("Event happened");
}
void setup() {
  pinMode(CS_PIN, OUTPUT);
  digitalWriteFast(CS_PIN, HIGH);
  while (!Serial && millis() < 4000) ;  // wait for Serial port
  Serial.begin(115200);
  SPI.begin();
  Serial.println("SPI Test program");
  Serial1.begin(2000000);
  Serial2.begin(2000000);
  Serial3.begin(2000000);
  extern const uint8_t _serialEvent_default;
  extern const uint8_t _serialEvent1_default;
  extern const uint8_t _serialEvent2_default;
  extern const uint8_t _serialEvent3_default;
  Serial.printf("Default serialEvent? %d %d %d %d\n", _serialEvent_default,
                _serialEvent1_default, _serialEvent2_default, _serialEvent3_default);
#if defined(__IMXRT1062__)
  Serial4.begin(2000000);
  Serial5.begin(2000000);
  Serial6.begin(2000000);
  Serial7.begin(2000000);
  //Serial8.begin(2000000);
  extern const uint8_t _serialEvent4_default;
  extern const uint8_t _serialEvent5_default;
  extern const uint8_t _serialEvent6_default;
  extern const uint8_t _serialEvent7_default;
  //  extern const uint8_t _serialEvent8_default;
  Serial.printf("    %d %d %d %d\n", _serialEvent4_default,
                _serialEvent5_default, _serialEvent6_default, _serialEvent7_default);
#endif
}

void TimeYieldCalls(const char *sz) {
  yield();
  Serial.print(sz); Serial.flush();
  elapsedMicros em = 0;
  for (uint32_t i = 0; i < 1000; i++) yield();
  uint32_t elapsed = em;
  Serial.print(": ");
  Serial.println(elapsed, DEC);
  Serial.flush();
}

void loop() {
  while (Serial.read() != -1) ; // Make sure queue is empty.
  Serial.println("Press any key to run test");
  while (!Serial.available()) ; // will loop until it receives something
  while (Serial.read() != -1) ; // loop until queue is empty

  Serial.printf("start test yield_active_check_flags %x\n", yield_active_check_flags);
  Serial.printf("  systick ISR: %x\n", (uint32_t) _VectorsRam[15]);
  TimeYieldCalls("Start");

  // First try with immediate call.
  event.attachImmediate(&asyncEventResponder);
  Serial.printf("Test Immediate: %x %x\n", yield_active_check_flags, (uint32_t) _VectorsRam[15]);
  event.clearEvent();
  digitalWriteFast(CS_PIN, LOW);
  SPI.beginTransaction(SPISettings(2000000, MSBFIRST, SPI_MODE0));
  SPI.transfer(buffer, NULL, sizeof(buffer), event);
  while (!event_happened) ;
  SPI.endTransaction();
  TimeYieldCalls("After Immediate");

  // Use yield .
  event.detach();
  event.attach(&asyncEventResponder);
  Serial.printf("Test yield: %x %x\n", yield_active_check_flags, (uint32_t) _VectorsRam[15]);
  event.clearEvent();
  digitalWriteFast(CS_PIN, LOW);
  SPI.transfer(buffer, NULL, sizeof(buffer), event);
  while (!event_happened) ;
  TimeYieldCalls("After yield");

  // Use Interrupt .
  event.detach();
  event.attachInterrupt(&asyncEventResponder);
  Serial.printf("Test Interrupt: %x %x\n", yield_active_check_flags, (uint32_t) _VectorsRam[15]);
  event.clearEvent();
  digitalWriteFast(CS_PIN, LOW);
  SPI.transfer(buffer, NULL, sizeof(buffer), event);
  while (!event_happened) ;
  TimeYieldCalls("After Interrupt");
  Serial2.write(buffer, sizeof(buffer));
  delay(5000);

}
// Copies everything currently readable from Serial1 to the USB serial port.
void XserialEvent1() {
  for (int incoming = Serial1.read(); incoming != -1; incoming = Serial1.read()) {
    Serial.write(incoming);
  }
}
// Reads one value from the USB serial port and writes it straight back.
void XserialEvent() {
  const int incoming = Serial.read();
  Serial.write(incoming);
}
// Forwards all pending SerialUSB1 input to the main USB serial port.
void serialEventUSB1() {
  while (SerialUSB1.available() != 0) {
    const int incoming = SerialUSB1.read();
    Serial.write(incoming);
  }
}

// Forwards all pending SerialUSB2 input to the main USB serial port.
void serialEventUSB2() {
  while (SerialUSB2.available() != 0) {
    const int incoming = SerialUSB2.read();
    Serial.write(incoming);
  }
}

Here is another test program, where I will try to mark up some of the interesting things on how I often use this:

Code:
#include <SPI.h>
#include <EventResponder.h>
// SPI port under test and the serial port used for debug output.
#define SPIT SPI1
#define DBGSerial Serial
// Chip-select pin; driven by hand around every transfer.
#define CS_PIN 0

// Byte count used by the "small" tests.
#define SMALL_TRANSFER_SIZE 128
// Byte count used by the "full" tests: 0x12000 = 73728 bytes.
#define BUFFER_SIZE 0x12000l   // More than 64K
//uint8_t buffer[BUFFER_SIZE];
// Transmit buffer; allocated with malloc() in setup().
uint8_t *buffer;  // lets malloc it...
//uint8_t rxBuffer[SMALL_TRANSFER_SIZE];
// Receive buffer. NOTE(review): DMAMEM is a placement attribute from the
// Teensy core — presumably DMA-capable memory; confirm for the target board.
DMAMEM uint8_t rxBuffer[BUFFER_SIZE];
//uint8_t *rxBuffer;
// Not referenced anywhere else in this sketch.
uint8_t *foo_bar = nullptr;
// Only its address is printed in setup(), next to a stack buffer's address.
uint8_t static_buffer[16];

[COLOR="#FF0000"]EventResponder event;[/COLOR]
[COLOR="#FF0000"]volatile bool event_happened = false;
void asyncEventResponder(EventResponderRef event_responder)
{
  digitalWriteFast(CS_PIN, HIGH);
  event_happened = true;
}
[/COLOR]
void setup() {
  // debug pins
  uint8_t stack_buffer[10];
//  pinMode(0, OUTPUT);
//  pinMode(1, OUTPUT);
//  digitalWrite(0, LOW);
//  digitalWrite(1, LOW);
  extern unsigned long _heap_start;
  extern unsigned long _heap_end;

  pinMode(CS_PIN, OUTPUT);
  digitalWriteFast(CS_PIN, HIGH);
  while (!Serial && millis() < 4000) ;  // wait for Serial port
  DBGSerial.begin(115200);
  delay(500);
  DBGSerial.println("SPI Test program");
  SPIT.begin();

  buffer = (uint8_t *)malloc(BUFFER_SIZE);
  //rxBuffer = (uint8_t *)malloc(BUFFER_SIZE);


  DBGSerial.print("Buffer: ");
  DBGSerial.print((uint32_t)buffer, HEX);
  DBGSerial.print(" RX Buffer: ");
  DBGSerial.print((uint32_t)rxBuffer, HEX);
  DBGSerial.print(" ");
  DBGSerial.println(BUFFER_SIZE, DEC);
  DBGSerial.printf("Static buffer: %x, Stack Buffer: %x\n", (uint32_t)static_buffer, (uint32_t)stack_buffer);
  DBGSerial.printf("Heap Start: %x, Heap End: %x\n", (uint32_t)&_heap_start, (uint32_t)&_heap_end);
[COLOR="#FF0000"]  event.attachImmediate(&asyncEventResponder);[/COLOR]
}
int nn = 0;
void loop() {
//  while (DBGSerial.read() != -1) ; // Make sure queue is empty.
//  DBGSerial.println("Press any key to run test");
  //while (!DBGSerial.available()) ; // will loop until it receives something
//  while (DBGSerial.read() != -1) ; // loop until queue is empty
  DBGSerial.println("Ready to start tests");


  SPIT.beginTransaction(SPISettings(8000000, MSBFIRST, SPI_MODE0));
  DBGSerial.println("After Begin Transaction");

  //=================================================================
  // Transfer Sync
  //=================================================================


  for (uint32_t i = 0; i < BUFFER_SIZE; i++) buffer[i] = i & 0xff;
  for (uint32_t i = 0; i < BUFFER_SIZE; i++)  rxBuffer[i] = 0x5a;
  DBGSerial.println("Transfer Small"); //DBGSerial.flush();
  digitalWriteFast(CS_PIN, LOW);
  SPIT.transfer(buffer, rxBuffer, SMALL_TRANSFER_SIZE);
  digitalWriteFast(CS_PIN, HIGH);
  DBGSerial.println("*** Completed ***"); DBGSerial.flush();
  dumpBuffer(buffer, SMALL_TRANSFER_SIZE);
  DBGSerial.println();
  dumpBuffer(rxBuffer, SMALL_TRANSFER_SIZE);
  validateTXBuffer(0);
  delay(5);

  for (uint32_t i = 0; i < BUFFER_SIZE; i++) buffer[i] = i & 0xff;
  DBGSerial.println("write Small"); DBGSerial.flush();
  digitalWriteFast(CS_PIN, LOW);
  SPIT.transfer(buffer, NULL, SMALL_TRANSFER_SIZE);
  digitalWriteFast(CS_PIN, HIGH);
  DBGSerial.println("*** Completed ***"); DBGSerial.flush();
  validateTXBuffer(0);
  delay(5);

  for (uint32_t i = 0; i < BUFFER_SIZE; i++) buffer[i] = i & 0xff;
  DBGSerial.println("read Small"); DBGSerial.flush();
  digitalWriteFast(CS_PIN, LOW);
  SPIT.transfer(NULL, rxBuffer, SMALL_TRANSFER_SIZE);
  digitalWriteFast(CS_PIN, HIGH);
  DBGSerial.println("*** Completed ***"); DBGSerial.flush();
  dumpBuffer(rxBuffer, SMALL_TRANSFER_SIZE);
  delay(5);

  SPIT.beginTransaction(SPISettings(2000000, MSBFIRST, SPI_MODE0));
  for (uint32_t i = 0; i < BUFFER_SIZE; i++)buffer[i] = i / 1024;
  for (uint32_t i = 0; i < BUFFER_SIZE; i++)  rxBuffer[i] = 0x5a;

  DBGSerial.println("Transfer Full"); DBGSerial.flush();
  digitalWriteFast(CS_PIN, LOW);
  SPIT.transfer(buffer, rxBuffer, BUFFER_SIZE);
  digitalWriteFast(CS_PIN, HIGH);
  validateTXBuffer(1);
  delay(5);

  for (uint32_t i = 0; i < BUFFER_SIZE; i++) buffer[i] = i / 1024;
  DBGSerial.println("write full"); DBGSerial.flush();
  digitalWriteFast(CS_PIN, LOW);
  SPIT.transfer(buffer, NULL, BUFFER_SIZE);
  digitalWriteFast(CS_PIN, HIGH);
  validateTXBuffer(1);
  delay(5);

  for (uint32_t i = 0; i < BUFFER_SIZE; i++) buffer[i] = i & 0xff;
  DBGSerial.println("read full"); DBGSerial.flush();
  digitalWriteFast(CS_PIN, LOW);
  SPIT.transfer(NULL, buffer, BUFFER_SIZE);
  digitalWriteFast(CS_PIN, HIGH);
  delay(5);
  //=================================================================
  // Transfer Async
  //=================================================================
  for (uint32_t i = 0; i < 5; i++) {
    digitalWriteFast(CS_PIN, LOW);
    delay(1);
    digitalWriteFast(CS_PIN, HIGH);
    delay(1);
  }
[COLOR="#FF0000"]  event_happened = false;[/COLOR]
  for (uint32_t i = 0; i < BUFFER_SIZE; i++) buffer[i] = i & 0xff;
  DBGSerial.println("Async write Small"); DBGSerial.flush();
  digitalWriteFast(CS_PIN, LOW);
[COLOR="#FF0000"]  SPIT.transfer(buffer, NULL, SMALL_TRANSFER_SIZE, event);[/COLOR]
  DBGSerial.println("After write call, waiting for event");
[COLOR="#FF0000"]  while (!event_happened) ;
  event_happened = false;[/COLOR]
  validateTXBuffer(0);
  delay(5);

  for (uint32_t i = 0; i < BUFFER_SIZE; i++) buffer[i] = i & 0xff;
  for (uint32_t i = 0; i < BUFFER_SIZE; i++)  rxBuffer[i] = 0x5a;
  DBGSerial.println("Async Transfer Small"); DBGSerial.flush();
  digitalWriteFast(CS_PIN, LOW);
  SPIT.transfer(buffer, rxBuffer, SMALL_TRANSFER_SIZE, event);
  DBGSerial.println("After Transfer call, waiting for event");
  while (!event_happened) ;
  event_happened = false;
  dumpBuffer(buffer, SMALL_TRANSFER_SIZE);
  DBGSerial.println();
  dumpBuffer(rxBuffer, SMALL_TRANSFER_SIZE);
  validateTXBuffer(0);
  delay(5);


  for (uint32_t i = 0; i < BUFFER_SIZE; i++) buffer[i] = i & 0xff;
  for (uint32_t i = 0; i < BUFFER_SIZE; i++)  rxBuffer[i] = 0x5a;
  DBGSerial.println("Async read Small"); DBGSerial.flush();
  digitalWriteFast(CS_PIN, LOW);
  SPIT.setTransferWriteFill(0x42);
  SPIT.transfer(NULL, rxBuffer, SMALL_TRANSFER_SIZE, event);
  //arm_dcache_delete(rxBuffer, SMALL_TRANSFER_SIZE);
  while (!event_happened) ;
  event_happened = false;
  dumpBuffer(rxBuffer, SMALL_TRANSFER_SIZE);
  validateTXBuffer(0);
  delay(5);

  for (uint32_t i = 0; i < BUFFER_SIZE; i++) buffer[i] = i / 1024;
  for (uint32_t i = 0; i < BUFFER_SIZE; i++)  rxBuffer[i] = 0x5a;
  DBGSerial.println("Async Transfer Full"); DBGSerial.flush();
  digitalWriteFast(CS_PIN, LOW);
  SPIT.transfer(buffer, rxBuffer, BUFFER_SIZE, event);
  while (!event_happened) ;
  event_happened = false;
  dumpBuffer(rxBuffer, 512);
  validateTXBuffer(1);
  delay(5);

  for (uint32_t i = 0; i < BUFFER_SIZE; i++) buffer[i] = i / 1024;
  DBGSerial.println("Async write full"); DBGSerial.flush();
  digitalWriteFast(CS_PIN, LOW);
  SPIT.transfer(buffer, NULL, BUFFER_SIZE, event);
  while (!event_happened) ;
  event_happened = false;
  validateTXBuffer(1);
  delay(5);

  for (uint32_t i = 0; i < BUFFER_SIZE; i++) buffer[i] = i & 0xff;
  for (uint32_t i = 0; i < BUFFER_SIZE; i++)  rxBuffer[i] = 0x5a;
  DBGSerial.println("Async read full"); DBGSerial.flush();
  digitalWriteFast(CS_PIN, LOW);
  SPIT.transfer(NULL, rxBuffer, BUFFER_SIZE, event);
  while (!event_happened) ;
  event_happened = false;
  dumpBuffer(rxBuffer, 512);
  validateTXBuffer(0);
  delay(5);


  DBGSerial.println("Tests completed");
  SPIT.endTransaction();
}

// Prints `cb` bytes starting at `pb` to the debug port in hex, separated by
// spaces, with a line break after every 16 values and one more at the end.
void dumpBuffer(uint8_t *pb, int cb) {
  uint8_t column = 0;
  for (; cb != 0; --cb) {
    DBGSerial.print(*pb++, HEX);
    DBGSerial.print(" ");
    if (++column == 16) {
      DBGSerial.println();
      column = 0;
    }
  }
  DBGSerial.println();
}
// Checks that the transmit buffer still holds the pattern loop() wrote into
// it, i.e. that a transfer did not overwrite it.
//   test == 0: expects buffer[i] == (i & 0xff)
//   test == 1: expects buffer[i] == (i / 1024)
// Each mismatch is reported on its own line; checking stops after 10 of them.
void validateTXBuffer(uint8_t test)
{
  uint8_t error_count = 0;
  for (int i = 0; i < BUFFER_SIZE; i++) {
    if (((test == 0) && (buffer[i] != (i & 0xff)))
        || ((test == 1) && (buffer[i] != (i / 1024)))) {
      DBGSerial.print("Tx Buffer validate failed Index: ");
      DBGSerial.print(i, DEC);
      DBGSerial.print(" Value: ");
      DBGSerial.println(buffer[i], HEX);
      error_count++;
      if (error_count == 10)
        break;
    }
  }
}
Note: in the loops that wait for the event to happen, I may also add some form of timeout.
 
SPI DMA seems to work on SPI and SPI1 on teensy 3.6

I hacked around in the SPI class and made _dma_state volatile as well as making it public. Then you can query this state without having to implement an EventResponder (and having the implications with yield() etc).
Something like:
Code:
// Poll the (hacked-public) DMA state until the transfer is reported complete.
while(true) {
    if(SPI._dma_state == SPIClass::DMAState::completed) break;
    delayMicroseconds(10);
}

With this, I can fill the screen of two SSD1306 OLEDs with text/boxes and updating them via SPI within 0.75 ms
 
Last edited:
Comparing Kurt's code with my hack of exposing _dma_state, I am wondering if there is interest in a pull request that exposes _dma_state through a getter function?
Then SPI_HAS_TRANSFER_ASYNC would always be 1. The only thing I am uncertain about is what the transfer API should look like, because at the moment normal and async transfers are distinguished only by whether an EventResponder REFERENCE is passed to the transfer function, which is quite implicit, I would say.

Or are there plans to make the concept of the EventResponder more ubiquitous for example also for async i2c?
 
Comparing Kurt's code with my hack of exposing _dma_state, I am wondering if there is interest in a pull request that exposes _dma_state through a getter function?
Then SPI_HAS_TRANSFER_ASYNC would always be 1. The only thing I am uncertain about is what the transfer API should look like, because at the moment normal and async transfers are distinguished only by whether an EventResponder REFERENCE is passed to the transfer function, which is quite implicit, I would say.

Or are there plans to make the concept of the EventResponder more ubiquitous for example also for async i2c?

First, the last part: the EventResponder concept is something Paul set up and would like to fully integrate, to take care of several different areas of event-like code... So will it make it into something like I2C?
Not sure. Probably would be nice, but will it ever get up to a high enough priority to get done? I am not sure. Obviously if someone took that on and did it, Paul would probably be interested, in reviewing and maybe adding.

Originally when I was wanting to add Asynchronous SPI, my first implementations were with just passing in a pointer to callback function, but at that time Paul wanted to go through the EventResponder as a good test case, which also gave you options on when that gets called. So in the case when you want it fast, just setting up the responder to call immediate did not add a whole lot of overhead.
 
Originally when I was wanting to add Asynchronous SPI, my first implementations were with just passing in a pointer to callback function, but at that time Paul wanted to go through the EventResponder as a good test case, which also gave you options on when that gets called. So in the case when you want it fast, just setting up the responder to call immediate did not add a whole lot of overhead.

Thanks a lot for your answer!
From an outsider's perspective — someone who did not know the history or current state of the EventResponder — I would argue that the EventResponder API should be optional, and that a more direct way of passing a pointer to a callback function, as well as the ability to query the DMA state, could be a good addition to the SPI class.
 
I understand about simple callback...

As for the DMA state: the main issue may be that how the DMA works differs between processors. For example, the T3.5 is sort of screwy if you try to use SPI1 or SPI2, as there is only one DMA event for both RX and TX and you can only use it for one of them... So it has to do other tricks. Also, if your copy is larger than one DMA transfer can handle, the underlying handler then restarts the DMA at the next location... So the idea is that you hopefully don't need to know all of those details.
 
Ok, then I have misunderstood the code. As far as I could tell, _dma_state really tells if the whole transfer has finished, even if there are multiple ones needed.
 
Ok, then I have misunderstood the code. As far as I could tell, _dma_state really tells if the whole transfer has finished, even if there are multiple ones needed.
You may be right that I kept an internal variable for state information. I have not used it in awhile.

At the time we only wanted to add a minimal set of new APIs to SPI that differed from the ones the Arduino boards support. I also had an async Transfer16 version at the time, but that was removed.
 
Wise folk! I'd really love your help. I've jumped through a lot of hoops on this project already, including making 3-wire SPI happen (because when I did my due diligence I wasn't thorough enough and read 3-wire SPI as simply meaning no MISO connection.. this cost me a day of my life as I've already made the PCB's!). Now I'd really love to make async SPI transfers to my display, as while I'm using just a grayscale OLED, it still takes a good 4ms to write the whole display at 20MHz, which is far too long for a program that also has to monitor rotary encoders, buttons etc.

I had a look at what you did above, KurtE, but that's for the transfer function in SPI.h, and I've borrowed from the transfer16 function which doesn't yet have the EventResponder stuff setup in it. I think also you'll probably tell me there's a more efficient way to shift to SPI, or at least one that is better optimised for async.

The three relevant chunks are as follows. The first is the loop to push the buffer to the display. The second is the function it calls to push that to SPI, and the third is my crude 9-bit / 3-wire business in SPI.h.

Any help very, very greatly appreciated.

C++:
 for (uint8_t row = 0; row <= 63; row++) {
    uint8_t bytes_remaining = 128;
    ptr = buff + row * 128;
    while (bytes_remaining) {
      uint8_t to_write = min(bytes_remaining, 128);
      oledBufWrite(ptr, to_write);
      ptr += to_write;
      bytes_remaining -= to_write;
      yield();
    }
  }
C++:
void oledBufWrite(const uint8_t *oledData, size_t len) {
  SPI.beginTransaction(SPISettings(20000000, MSBFIRST, SPI_MODE3));
  digitalWriteFast(20,LOW);
  SPI.transfer9(oledData, len, false);
  digitalWriteFast(20,HIGH);
  SPI.endTransaction();
}
C++:
// Blocking transfer of `count` 9-bit frames.
//   buf    - bytes to send; when null, _transferWriteFill is sent instead.
//   retbuf - receives one byte per frame; may be null to discard input.
//            Only the low 8 bits of each received frame are kept.
//   count  - number of frames; 0 returns immediately.
//   cmd    - when false, 0x100 is added to every outgoing frame (sets bit 8);
//            presumably the data/command bit of 3-wire SPI — confirm against
//            the display datasheet.
// The TCR register is saved on entry and restored before returning.
void SPIClass::transfer9(const void * buf, void * retbuf, size_t count, bool cmd) {

    if (count == 0) return;

    uint8_t *p_write = (uint8_t*)buf;
    uint8_t *p_read = (uint8_t*)retbuf;
    size_t count_read = count;  // frames still expected back from the receive side
 
    uint32_t tcr = port().TCR;
    port().TCR = (tcr & 0xfffff000) | LPSPI_TCR_FRAMESZ(8);  // turn on 9 bit mode

    // Lets clear the reader queue
    port().CR = LPSPI_CR_RRF | LPSPI_CR_MEN;    // clear the queue and make sure still enabled.

    while (count > 0) {
        // Push out the next byte;
        uint16_t tempBig = p_write? *p_write++ : _transferWriteFill;
        if (!cmd) tempBig += 0x100;  // set bit 8 of the 9-bit frame
        port().TDR = tempBig;
        count--; // how many bytes left to output.
        // Make sure queue is not full before pushing next byte out
        // (received frames are drained while waiting for the TDF flag)
        do {
            if ((port().RSR & LPSPI_RSR_RXEMPTY) == 0)  {
                uint8_t b = port().RDR;  // Read any pending RX bytes in
                if (p_read) *p_read++ = b;
                count_read--;
            }
        } while ((port().SR & LPSPI_SR_TDF) == 0) ;
    }

    // now lets wait for all of the read bytes to be returned...
    while (count_read) {
        if ((port().RSR & LPSPI_RSR_RXEMPTY) == 0)  {
            uint8_t b = port().RDR;  // Read any pending RX bytes in
            if (p_read) *p_read++ = b;
            count_read--;
        }
    }
 
    port().TCR = tcr;    // restore back
}
And accordingly this is in SPI.h:
C++:
    // Full 9-bit transfer: sends `count` frames from `buf` and stores what
    // comes back in `retbuf` (may be NULL). Defined in SPI.cpp.
    void transfer9(const void * buf, void * retbuf, size_t count, bool cmd);
    // Write-only convenience overload: forwards to the full version with no
    // receive buffer. `buf` is const so that callers holding read-only data
    // (e.g. a `const uint8_t *` frame buffer) can pass it without a cast.
    void inline transfer9(const void *buf, size_t count, bool cmd) {
        transfer9(buf, NULL, count, cmd);
    }
 
Last edited:
Back
Top