How best to manage multiple SPI busses?

KurtE · May 5, 2017

Note: I put in a Pull Request to core project to fix DMAChannel.h issue I mentioned.

I now have a test program driving 3 Teensyview displays. The test code is a mess, but it tries to run each of the tests or partial tests without blocking the rest of the system. I had them running with no delays and I had my Logic analyzer showing two of the three buses:
with MOSI, SCLK, CS, DC So the two were using all 8 channels.

Maybe later will use my older 16 channel one for all three buses. ...

This appears to be working well enough to beat up SPI Async code... So far so good for T3.6. Probably next setup will be to try on
T3.5 all three busses. Then T3.2 1 and TLC 2...

Code:

#include <SPI.h>
#include <TeensyView.h>  // Include the SFE_TeensyView library

///////////////////////////////////
// TeensyView Object Declaration //
///////////////////////////////////
//#define TLC_SPI1
//#define T36_SPI1
//#define T36_SPI2
//#define DEFAULT_PINS
#ifdef DEFAULT_PINS
#define PIN_RESET 15
#define PIN_DC    5
#define PIN_CS    10
#define PIN_SCK   13
#define PIN_MOSI  11
#endif

#ifdef TLC_SPI1
#define PIN_RESET 15
#define PIN_SCK   20
#define PIN_MOSI  21
#define PIN_DC    4
#define PIN_CS    3
#endif

#ifdef T36_SPI1
#define PIN_RESET 15
#define PIN_SCK   20
#define PIN_MOSI  21
#define PIN_DC    31
#define PIN_CS    32
#endif

#ifdef T36_SPI2
#define PIN_RESET 15
#define PIN_SCK   53
#define PIN_MOSI  52
#define PIN_DC    55
#define PIN_CS    51
#endif

// Kurt's setup
#ifndef PIN_SCK
#define PIN_RESET 15
#define PIN_SCK   13
#define PIN_MOSI  11
#define PIN_DC    21
#define PIN_CS    20
// Setup 2nd one SPI1
#define PIN_RESET1 16
#define PIN_SCK1  32
#define PIN_MOSI1 0
#define PIN_DC1   31
#define PIN_CS1  30

// Pins on connector on Beta T3.6 board (3.3, GND)(48, 47)(57 56) (51 52) (53 55)
#define PIN_RESET2 48
//#define PIN_MISO2 51
#define PIN_MOSI2 52
#define PIN_SCK2  53
#define PIN_DC2   55
#define PIN_CS2  56


#endif


// One TeensyView object per display. Note that oled1 and oled2 use the
// PIN_*1 / PIN_*2 macros, which are only defined in the "Kurt's setup" branch
// above (i.e. when none of the single-display configurations is selected).
TeensyView oled(PIN_RESET, PIN_DC, PIN_CS, PIN_SCK, PIN_MOSI);
// The extra argument (64) is presumably the panel height of the 128x64
// display - TODO confirm against the TeensyView constructor.
TeensyView oled1(PIN_RESET1, PIN_DC1, PIN_CS1, PIN_SCK1, PIN_MOSI1, 64);
TeensyView oled2(PIN_RESET2, PIN_DC2, PIN_CS2, PIN_SCK2, PIN_MOSI2);


// Per-display state, kept as parallel arrays indexed the same way as oleds[].
TeensyView *oleds[] = {&oled, &oled1, &oled2};
// Which test (the case number in loop()'s switch) each display is running.
uint8_t oled_which_test[] = {0, 0, 0};
// Progress/state word owned by the running test; 0xffff marks "first call",
// 0 tells loop() that the test has finished.
uint16_t test_iterations_left[] = {0xffff, 0xffff, 0xffff};
// millis() value when a test step was last started (used for the "hung" report).
uint32_t last_test_start_time[] = {0, 0, 0};
// Earliest millis() value at which the next test step may be started.
uint32_t next_test_start_time[] = {0, 0, 0};
// Forward declarations of the test steps defined further down in this sketch.
extern void testRects(TeensyView *_oled, bool draw_async, uint16_t &iterations_left);
extern void testCircles(TeensyView *_oled, bool draw_async, uint16_t &iterations_left);
extern void TestpixelsAsync(TeensyView *_oled, uint16_t &iterations_left);
extern void TestFillRects(TeensyView *_oled, uint16_t &iterations_left);
extern void testdrawline(TeensyView *_oled, uint16_t &iterations_left, uint32_t &next_test_start_time);

// One-time initialisation: bring up the serial port and all three displays,
// show each display's initial buffer for a second, then wipe them and seed
// the random number generator used by the pixel test.
void setup()
{
  // Give the host up to 3 seconds to open the serial port.
  while (!Serial && millis() < 3000) {
  }
  Serial.begin(38400);

  // First display: initialise, wipe its internal memory, push the buffer.
  oled.begin();
  oled.clear(HARDWARE_MEM);
  oled.display();

  // Second display, same sequence, with a progress message.
  oled1.begin();
  oled1.clear(HARDWARE_MEM);
  oled1.display();
  Serial.println("oled1 displayed");

  // Third display, same sequence, with a progress message.
  oled2.begin();
  oled2.clear(HARDWARE_MEM);
  oled2.display();
  Serial.println("oled2 displayed");

  // Leave the initial image up for one second before wiping the panels.
  delay(1000);
  oled.clear(HARDWARE_MEM);
  oled1.clear(HARDWARE_MEM);
  oled2.clear(HARDWARE_MEM);

  // Seed from two analog reads.
  const int seed = analogRead(A0) + analogRead(A1);
  randomSeed(seed);
}

void loop()
{
  // Lets see which of our displays is ready to display something different
  for (uint8_t i = 0; i < sizeof(oleds)/sizeof(oleds[0]); i++ ) {
    if ((millis() > next_test_start_time[i]) &&!oleds[i]->displayAsyncActive()) {
      last_test_start_time[i] = millis(); 
      switch(oled_which_test[i]) {
        case 0: 
          testRects(oleds[i], true, test_iterations_left[i]);
          break;
        case 1:
          testRects(oleds[i], true, test_iterations_left[i]);
          break;
        case 2: 
          TestpixelsAsync(oleds[i], test_iterations_left[i]);
          break;

        case 3:
          TestFillRects(oleds[i], test_iterations_left[i]);  
          break;      
         case 4:
          testdrawline(oleds[i], test_iterations_left[i], next_test_start_time[i]);  
          break; 
      }
      if (test_iterations_left[i] == 0) {
        oled_which_test[i]++;
        if (oled_which_test[i] > 4)
          oled_which_test[i] = 0;
        test_iterations_left[i] = 0xffff; // mark it special for first call
        next_test_start_time[i] = millis() + 100;
      } 
    } else if ((millis()-last_test_start_time[i]) > 500) {
      Serial.printf("Oled %d hung test: %d iter: %d\n ", i, oled_which_test[i], test_iterations_left[i]); 
      last_test_start_time[i] = millis();
    }

  }
}

// Draw a set of nested square outlines centred on the display and push the
// page buffer to the panel.
//   _oled           - display to draw on
//   draw_async      - true: update with displayAsync(), false: with display()
//   iterations_left - set to 0 on return (this test is a single step)
void testRects(TeensyView *_oled, bool draw_async, uint16_t &iterations_left) {
  const int center_x = _oled->getLCDWidth() / 2;
  const int center_y = _oled->getLCDHeight() / 2;

  _oled->clear(PAGE);

  // Squares grow by 6 pixels per step, up to the smaller display dimension.
  const int limit = min( _oled->getLCDWidth(),  _oled->getLCDHeight());
  for (int side = 2; side < limit; side += 6) {
    const int half = side / 2;
    _oled->rect(center_x - half, center_y - half, side, side);
  }

  if (!draw_async) {
    _oled->display();
  } else {
    _oled->displayAsync();
  }
  iterations_left = 0;
}

// Tile the display with radius-10 circles on a 20 pixel grid and push the
// page buffer to the panel.
//   _oled           - display to draw on
//   draw_async      - true: update with displayAsync(), false: with display()
//   iterations_left - set to 0 on return (this test is a single step)
void testCircles(TeensyView *_oled, bool draw_async, uint16_t &iterations_left) {
  const uint16_t radius = 10;
  const int step = radius * 2;
  // Run one radius past each edge so partially visible circles are drawn too.
  const int x_limit = _oled->getLCDWidth()  + radius;
  const int y_limit = _oled->getLCDHeight() + radius;

  _oled->clear(PAGE);
  for (int cx = 0; cx < x_limit; cx += step) {
    for (int cy = 0; cy < y_limit; cy += step) {
      _oled->circle(cx, cy, radius);
    }
  }

  if (!draw_async) {
    _oled->display();
  } else {
    _oled->displayAsync();
  }
  iterations_left = 0;
}


// Plot one random pixel per call and start an asynchronous display update.
//   _oled           - display to draw on
//   iterations_left - 0xffff on the first call (page is cleared and the
//                     counter is set to 1024); decremented once per call, so
//                     it reaches 0 after 1024 pixels.
void TestpixelsAsync(TeensyView *_oled, uint16_t &iterations_left)
{
  const bool first_call = (iterations_left == 0xffff);
  if (first_call) {
    _oled->clear(PAGE);
    iterations_left = 1024;
  }

  _oled->pixel(random(_oled->getLCDWidth()), random(_oled->getLCDHeight()));
  _oled->displayAsync();

  --iterations_left;
}

// Draw one filled rectangle per call, each inset 3 pixels further than the
// previous one, and start an asynchronous display update.
//   _oled           - display to draw on
//   iterations_left - doubles as the current inset. 0xffff on the first call
//                     (page cleared, inset starts at 0); set back to 0 once
//                     the inset reaches half the display height.
void TestFillRects(TeensyView *_oled, uint16_t &iterations_left)
{
  if (iterations_left == 0xffff) {
    _oled->clear(PAGE);
    iterations_left = 0;  // start at the outer edge
  }

  const uint16_t inset = iterations_left;
  _oled->rectFill(inset, inset,
      _oled->getLCDWidth() - inset * 2, _oled->getLCDHeight() - inset * 2,
      (inset & 1) ? 0 : 1, NORM);   // colour alternates with the inset's low bit

  _oled->displayAsync();

  const uint16_t next_inset = inset + 3;
  if (next_inset >= _oled->getLCDHeight() / 2) {
    iterations_left = 0;            // finished
  } else {
    iterations_left = next_inset;
  }
}

// Line-fan test, executed one line per call so the caller is never blocked.
//
// The test consists of eight phases (0..7). Each phase fans lines out from one
// corner of the display, stepping the far end 4 pixels per call along one
// edge. Every call draws a single line and starts an asynchronous update.
//
//   _oled                - display to draw on
//   iterations_left      - state word: high byte = phase, low byte = position.
//                          0xffff on the first call (page is cleared and the
//                          state starts at phase 0, position 0). It becomes 0
//                          again when phase 7 completes, which tells the
//                          caller the test is done.
//   next_test_start_time - set to millis()+250 at the end of phases 1, 3 and 5
//                          so the finished fan stays visible before the next
//                          phase clears the page.
void testdrawline(TeensyView *_oled, uint16_t &iterations_left, uint32_t &next_test_start_time) {
  //Serial.printf("testDrawline %x %x\n", _oled, iterations_left);
  if (iterations_left == 0xffff) {
    _oled->clear(PAGE);
    iterations_left = 0;  // Not really, but makes it simple as we will update
  }

  uint8_t line_test = iterations_left >> 8;   // current phase
  uint8_t i = iterations_left & 0xff;         // position within the phase

  switch(line_test) {
    case 0:
      // From top-left, end point sweeps along the bottom edge.
      _oled->line(0, 0, i, _oled->getLCDHeight()-1);
      _oled->displayAsync();
      i+=4;
      if (i >= _oled->getLCDWidth()) {
        i = 0;
        line_test++;
      }
      break;

    case 1:
      // From top-left, end point sweeps down the right edge.
      _oled->line(0, 0, _oled->getLCDWidth()-1, i);
      _oled->displayAsync();
      i+=4;
      if (i >= _oled->getLCDHeight()) {
        i = 0;
        line_test++;
        next_test_start_time = millis()+250;
      }
      // This break was missing: the code fell through into case 2, drawing a
      // second line, calling displayAsync() twice in one step and advancing
      // i twice (and clearing the page before the 250 ms pause).
      break;

    case 2:
      // From bottom-left, end point sweeps along the top edge.
      if (i == 0) {
        _oled->clear(PAGE);
      }
      _oled->line(0, _oled->getLCDHeight()-1, i, 0);
      _oled->displayAsync();
      i+=4;
      if (i >= _oled->getLCDWidth()) {
        i = 0;
        line_test++;
      }
      break;

    case 3:
      // From bottom-left, end point sweeps down the right edge.
      _oled->line(0, _oled->getLCDHeight()-1, _oled->getLCDWidth()-1, i);
      _oled->displayAsync();
      i+=4;
      if (i >= _oled->getLCDHeight()) {
        i = 0;
        line_test++;
        next_test_start_time = millis()+250;
      }
      break;

    case 4:
      // From bottom-right, end point sweeps along the top edge.
      if (i == 0) {
        _oled->clear(PAGE);
      }
      _oled->line(_oled->getLCDWidth()-1, _oled->getLCDHeight()-1, i, 0);
      _oled->displayAsync();
      i+=4;
      if (i >= _oled->getLCDWidth()) {
        i = 0;
        line_test++;
      }
      break;

    case 5:
      // From bottom-right, end point sweeps down the left edge.
      _oled->line(_oled->getLCDWidth()-1, _oled->getLCDHeight()-1, 0, i);
      _oled->displayAsync();
      i+=4;
      if (i >= _oled->getLCDHeight()) {
        i = 0;
        line_test++;
        next_test_start_time = millis()+250;
      }
      break;

    case 6:
      // From top-right, end point sweeps down the left edge.
      if (i == 0) {
        _oled->clear(PAGE);
      }
      _oled->line(_oled->getLCDWidth()-1, 0, 0, i);
      _oled->displayAsync();
      i+=4;
      if (i >= _oled->getLCDHeight()) {
        i = 0;
        line_test++;
      }
      break;

    case 7:
      // From top-right, end point sweeps along the bottom edge.
      _oled->line(_oled->getLCDWidth()-1, 0, i, _oled->getLCDHeight()-1);
      _oled->displayAsync();
      i+=4;
      if (i >= _oled->getLCDWidth()) {
        i = 0;
        line_test = 0;   // Say we are done
      }
      break;
  }

  // Pack phase and position back into the caller's state word.
  iterations_left = (line_test << 8) | i;
}

KurtE · May 5, 2017

Yeah - The above program also works on the T3.5 on all three displays using DMA on all three SPI buses!

Again if anyone is interested in trying anything out, my current code for SPI including the One class, plus new transfers is up on my
fork of SPI (github/kurte/SPI) in the branch SPI-multi-one-class

I also pushed up my experimental version of the Sparkfun_Teensyview project in the branch Multiple-SPI

And currently my code is setup to handle 3 displays one of which is a 128x64 and others are Teensyview 128x32

defragster · May 5, 2017

Nice. So those three are all full (tiny) RAM buffered and the DMA is running from that RAM? Running in Parallel with full SPI speed (longest blocks with short gaps) with minimal overhead to tend the interrupts. No flashing or screen artifacts?

KurtE · May 6, 2017

Yes all three of them are SSD1306 SPI devices. The TeensyView ones have a 512 byte ram buffer (128x32/8) and the larger one has a 1K buffer (128*64/8).

The updates to the display are done with new method for Async update:
Which does beginTransaction and asserts CS and DC uses SPI to do the DMA write of 6 bytes for the header.

When its callback is called it unasserts DC then does an Async write of the buffer.

When the callback is called again it unasserts CS and does end transaction.

With the test it is working pretty well. Still playing with the test some to see other issues. For example I am adding the scrolling text like example that the Adafruit test does, but using the methods that were defined in the Teensyview header. Found out several were not implemented...

The test method is not the greatest I have ever done, but I did not want it to work like the original which would do things like:

Code:

    _oled->scrollVertRight(0x00, 0x0F);
    delay(2000); 
    _oled->scrollStop();
    delay(1000);
    ...

So instead I hacked it up to call it multiple times, with sort of a state variable that if it returns 0 implies it is done, else it also can update another variable with when to call me next...

But this still causes delays... That is because if you look at the different scroll functions like:

Code:

// Program and start a vertical + right-horizontal scroll on the display.
//
// Sends a fixed sequence of ten command bytes inside a single SPI transaction:
// first the vertical scroll area (0 .. _height), then the scroll setup with
// the caller's `start` and `stop` values, and finally ACTIVATESCROLL.
// Each byte goes out through command(), so the call does not return until the
// whole sequence has been issued.
//
// start, stop - passed through unchanged as parameters of the
//               VERTICALRIGHTHORIZONTALSCROLL command; presumably the first
//               and last page to scroll - TODO confirm against the SSD1306
//               datasheet.
void TeensyView::scrollVertRight(uint8_t start, uint8_t stop) {
  beginSPITransaction();
  // NOTE(review): the second argument of command() is false for every byte
  // except the last; presumably it marks the final byte of the sequence -
  // confirm against command()'s definition.
  command(SETVERTICALSCROLLAREA, false);
  command(0X00, false);
  command(_height, false);
  command(VERTICALRIGHTHORIZONTALSCROLL, false);
  command(0X00, false);
  command(start, false);
  command(0X00, false);
  command(stop, false);
  command(0X01, false);
  command(ACTIVATESCROLL, true);
  endSPITransaction();
}

You are waiting for it to output a 10 byte command. Also when this new added test runs the first time it is working fine, but when it gets called a second time there are artifacts and it is not scrolling... So need to see why (hook up LA again) and my guess is DC signal is not correct, but maybe something else...

Also as I am mainly using this for test case for testing out SPI, how far to take it... Example Should either add new methods/modify current ones to allow output command streams like above ASYNC? That is I could build a command string with all of the above bytes and issue the request in the same way that I do the header for the update screen....

But again how far do I take this?

Part of the answer is probably it depends. Will the SPI asynchronous support be accepted into the real SPI library? If not, this is obviously a waste of time.

If it is a yes or probably? Then having good cases of using it makes sense...
But it also then makes sense to take some time to flesh out a few more issues. Like currently the Async transfer only supports a maximum of 32767 bytes.

Why? 15 bit CITER/BITER values... Is that sufficient? It is currently not enough to do my ili9341_t3n DMA output of the frame buffer in a single request, as it outputs 320*240*2 bytes: 153600

Can solve this several ways:
a) Live with the limit in api, and have the callback function issue new request for the next chunk of the screen buffer
b) like a) but have the internal code do logically the same thing in its interrupt.... (Oops I used uint32_t for count, probably should be size_t)...
As to support > 64K?

c) like b) except maybe have SPI allocate DMASetting objects and chain them to each other... Might be good, this is what I do in my ili9341_t3n library, but maybe adds too much complexity to the library...

Again how far to take it...

EDIT: fixed the scroll stuff - It was a stupid mistake: in some places I was using the built-in inline functions that manipulate the DC state and remember it (so it does not have to be redone), and in other places I was not using these inline functions, so the DC state got out of sync...

KurtE · May 7, 2017

I played around a little more today with the TeensyView stuff. I ripped out all of the PUSHR/POPR (T3.x stuff) likewise remove the attempt to double buffer on TLC and simply use the new SPI stuff in the library like transfer buffer and also Async transfer buffer...

Thought I would then do another round of testing... Decided to get a few more displays arriving tomorrow from Amazon(1 Teensyview and 2 128x64 versions)... Finding I am rewiring several times, when wanting to test 3.5 and 3.6 with 3 displays each, T3.2 with 1 and TLC with 2...

Right now playing with TLC and still noticing some DMA issues... Was also running into some issues with my earlier double buffering stuff when it ran on SPI1 or ran in other optimizations...

With the DMA, the display running on SPI appears to run OK but the one on SPI1 not so well. So hooked up logic Analyzer and try looking at signals:

One thing I am noticing on SPI1, is each time I start a DMA transfer, the clock pin does a brief pulse (something like 60-80ns)... It does not happen on the SPI0 buss.

Note in the above picture the first 4 lines are for SPI and last 4 are for SPI1. You will see that there are two dma transfers there and you can see the little blip on the clock line on both transfers

My guess is it has to do with starting up the DMA transfer. The code currently does:

Code:

<  setup dmaRX and dmaTX for the transfer>

	port.C1 &= ~(SPI_C1_SPE);
	port.C2 |= SPI_C2_TXDMAE | SPI_C2_RXDMAE;
	port.C1 |= SPI_C1_SPE;
      
    _dmaRX->enable();
    _dmaTX->enable();

I think next up I will set up an example program using the DmaSpi library to see if it shows the same. Earlier, looking through their code, it sort-of logically did some similar stuff, although the part that re-enables SPI_C1_SPE was actually done by it calling out to SPI.beginTransaction...

More digging...

KurtE · May 8, 2017

I am still playing with SPI1 on T-LC and having the issues I mentioned in previous posting. I have tried not disabling the SPE bit, which got rid of that little pulse on clock pin, but it then sometimes sends an extra byte... Like the first byte twice.

Wondering how DMASPI library was working with it. So made a new copy of the test program, switched SPI to SPI1... And tried running it..

I also saw the extra pulse here as well. I also jumpered pin 0 to pin 1 and it fails with buffers don't match errors.

Code:

Hi!
Buffers are prepared
Time for non-DMA transfer: 276us
src and dest match

Press a key to continue

DmaSpi::begin() : DmaSpi::start() : state_ = eStopped
DmaSpi::beginNextTransfer: no pending transfer
Transfer @ 0x20001774
Testing src -> dest, single transfer
--------------------------------------------------
Transfer @ 0x200017ac
DmaSpi::registerTransfer(0x20001774)
  DmaSpi::addTransferToQueue() : queueing transfer
  starting transfer
DmaSpi::beginNextTransfer: starting transfer @ 0x20001774
  this was the last in the queue
  real sink
  real source
post_cs S C1 C2: 20 50 24
RX: 40077006 1ffffe74 64 a01a0080 
TX: 1fffff3c 40077006 64 20520080 
DmaSpi::rxIsr_()
  finishCurrentTransfer() @ 0x20001774
  state = eRunning
DmaSpi::beginNextTransfer: no pending transfer
Finished DMA transfer
src and dest don't match
 src: 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0a 0x0b 0x0c 0x0d 0x0e 0x0f 0x10 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 0x19 0x1a 0x1b 0x1c 0x1d 0x1e 0x1f 0x20 0x21 0x22 0x23 0x24 0x25 0x26 0x27 0x28 0x29 0x2a 0x2b 0x2c 0x2d 0x2e 0x2f 0x30 0x31 0x32 0x33 0x34 0x35 0x36 0x37 0x38 0x39 0x3a 0x3b 0x3c 0x3d 0x3e 0x3f 0x40 0x41 0x42 0x43 0x44 0x45 0x46 0x47 0x48 0x49 0x4a 0x4b 0x4c 0x4d 0x4e 0x4f 0x50 0x51 0x52 0x53 0x54 0x55 0x56 0x57 0x58 0x59 0x5a 0x5b 0x5c 0x5d 0x5e 0x5f 0x60 0x61 0x62 0x63 
dest: 0x00 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0a 0x0b 0x0c 0x0d 0x0e 0x0f 0x10 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 0x19 0x1a 0x1b 0x1c 0x1d 0x1e 0x1f 0x20 0x21 0x22 0x23 0x24 0x25 0x26 0x27 0x28 0x29 0x2a 0x2b 0x2c 0x2d 0x2e 0x2f 0x30 0x31 0x32 0x33 0x34 0x35 0x36 0x37 0x38 0x39 0x3a 0x3b 0x3c 0x3d 0x3e 0x3f 0x40 0x41 0x42 0x43 0x44 0x45 0x46 0x47 0x48 0x49 0x4a 0x4b 0x4c 0x4d 0x4e 0x4f 0x50 0x51 0x52 0x53 0x54 0x55 0x56 0x57 0x58 0x59 0x5a 0x5b 0x5c 0x5d 0x5e 0x5f 0x60 0x61 0x62 
==================================================


Testing src -> discard, single transfer
--------------------------------------------------
Transfer @ 0x200017ac
DmaSpi::registerTransfer(0x20001774)
  DmaSpi::addTransferToQueue() : queueing transfer
  starting transfer
DmaSpi::beginNextTransfer: starting transfer @ 0x20001774
  this was the last in the queue
  dummy sink
  real source
post_cs S C1 C2: 20 50 24
RX: 40077006 1ffffe50 64 a0120080 
TX: 1fffff3c 40077006 64 20520080 
DmaSpi::rxIsr_()
  finishCurrentTransfer() @ 0x20001774
  state = eRunning
DmaSpi::beginNextTransfer: no pending transfer
Finished DMA transfer
last discarded value is 0x61
That appears to be wrong, it should be src[DMASIZE-1] which is 0x63
==================================================


Testing 0xFF dummy data -> dest, single transfer
--------------------------------------------------
Transfer @ 0x200017ac
DmaSpi::registerTransfer(0x20001774)
  DmaSpi::addTransferToQueue() : queueing transfer
  starting transfer
DmaSpi::beginNextTransfer: starting transfer @ 0x20001774
  this was the last in the queue
  real sink
  dummy source
post_cs S C1 C2: 20 50 24
RX: 40077006 1ffffe74 64 a01a0080 
TX: 20001784 40077006 64 20120080 
DmaSpi::rxIsr_()
  finishCurrentTransfer() @ 0x20001774
  state = eRunning
DmaSpi::beginNextTransfer: no pending transfer
Finished DMA transfer
src and dest don't match
 src: 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 
dest: 0x62 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 
==================================================


Testing multiple queued transfers
--------------------------------------------------
Transfer @ 0x200017ac
Transfer @ 0x20001790
DmaSpi::registerTransfer(0x20001774)
  DmaSpi::addTransferToQueue() : queueing transfer
  starting transfer
DmaSpi::beginNextTransfer: starting transfer @ 0x20001774
  this was the last in the queue
  real sink
  real source
post_cs S C1 C2: 20 50 24
RX: 40077006 1ffffe74 64 a01a0080 
TX: 1fffff3c 40077006 64 20520080 
DmaSpi::registerTransfer(0x20001790)
  DmaSpi::addTransferToQueue() : queueing transfer
DmaSpi::rxIsr_()
  finishCurrentTransfer() @ 0x20001774
  state = eRunning
DmaSpi::beginNextTransfer: starting transfer @ 0x20001790
  this was the last in the queue
  real sink
  real source
post_cs S C1 C2: 20 50 24
RX: 40077006 1ffffed8 64 a01a0080 
TX: 1fffff3c 40077006 64 20520080 
Finished DMA transfer
DmaSpi::rxIsr_()
  finishCurrentTransfer() @ 0x20001790
  state = eRunning
DmaSpi::beginNextTransfer: no pending transfer
Finished DMA transfer1
src and dest don't match
 src: 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0a 0x0b 0x0c 0x0d 0x0e 0x0f 0x10 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 0x19 0x1a 0x1b 0x1c 0x1d 0x1e 0x1f 0x20 0x21 0x22 0x23 0x24 0x25 0x26 0x27 0x28 0x29 0x2a 0x2b 0x2c 0x2d 0x2e 0x2f 0x30 0x31 0x32 0x33 0x34 0x35 0x36 0x37 0x38 0x39 0x3a 0x3b 0x3c 0x3d 0x3e 0x3f 0x40 0x41 0x42 0x43 0x44 0x45 0x46 0x47 0x48 0x49 0x4a 0x4b 0x4c 0x4d 0x4e 0x4f 0x50 0x51 0x52 0x53 0x54 0x55 0x56 0x57 0x58 0x59 0x5a 0x5b 0x5c 0x5d 0x5e 0x5f 0x60 0x61 0x62 0x63 
dest: 0xff 0x00 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0a 0x0b 0x0c 0x0d 0x0e 0x0f 0x10 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 0x19 0x1a 0x1b 0x1c 0x1d 0x1e 0x1f 0x20 0x21 0x22 0x23 0x24 0x25 0x26 0x27 0x28 0x29 0x2a 0x2b 0x2c 0x2d 0x2e 0x2f 0x30 0x31 0x32 0x33 0x34 0x35 0x36 0x37 0x38 0x39 0x3a 0x3b 0x3c 0x3d 0x3e 0x3f 0x40 0x41 0x42 0x43 0x44 0x45 0x46 0x47 0x48 0x49 0x4a 0x4b 0x4c 0x4d 0x4e 0x4f 0x50 0x51 0x52 0x53 0x54 0x55 0x56 0x57 0x58 0x59 0x5a 0x5b 0x5c 0x5d 0x5e 0x5f 0x60 0x61 
src and dest don't match
 src: 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0a 0x0b 0x0c 0x0d 0x0e 0x0f 0x10 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 0x19 0x1a 0x1b 0x1c 0x1d 0x1e 0x1f 0x20 0x21 0x22 0x23 0x24 0x25 0x26 0x27 0x28 0x29 0x2a 0x2b 0x2c 0x2d 0x2e 0x2f 0x30 0x31 0x32 0x33 0x34 0x35 0x36 0x37 0x38 0x39 0x3a 0x3b 0x3c 0x3d 0x3e 0x3f 0x40 0x41 0x42 0x43 0x44 0x45 0x46 0x47 0x48 0x49 0x4a 0x4b 0x4c 0x4d 0x4e 0x4f 0x50 0x51 0x52 0x53 0x54 0x55 0x56 0x57 0x58 0x59 0x5a 0x5b 0x5c 0x5d 0x5e 0x5f 0x60 0x61 0x62 0x63 
dest: 0x62 0x00 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0a 0x0b 0x0c 0x0d 0x0e 0x0f 0x10 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 0x19 0x1a 0x1b 0x1c 0x1d 0x1e 0x1f 0x20 0x21 0x22 0x23 0x24 0x25 0x26 0x27 0x28 0x29 0x2a 0x2b 0x2c 0x2d 0x2e 0x2f 0x30 0x31 0x32 0x33 0x34 0x35 0x36 0x37 0x38 0x39 0x3a 0x3b 0x3c 0x3d 0x3e 0x3f 0x40 0x41 0x42 0x43 0x44 0x45 0x46 0x47 0x48 0x49 0x4a 0x4b 0x4c 0x4d 0x4e 0x4f 0x50 0x51 0x52 0x53 0x54 0x55 0x56 0x57 0x58 0x59 0x5a 0x5b 0x5c 0x5d 0x5e 0x5f 0x60 0x61 
==================================================


Testing pause and restart
--------------------------------------------------
DmaSpi::registerTransfer(0x20001774)
  DmaSpi::addTransferToQueue() : queueing transfer
  starting transfer
DmaSpi::beginNextTransfer: starting transfer @ 0x20001774
  this was the last in the queue
  real sink
  real source
post_cs S C1 C2: 20 50 24
RX: 40077006 1ffffe74 64 a01a0080 
TX: 1fffff3c 40077006 64 20520080 
DmaSpi::registerTransfer(0x20001790)
  DmaSpi::addTransferToQueue() : queueing transfer
DmaSpi::rxIsr_()
  finishCurrentTransfer() @ 0x20001774
  state = eStopping
Time until stopped: 249 us
Finished DMA transfer
DMA SPI appears to have stopped (this is good)
restarting
DmaSpi::start() : state_ = eStopped
DmaSpi::beginNextTransfer: starting transfer @ 0x20001790
  this was the last in the queue
  real sink
  real source
post_cs S C1 C2: 20 50 24
RX: 40077006 1ffffed8 64 a01a0080 
TX: 1fffff3c 40077006 64 20520080 
DmaSpi::rxIsr_()
  finishCurrentTransfer() @ 0x20001790
  state = eRunning
DmaSpi::beginNextTransfer: no pending transfer
Finished DMA transfer1
src and dest don't match
 src: 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0a 0x0b 0x0c 0x0d 0x0e 0x0f 0x10 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 0x19 0x1a 0x1b 0x1c 0x1d 0x1e 0x1f 0x20 0x21 0x22 0x23 0x24 0x25 0x26 0x27 0x28 0x29 0x2a 0x2b 0x2c 0x2d 0x2e 0x2f 0x30 0x31 0x32 0x33 0x34 0x35 0x36 0x37 0x38 0x39 0x3a 0x3b 0x3c 0x3d 0x3e 0x3f 0x40 0x41 0x42 0x43 0x44 0x45 0x46 0x47 0x48 0x49 0x4a 0x4b 0x4c 0x4d 0x4e 0x4f 0x50 0x51 0x52 0x53 0x54 0x55 0x56 0x57 0x58 0x59 0x5a 0x5b 0x5c 0x5d 0x5e 0x5f 0x60 0x61 0x62 0x63 
dest: 0x62 0x00 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0a 0x0b 0x0c 0x0d 0x0e 0x0f 0x10 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 0x19 0x1a 0x1b 0x1c 0x1d 0x1e 0x1f 0x20 0x21 0x22 0x23 0x24 0x25 0x26 0x27 0x28 0x29 0x2a 0x2b 0x2c 0x2d 0x2e 0x2f 0x30 0x31 0x32 0x33 0x34 0x35 0x36 0x37 0x38 0x39 0x3a 0x3b 0x3c 0x3d 0x3e 0x3f 0x40 0x41 0x42 0x43 0x44 0x45 0x46 0x47 0x48 0x49 0x4a 0x4b 0x4c 0x4d 0x4e 0x4f 0x50 0x51 0x52 0x53 0x54 0x55 0x56 0x57 0x58 0x59 0x5a 0x5b 0x5c 0x5d 0x5e 0x5f 0x60 0x61 
src and dest don't match
 src: 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0a 0x0b 0x0c 0x0d 0x0e 0x0f 0x10 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 0x19 0x1a 0x1b 0x1c 0x1d 0x1e 0x1f 0x20 0x21 0x22 0x23 0x24 0x25 0x26 0x27 0x28 0x29 0x2a 0x2b 0x2c 0x2d 0x2e 0x2f 0x30 0x31 0x32 0x33 0x34 0x35 0x36 0x37 0x38 0x39 0x3a 0x3b 0x3c 0x3d 0x3e 0x3f 0x40 0x41 0x42 0x43 0x44 0x45 0x46 0x47 0x48 0x49 0x4a 0x4b 0x4c 0x4d 0x4e 0x4f 0x50 0x51 0x52 0x53 0x54 0x55 0x56 0x57 0x58 0x59 0x5a 0x5b 0x5c 0x5d 0x5e 0x5f 0x60 0x61 0x62 0x63 
dest: 0x62 0x00 0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0a 0x0b 0x0c 0x0d 0x0e 0x0f 0x10 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 0x19 0x1a 0x1b 0x1c 0x1d 0x1e 0x1f 0x20 0x21 0x22 0x23 0x24 0x25 0x26 0x27 0x28 0x29 0x2a 0x2b 0x2c 0x2d 0x2e 0x2f 0x30 0x31 0x32 0x33 0x34 0x35 0x36 0x37 0x38 0x39 0x3a 0x3b 0x3c 0x3d 0x3e 0x3f 0x40 0x41 0x42 0x43 0x44 0x45 0x46 0x47 0x48 0x49 0x4a 0x4b 0x4c 0x4d 0x4e 0x4f 0x50 0x51 0x52 0x53 0x54 0x55 0x56 0x57 0x58 0x59 0x5a 0x5b 0x5c 0x5d 0x5e 0x5f 0x60 0x61 
==================================================


Testing src -> dest, with chip select object
--------------------------------------------------
Transfer @ 0x200017ac
DmaSpi::registerTransfer(0x20001774)
  DmaSpi::addTransferToQueue() : queueing transfer
  starting transfer
DmaSpi::beginNextTransfer: starting transfer @ 0x20001774
  this was the last in the queue
  real sink
  real source

KurtE · May 8, 2017

- I think I have made some progress on T-LC. My test program is now updating two SSD1306 displays (one Teensyview one 64 line version) and has been running for maybe 10 minutes now without hanging or obvious screwing up.

I updated my code to do similar stuff to what was mentioned in: http://www.thepositiverail.com/blog...ce-driver-for-the-kinetis-kl26-spi-peripheral

In particular, now like on the T3.x side I do a manual push (put data on DL) and then turn on the DMA...
However I was having issues of it hanging, because there was a window of time between when I enabled the TX versus enable the RX that maybe some data came in (probably handling interrupt). So, I disabled interrupts around some of this and now so far it appears happy:

currently the transfer function now looks like:

Code:

// Start an asynchronous, DMA driven SPI transfer of `count` bytes.
//
//   buf      - bytes to send, or NULL to send `count` copies of
//              _transferFillTXChar
//   retbuf   - where received bytes are stored, or NULL to discard them
//   count    - number of bytes to transfer; must be at least 1
//   callback - stored in _dma_callback for the completion handling
//
// Returns true when the transfer was started. Returns false when count is 0,
// when a DMA channel could not be allocated, or when a transfer is already
// active.
//
// On the first call the TX and RX DMA channels are allocated and configured.
// The first byte is then written to the data register by hand and the TX DMA
// channel sends the remaining count-1 bytes, while the RX channel receives
// all `count` bytes.
bool SPIClass::transfer(const void *buf, void *retbuf, size_t count, void(*callback)(void))
{
	// Nothing to send; also avoids count-1 wrapping around below.
	if (count == 0)
		return false;

	if (_dma_state == DMAState::notAllocated) {
		_dmaTX = new DMAChannel();
		if (_dmaTX == nullptr) {
			return false;
		}

		_dmaTX->disable();
		_dmaTX->destination((volatile uint8_t&)port.DL);
		_dmaTX->disableOnCompletion();
		_dmaTX->triggerAtHardwareEvent(hardware.tx_dma_channel);


		_dmaRX = new DMAChannel();
		if (_dmaRX == nullptr) {
			// Release the TX channel and clear its pointer. The original code
			// reset _dmaRX here, leaving _dmaTX pointing at freed memory.
			delete _dmaTX;
			_dmaTX = nullptr;
			return false;
		}
		_dmaRX->disable();
		_dmaRX->source((volatile uint8_t&)port.DL);
		_dmaRX->disableOnCompletion();
		_dmaRX->triggerAtHardwareEvent(hardware.rx_dma_channel);
		// Completion is signalled by the RX channel.
		_dmaRX->attachInterrupt(hardware.dma_isr);
		_dmaRX->interruptAtCompletion();

		_dma_state = DMAState::idle;  // Should be first thing set!
	}

	if (_dma_state == DMAState::active)
		return false; // already active

	// Now handle NULL pointers.
	// NOTE(review): with count == 1 the TX channel is given a length of 0;
	// confirm the DMA hardware tolerates that.
	uint8_t first_char;
	if (buf) {
		uint8_t *data_out = (uint8_t*)buf;
		first_char = *data_out++;
		_dmaTX->sourceBuffer(data_out, count-1);
	} else {
		first_char = _transferFillTXChar;
		_dmaTX->source(_transferFillTXChar);   // maybe have setable value
		_dmaTX->transferCount(count-1);
	}

	if (retbuf) {
		_dmaRX->destinationBuffer((uint8_t*)retbuf, count);
	} else {
		_dmaRX->destination(_dma_dummy_rx);    // NULL ?
		_dmaRX->transferCount(count);
	}
	_dma_callback = callback;

	// Wait until the transmit buffer is empty, then push the first byte by hand.
	while (!(port.S & SPI_S_SPTEF));

	// Interrupts stay off from the manual push until both channels are
	// enabled, so nothing can slip in between the TX and RX enables.
	__disable_irq();
	port.DL = first_char;

	port.C2 |= SPI_C2_TXDMAE | SPI_C2_RXDMAE;

	// Make sure the SPI module is enabled.
	port.C1 |= SPI_C1_SPE;

	// Mark the transfer active before interrupts come back on. The original
	// set this after __enable_irq(), so a transfer that completed immediately
	// could have its state overwritten with "active" afterwards.
	// NOTE(review): assumes the completion ISR (hardware.dma_isr) is what
	// moves _dma_state out of active - confirm.
	_dma_state = DMAState::active;

	_dmaTX->enable();
	_dmaRX->enable();
	__enable_irq();
	return true;
}

KurtE · May 10, 2017

Another quick update:

On T3.5 was able to remove disable/enable interrupt by doing the RX enable before the TX enable.

Yesterday I let some of the multiple display (SPI Buss) Async updates of Teensyview run for awhile with:
a T3.5 with 3 displays (TV, 128x64, TV)
a T3.6 with configured for 3 but only 2 on it (not enough displays) (TV, 128x64)
T-LC with two displays (TV, 128x64)

All of them ran for a few hours and did not show any cases where they hung on any of the SPI buses.

Yesterday and Today - Did a quick implementation of the Async call: transfer(buf, retbuf, cnt, callback)
for the AVR. I thought I had a second T2.0 around here but did not find it, so I tore apart a Lidar-lite unit that has one in it and now have the demo running on a TV connected to it... Obviously the speed is not as good, as the Async version works by using an ISR for each byte of the transfer.

I pushed update to my SPI library fork/branch

Plus Updated my Sparkfun_TeensyView library - Where I removed all of the code that was trying to manipulate SPI registers directly but instead relies only on this new SPI proposed changes.

If anyone is interested could upload current Test program that is outputting to SSD1306 displays. Next up will probably do another SPI test app, to test the Async API in Read mode and Transfer mode...

Kurt

KurtE · May 13, 2017

Sometimes I find the best way to debug an API and/or figure out if the API is sufficient is to try it out and see what you can do. The current new stuff works great for the SSD-1306

For the fun of it, I have been playing around with the ILI9341 code. I started off with my ili9341_t3 version and now implement it without using the hardware specific knowledge like PUSHR/POPR, like queue lengths, also directly using DMA...

I can get some of the stuff to work pretty well, but running into some interesting issues and questions for myself/others. Example with the Frame buffer code when I wish to do the full screen update,
The code would do :

Code:

		beginSPITransaction();
		if (_standard) {
			// Doing full window. 
			setAddr(0, 0, _width-1, _height-1);
			writecommand_cont(ILI9341_RAMWR);

			// BUGBUG doing as one shot.  Not sure if should or not or do like
			// main code and break up into transactions...
			uint16_t *pfbtft_end = &_pfbtft[(ILI9341_TFTWIDTH*ILI9341_TFTHEIGHT)-1];	// setup 
			uint16_t *pftbft = _pfbtft;

			// Quick write out the data;
			while (pftbft < pfbtft_end) {
				writedata16_cont(*pftbft++);
			}
			writedata16_last(*pftbft);
		}
		endSPITransaction();

Where the writedata16_cont code would do a PUSHR and wait until queue was not full.. So the SPI runs at full speed.

How do I do this without using the PUSHR stuff? and Still get good performance... You can make it work by simply doing something like:

Code:

// Portable fallback: forward each 16-bit word to one _spi->transfer16() call.
// NOTE(review): spelled write_data16_cont here, while the loop shown earlier
// calls writedata16_cont - confirm which name is intended.
#define write_data16_cont(x)  _spi->transfer16(x)

But you are back to the Adafruit_ILI9341 code speed where gaps of time between each word.

The obvious thing you might want to use is the new transfer buffer method: _spi.transfer(pftbft, NULL, buffersize)
However, it will output the values in <MSB><LSB> order where we need them in <LSB><MSB> order, so that won't work here directly... It works great on SSD1306.

You could (I did) try caching many of these writes into a buffer and issue a transfer of that buffer. I did this with 80 entries and it did speed things up, however the first thing you figure out is that you need to pack the buffer up, which then does a transfer and you wait until the transfer is done and start building the next buffer. This speeds things up, with fewer gaps, but the gaps are bigger...

You could try using the new Async Method (I just did), where I have two buffers which I alternate using and while the first one is being output (by DMA through Async), I am building the 2nd one...
This is working better, but it is a little more of a pain in the butt...

Wondering if it would make sense to add another new transfer method?
SPI::Transfer16(const uint16_t *buf, uint16_t *retbuf, size_t count);

And if so maybe add the Async version of it ... same with callback function?

Also probably not as generic, but also wonder if it would make sense to have some method to write N copies of some value
That is I want to write the full buffer of zeros to the screen...

Now off to playing around.

Kurt
Update
Ok I added the Transfer16(buf, retbuf, cnt) to the library.
I also updated my function, which allows me to set what value to send when you pass in NULL for buf, to take a word. The low byte is used if you are doing a transfer, and the whole word is used if you are doing a transfer16....

Then I figured I did not need any new methods to, for example, clear the screen to RED, or in the generic case fill a rectangle.
The fill rectangle now looks something like:

Code:

		// Fill a w x h rectangle at (x, y) with one colour, without building a
		// pixel buffer.
		beginSPITransaction();
		setAddr(x, y, x+w-1, y+h-1);
		writecommand_cont(ILI9341_RAMWR);
		setDataMode();
		_spi->setTransferWriteFill(color);  // Set the transfer word.
		// NULL buf => send the fill word set above; NULL retbuf => ignore what
		// comes back.  w*h is the number of 16-bit words (one per pixel).
		_spi->transfer16(NULL, NULL, w*h);	// Send out the number of words called for.		
		endSPITransaction();

Passing null for buf and retbuf, implies send the default fill and don't care about return values... Works.

Likewise it made the update screen from screen buffer very simple as well.

Code:

		// Full-screen update from the frame buffer using the buffer form of
		// transfer16().
		beginSPITransaction();
		setAddr(0, 0, _width-1, _height-1);
		writecommand_cont(ILI9341_RAMWR);
		setDataMode();
		// One 16-bit word per pixel; NULL retbuf => returned data is ignored.
		_spi->transfer16(_pfbtft, NULL, _width*_height);	// blast it out. 		
		endSPITransaction();

I have not done a 16 bit async transfer yet, that is next...

Also still need to update T-LC for this and maybe AVR...

tni · May 13, 2017

If you want to stick with 8-bit DMA for transfer16 (given the issues with the command word), you could swap byte-pairs in the minor loop. The minor loop adjustment can be used for either source or destination (DMA_TCD_NBYTES_SMLOE / DMA_TCD_NBYTES_DMLOE), so this would work for RX and TX.

Code:

#include <DMAChannel.h>
#include <array>

// Single DMA channel used for the memory-to-memory byte-swap demo.
DMAChannel dma;

// Number of 16-bit words copied by the demo.
const size_t buffer_size = 100;

// Source words and the destination that receives them with bytes swapped.
std::array<uint16_t, buffer_size> src_buffer;
std::array<uint16_t, buffer_size> dest_buffer;

// Alias so the output port can be changed in one place.
auto& serial = Serial;

// Demo: copy src_buffer to dest_buffer with DMA, swapping the two bytes of
// every 16-bit word by reading the source backwards inside each minor loop.
void setup() {
    serial.begin(115200);
    delay(2000);
    // Test pattern with different values in the two bytes of most words.
    for(size_t i = 0; i < buffer_size; i++) {
        src_buffer[i] = 256 * i + (buffer_size - 1 - i);
    }
    
    // Start at byte offset 1 of the first word and step backwards (SOFF = -1),
    // so each 2-byte minor loop reads offset 1 and then offset 0.
    dma.TCD->SADDR = (void*) (uintptr_t(src_buffer.data()) + 1);
    dma.TCD->ATTR_SRC = 0; // 8-bit read from source
    dma.TCD->SOFF = -1;
    // Source minor-loop offset enabled: after the 2 bytes the source address
    // sits 2 below where it started; adding 4 lands on byte offset 1 of the
    // next word.
    dma.TCD->NBYTES_MLOFFYES = DMA_TCD_NBYTES_SMLOE |  
                               DMA_TCD_NBYTES_MLOFFYES_NBYTES(2) |
                               DMA_TCD_NBYTES_MLOFFYES_MLOFF(4);
    dma.TCD->SLAST = 0;
    // Destination is written one byte at a time at ascending addresses.
    dma.TCD->DADDR = dest_buffer.data();
    dma.TCD->DOFF = 1;
    dma.TCD->ATTR_DST = 0; // 8-bit write to dest
    // One minor loop (= one 16-bit word) per major-loop iteration.
    dma.TCD->BITER = buffer_size;
    dma.TCD->CITER = buffer_size;
    dma.TCD->DLASTSGA = 0;
    dma.TCD->CSR = 0;
    dma.triggerContinuously();
    dma.disableOnCompletion();
    dma.enable();
    // Busy-wait until the channel reports completion.
    while(!dma.complete()) ;

    // Print each source word next to what arrived in the destination.
    for(size_t i = 0; i < buffer_size; i++) {
        serial.printf("%04x -> %04x\n", src_buffer[i], dest_buffer[i]);
    }
}

// Intentionally empty: this sketch does all of its work once in setup().
void loop() {
}

KurtE · May 14, 2017

Thanks TNI, will give it a try.

For some hit thumb with hammer fun, I may again verify that 16 bit transfer does not work on T3.2/5...

Currently hacking some of the code now, and seeing where I need to update the two DMAChannels for the different sizes of transfers. Before I was for example setting up the _dmaTX->destination to the PUSHR register being cast to uint8_t&... Which setup the ATTR_DST size.. But for this I think for the couple of different variations, I will need to simply update ATTR_DST...

At times with the DMAChannel stuff I try to decide on best approaches on should I just get it to work, or should I make changes to it and do Pull Requests.
Example: Sometimes I want the dmaChannel for TX to interrupt and other times I don't. I can turn on the interrupt using interruptAtCompletion(), but I don't see any way to turn it off, except again by knowing the internals. That is, I know that turning it on is: TCD->CSR |= DMA_TCD_CSR_INTMAJOR; so I know I need to turn that bit off, which I do... Likewise for disableOnCompletion (although I have not needed to turn that off)... So the question to myself always is: Make it work... vs Make it easier for someone else to make it work on the next project...

Also may look to fixing/enhancing the Async code to handle > 32767 bytes (or in 16 bit case words). I could leave it for callers to handle in their callback... But maybe should simply have my interrupt at completion function handle it... Could do it at the point just before I would have called their Async method (if not NULL), or maybe on the TX side, maybe should enable ISR on it's completion, so it could advance and keep SPI queue full...

tni · May 14, 2017

KurtE said:
For some hit thumb with hammer fun, I may again verify that 16 bit transfer does not work on T3.2/5...

16-bit writes to SPI0_PUSHR don't seem to work, even if CTAR0 is set up for 16-bit transfers...

tni · May 14, 2017

Is async transfer working for you with SPI1 on Teensy 3.5?

For me it doesn't. In fact, both 8-bit and 16-bit writes to SPI1 PUSHR corrupt the command word in random ways, depending on the data bytes.

KurtE · May 14, 2017

I have(had) it working, but there is only one DMA channel for it, so again it depends on type of transfer.

Currently I am only using DMA to do the write to PUSHR and I am using interrupts for POPR... But I may revise that...

However, I am priming the pump by doing a manual PUSHR of the first item onto the queue, so as to hopefully set the appropriate high word...

Have not tried 16 bit yet, debugging it still on T3.6...

KurtE · May 14, 2017

I have the 16 bit DMA transfer working for T3.6... But as suspected it does not work on T3.2...
Also again appears to not work for T3.5 SPI0...

So now to try to understand better the minor loop stuff...

tni · May 14, 2017

tni said:
Is async transfer working for you with SPI1 on Teensy 3.5? For me it doesn't.

The SPI clock was too high - the RX interrupt didn't keep up. With SPI clock of 10MHz, it's working.

In fact, both 8-bit and 16-bit writes to SPI1 PUSHR corrupt the command word in random ways, depending on the data bytes.

The DMA access to PUSHR must do something different compared to a CPU access. If I replace the CPU write with a manually triggered DMA transfer, the command word / upper part of PUSHR gets zeroed out and it works.

With SPI1_CTAR0 set to 16-bit ("SPI1_CTAR0 = SPI1_CTAR1;"), 16-bit transfers also work.

KurtE · May 14, 2017

I was afraid of that, which is why I mentioned earlier that I may need to switch to have to on interrupt as it should hopefully at worst slow down...

KurtE · May 15, 2017

Thanks for the hint.

tni said:
With SPI1_CTAR0 set to 16-bit ("SPI1_CTAR0 = SPI1_CTAR1;"), 16-bit transfers also work.

I updated my Transfer16 with async to hack the CTAR0 to output 16 bit mode and when it completes return it back to 8 bit mode and it appears now to be working on SPI on T3.2/5/6...

Will test again ON t3.6 for SPI1 and 2...

Plus from your message I think I need to take another pass at 3.5, to reverse the usage of interrupts in the RX and Transfer modes.

tni · May 15, 2017

KurtE said:
Plus from your message I think I need to take another pass at 3.5, to reverse the usage of interrupts in the RX and Transfer modes.

I tried DMA channel linking for SPI1, 16-bit transfer. Performance is quite good, 25.7Mbit/s.

The linking limits the major loop to 512, so additional handling for larger sizes is required. Scatter-gather DMA didn't properly work when I tried it (command word corruption).

Code:

#include <SPI.h>
#include <array>
#include <DMAChannel.h>

// SPI port under test and the DMAMUX request source that goes with it;
// change both together to test another port.
auto& spi = SPI1;
auto DMAMUX_SOURCE = DMAMUX_SOURCE_SPI1_RX;
// Shorthand for the register block of the selected SPI port.
KINETISK_SPI_t& spi_port() { return spi.SPIRegisters(); }

auto& serial = Serial;
// NOTE(review): a clock of -1 presumably requests the fastest available SPI clock - confirm.
SPISettings spi_settings(-1, LSBFIRST, SPI_MODE0);

// Number of 16-bit words transferred by the test.
const size_t buffer_size = 512;
static_assert(buffer_size <= 512, "DMA channel linking limits major loop to 512.");

std::array<uint16_t, buffer_size> src_buffer;
std::array<uint16_t, buffer_size> dest_buffer;

DMAChannel dma_rx;
DMAChannel dma_tx;

// Set by rxIsr(); polled by setup() to detect the end of the transfer.
volatile bool spi_dma_done = false;

// Interrupt handler attached to dma_rx: acknowledges the DMA interrupt and
// flags the transfer as finished.
void rxIsr() {
    dma_rx.clearInterrupt();
    spi_dma_done = true;
}

// Starts an asynchronous 16-bit SPI transfer of `count` words using two linked
// DMA channels: dma_rx drains POPR into `dest`, and dma_tx is triggered by
// dma_rx's transfers to feed PUSHR from `src`. The first two words are pushed
// by the CPU. Completion is signalled through rxIsr() setting spi_dma_done.
//
// src   - words to send (src[0] and src[1] are pushed manually below)
// dest  - receives `count` words
// count - number of 16-bit words
void transferDma16(uint16_t* src, uint16_t* dest, size_t count) {
    spi_dma_done = false;

    // Copy CTAR1 into CTAR0 so transfers that use CTAR0 get CTAR1's frame setup.
    spi_port().CTAR0 = spi_port().CTAR1;
    
    // RX channel: 16-bit reads from POPR into dest, interrupt when done.
    dma_rx.disable();
    dma_rx.source((uint16_t&) spi_port().POPR);
    dma_rx.destinationBuffer(dest, 2 * count);
    dma_rx.disableOnCompletion();
    dma_rx.attachInterrupt(rxIsr);
    dma_rx.interruptAtCompletion();
    dma_rx.triggerAtHardwareEvent(DMAMUX_SOURCE);

    // TX channel: 16-bit writes to PUSHR, starting at the third word because
    // the first two are pushed by the CPU below.
    dma_tx.disable();
    dma_tx.destination((uint16_t&) spi_port().PUSHR);
    // NOTE(review): the RX side passes 2 * count for `count` words, so the
    // remaining count - 2 words would be expected to need 2 * count - 4 here,
    // not 4 * count - 4 - confirm.
    dma_tx.sourceBuffer(src + 2, 4 * count - 4);
    dma_tx.disableOnCompletion();
    dma_tx.triggerAtTransfersOf(dma_rx);

    dma_tx.enable();

    spi_port().RSER = SPI_RSER_RFDF_RE | SPI_RSER_RFDF_DIRS; // DMA on receive FIFO drain flag
    // NOTE(review): presumably master mode with both FIFOs cleared - confirm against the MCR bit definitions.
    spi_port().MCR = SPI_MCR_MSTR | SPI_MCR_CLR_TXF | SPI_MCR_CLR_RXF | SPI_MCR_PCSIS(0x1F);
    // NOTE(review): presumably clears pending status flags - confirm.
    spi_port().SR = 0xFF0F0000;

    // put two words in flight - the T3.5 SPI1 RX queue has room for 2 entries
    spi_port().PUSHR = src[0] | SPI_PUSHR_CTAS(0) | SPI_PUSHR_CONT;
    (void) spi_port().PUSHR; // delay, or we might lose a TX entry
    spi_port().PUSHR = src[1] | SPI_PUSHR_CTAS(0) | SPI_PUSHR_CONT;
    
    // Enable RX last, after the first words have been queued.
    dma_rx.enable();
}

void setup() {
    for(size_t i = 0; i < buffer_size; i++) {
        src_buffer[i] = i;
    }
    serial.begin(115200); delay(2000);
    serial.println("SPI transfer16 test start.");
    spi.begin();
    spi.beginTransaction(spi_settings);
    uint32_t start_time = micros();
    transferDma16(src_buffer.data(), dest_buffer.data(), buffer_size);
    while(!spi_dma_done) ;
    uint32_t end_time = micros();
    uint32_t elapsed = end_time - start_time;
    spi.endTransaction();
    serial.printf("rx buf: %x   dma ptr: %x\n", uint32_t(dest_buffer.data()), uint32_t(dma_rx.destinationAddress()));
    serial.printf("tx buf: %x   dma ptr: %x\n", uint32_t(src_buffer.data()), uint32_t(dma_tx.sourceAddress()));
    serial.printf("Elapsed: %u MBbit: %f\n", elapsed, float(sizeof(src_buffer)) * 8 / elapsed);
    for(size_t i = 0; i < buffer_size; i++) {
        serial.printf("%04x -> %04x\n", src_buffer[i], dest_buffer[i]);
    }
}

// Intentionally empty: the benchmark runs once from setup().
void loop() {
}

KurtE · May 15, 2017

Thanks!

Looks like you have everything there to integrate it in... Probably also need to do same for 8 bit version, but probably simpler.

I was thinking I need to update the code to handle having the ISR handle cases where the transfer size requested is larger than the internals can handle...

I also should mention it looks a lot cleaner than having to use an SPI ISR to force feed one of the SPI areas...

And I keep having to look up things as my C++ knowledge is from ancient days

... Like the use of the word auto.

tni · May 15, 2017

Bad news. The RX queue being able to hold 2 entries appears to only work in some cases. Having 2 TX words in flight doesn't work at slower SPI speeds. 1 word in flight (still 18.3 MBit/s):

Code:

#include <SPI.h>
#include <array>
#include <DMAChannel.h>

// SPI port under test and the DMAMUX request source that goes with it;
// change both together to test another port.
auto& spi = SPI1;
auto DMAMUX_SOURCE = DMAMUX_SOURCE_SPI1_RX;
// Shorthand for the register block of the selected SPI port.
KINETISK_SPI_t& spi_port() { return spi.SPIRegisters(); }

auto& serial = Serial;
// NOTE(review): a clock of -1 presumably requests the fastest available SPI clock - confirm.
SPISettings spi_settings(-1, LSBFIRST, SPI_MODE0);

// Number of 16-bit words transferred by the test.
const size_t buffer_size = 512;
static_assert(buffer_size <= 512, "DMA channel linking limits major loop to 512.");

std::array<uint16_t, buffer_size> src_buffer;
std::array<uint16_t, buffer_size> dest_buffer;

DMAChannel dma_rx;
DMAChannel dma_tx;

// Set by rxIsr(); polled by setup() to detect the end of the transfer.
volatile bool spi_dma_done = false;

// Interrupt handler attached to dma_rx: acknowledges the DMA interrupt and
// flags the transfer as finished.
void rxIsr() {
    dma_rx.clearInterrupt();
    spi_dma_done = true;
}

// Starts an asynchronous 16-bit SPI transfer of `count` words using two linked
// DMA channels: dma_rx drains POPR into `dest`, and dma_tx is triggered by
// dma_rx's transfers to feed PUSHR from `src`. Only the first word is pushed
// by the CPU. Completion is signalled through rxIsr() setting spi_dma_done.
//
// src   - words to send (src[0] is pushed manually below)
// dest  - receives `count` words
// count - number of 16-bit words
void transferDma16(uint16_t* src, uint16_t* dest, size_t count) {
    spi_dma_done = false;

    // Copy CTAR1 into CTAR0 so transfers that use CTAR0 get CTAR1's frame setup.
    spi_port().CTAR0 = spi_port().CTAR1;
    
    // RX channel: 16-bit reads from POPR into dest, interrupt when done.
    dma_rx.disable();
    dma_rx.source((uint16_t&) spi_port().POPR);
    dma_rx.destinationBuffer(dest, 2 * count);
    dma_rx.disableOnCompletion();
    dma_rx.attachInterrupt(rxIsr);
    dma_rx.interruptAtCompletion();
    dma_rx.triggerAtHardwareEvent(DMAMUX_SOURCE);

    // TX channel: 16-bit writes to PUSHR for the remaining count - 1 words,
    // starting at src[1] because src[0] is pushed by the CPU below.
    dma_tx.disable();
    dma_tx.destination((uint16_t&) spi_port().PUSHR);
    dma_tx.sourceBuffer(src + 1, 2 * count - 2);
    dma_tx.disableOnCompletion();
    dma_tx.triggerAtTransfersOf(dma_rx);

    dma_tx.enable();

    spi_port().RSER = SPI_RSER_RFDF_RE | SPI_RSER_RFDF_DIRS; // DMA on receive FIFO drain flag
    // NOTE(review): presumably master mode with both FIFOs cleared - confirm against the MCR bit definitions.
    spi_port().MCR = SPI_MCR_MSTR | SPI_MCR_CLR_TXF | SPI_MCR_CLR_RXF | SPI_MCR_PCSIS(0x1F);
    // NOTE(review): presumably clears pending status flags - confirm.
    spi_port().SR = 0xFF0F0000;

    // Put a single word in flight by hand; the DMA channels handle the rest.
    spi_port().PUSHR = src[0] | SPI_PUSHR_CTAS(0) | SPI_PUSHR_CONT;

    // Enable RX last, after the first word has been queued.
    dma_rx.enable();
}

void setup() {
    for(size_t i = 0; i < buffer_size; i++) {
        src_buffer[i] = i;
    }
    serial.begin(115200); delay(2000);
    serial.println("SPI transfer16 test start.");
    spi.begin();
    spi.beginTransaction(spi_settings);
    uint32_t start_time = micros();
    transferDma16(src_buffer.data(), dest_buffer.data(), buffer_size);
    while(!spi_dma_done) ;
    uint32_t end_time = micros();
    uint32_t elapsed = end_time - start_time;
    spi.endTransaction();
    serial.printf("rx buf: %x   dma ptr: %x\n", uint32_t(dest_buffer.data()), uint32_t(dma_rx.destinationAddress()));
    serial.printf("tx buf: %x   dma ptr: %x\n", uint32_t(src_buffer.data()), uint32_t(dma_tx.sourceAddress()));
    serial.printf("Elapsed: %u MBbit: %f\n", elapsed, float(sizeof(src_buffer)) * 8 / elapsed);
    for(size_t i = 0; i < buffer_size; i++) {
        serial.printf("%04x -> %04x\n", src_buffer[i], dest_buffer[i]);
    }
}

// Intentionally empty: the benchmark runs once from setup().
void loop() {
}

KurtE · May 15, 2017

tni said:
Bad news. The RX queue being able to hold 2 entries appears to only work in some cases. Having 2 TX words in flight doesn't work at slower SPI speeds. 1 word in flight (still 18.3 MBit/s):

My assumption already was that the queue can only hold one item... However what typically happens is that when the queue was empty and you put an item into it, it will move that first item into the output shift register and then set the flag saying that the queue is not full... I was planning to check for the not full condition before pushing the 2nd item.

Kurt

tni · May 15, 2017

KurtE said:
My assumption already was that the queue can only hold one item...

The non-DMA code has 2 transmit words in flight. If there is a delay, 2 RX words must be held. I tested it, it works reliably no matter how long the delay. It may very well be that one item is in the RX FIFO and one is held back in the input shift register. The DMA transfer may trigger an early clearing of the shift register.

On Teensy LC, holding 2 RX words doesn't work at all, that's why interrupts must be disabled.

KurtE · May 15, 2017

I just updated my new Async Transfer and Transfer16 for T3.5 to use the linked channels. So far it is only pushing one item out at the start... May try two later.
It was nice to be able to remove the SPI interrupt handler as well as TX interrupt handler for the T3.5 which I had in there.

I tried new version on T3.6 on SPI and SPI1 and on T3.5 on SPI/SPI1/SPI2.

Next up: add Transfer16 async to the LC side, and then probably add support for async transfer sizes greater than one SPI DMA channel can handle, which on
T3.x (except T3.5 SPI1/2) is 32K, and on T3.5 SPI1/2 is 512...

Don't remember how big on TLC... Will see again when I update.

KurtE · May 17, 2017

If anyone is playing along, earlier today I pushed a fix up for async transfer16 for T3.x. Where it had issues with write only operations or read only operations... Count issue.

To help test this code, I made a version of my ili9341_t3n library that uses the new stuff and remove all of the functionality that relied on knowing the T3.x SPI queues and as such SPIN, but instead rely on some of the new capabilities here.

Examples include things like:

a) fillScreen and fillRect use new transfer16 with count function. Both internal to the setAddr function which uses it to write the X0, X1 as one transfer and likewise Y0 and Y1...
And then to output the actual color data. It blasts it all out with the transfer16 by setting the Fill color and then passing NULL for buf and rtbuf, like:

Code:

		// Fill a w x h rectangle at (x, y) with one colour, without building a
		// pixel buffer.
		beginSPITransaction();
		setAddr(x, y, x+w-1, y+h-1);
		writecommand_cont(ILI9341_RAMWR);
		setDataMode();
		_spi->setTransferWriteFill(color);  // Set the transfer word.
		// NULL buf => send the fill word set above; NULL retbuf => ignore what
		// comes back.  w*h is the number of 16-bit words (one per pixel).
		_spi->transfer16(NULL, NULL, w*h);	// Send out the number of words called for.		
		endSPITransaction();

This is similarly done for the HLINE and VLINE.

a1) WriteRect and likewise for the Frame buffer update screen, are done like a) but pass in the pointer to data to write to screen.

b) For functions like drawing opaque text or gradient fills or... That also loop outputting data, I could not do the simple transfer, like a). So started off by using temporary buffer, and fill it and then call the transfer16, which helped, but left gaps between these groups. So now have two buffers and I use the Async transfer16 method to write out the buffers. So have it output one buffer and fill next and when first one completes it starts up the next one... Which is doing pretty well.

c) ReadRect: like b), I have it set up to double-buffer the reads. The first read is done synchronously, as I cannot do anything until it returns data. Then, if the first read did not read in all of the data for the rectangle, it issues an async read to start reading into the 2nd buffer while I process the data from the first buffer...

If anyone is curious I have included this WIP (maybe only for test) library and a test program... Actually the library has the graphic test program as well...

Now to test the Async Update screen code of the library which again will help test some of the Async support, plus will use to test on handling Async write whose length > max DMA transfer setting. Currently my update already breaks it up into 3 writes, but once I fix library it should be able to be done all internal to SPI...

Warning, if you play with my included test program it is setup to only work on T3.6 as it is the only one with enough memory to have full frame buffer for this display...

Maybe later I will hack up this library farther to maybe allow a smaller frame buffer which you can turn on for regions of the display. That when on it sets clipping rectangle to that area...
Then maybe on T3.5 for example could have a frame buffer for half the screen. Where you could do something like:
Set the frame buffer for top half of display, call your stuff to update display, then update the top half.
Set the frame buffer to lower half....

You could then potentially setup the code that when the DMA operation completes and you get called back, you could setup to do the next part... But that is totally off subject, but might be interesting and useful.

Edit: Updated library - The DMA (Async) test failed, needed to add calls to SPI.beginTransaction/endTransaction. Also found case in library when in frame buffer mode was still doing begin/end and one spi.Transfer that should not be done.

How best to manage multiple SPI busses?

Senior Member+

Senior Member+

Senior Member+

Senior Member+

Senior Member+

Senior Member+

Senior Member+

Senior Member+

Senior Member+

Well-known member

Senior Member+

Well-known member

Well-known member

Senior Member+

Senior Member+

Well-known member

Senior Member+

Senior Member+

Well-known member

Senior Member+

Well-known member

Senior Member+

Well-known member

Senior Member+

Senior Member+

Attachments