Asynchronous/Non-Blocking read from SD card on Teensy 4.1

Hi all,

I'm currently working on creating an open-source ODE (optical drive emulator) for the Sega Saturn. I've been prototyping using a Teensy 4.1 and an FPGA, and just managed to create a basic 'modchip', which streams the correct security ring data from the Teensy->FPGA->Saturn using non-blocking SPI. Here it is in action: https://www.youtube.com/watch?v=gNPMG-Da28I

Also, the code in case anyone is interested:

Code:
#include <SPI.h>
#include <EventResponder.h>

/* SPI clock in Hz, used for both SPI and SPI1 in setup(). */
#define SPI_SPEED 16000000
/* Chip-select pins, driven manually around every transfer on SPI and SPI1. */
const int SPI0_CS_PIN = 10;
const int SPI1_CS_PIN = 0;
/* Input pins (pulled down) whose rising edge sets cd_com_ready / cd_data_ready. */
const int CD_COM_READY_INT = 31;
const int CD_DATA_READY_INT = 32;
/* Set by the pin interrupt handlers, cleared by loop() once a transfer has been started. */
volatile byte cd_data_ready;
volatile byte cd_com_ready;

/* Completion event for asynchronous transfers on SPI (the data channel). */
EventResponder cd_data_event;
volatile bool cd_data_transfer_complete = false;

/* Values for cd_com_transfer_type: which kind of SPI1 transfer is in flight. */
#define COM_TRANSFER_GET_CURRENT_VALS 0
#define COM_TRANSFER_SET_STATUS 1

/* Completion event and bookkeeping for asynchronous transfers on SPI1 (the COM channel). */
EventResponder cd_com_event;
volatile bool cd_com_transfer_complete = false;
/* True from the start of an SPI1 transfer until its completion callback runs. */
volatile bool cd_com_transfer_active = false;
uint8_t cd_com_transfer_type = 0;
/* Set when a new status packet must be pushed over SPI1; consumed by loop(). */
bool cd_com_status_needs_update = false;

/* Size of each of the four SPI buffers. The transfers in this sketch use at most 27 bytes. */
#define BUFFER_SIZE 0x12000l   // More than 64K
/* Send buffers are heap-allocated in setup(); receive buffers are static DMAMEM arrays. */
uint8_t *cd_data_send_buf;
DMAMEM uint8_t cd_data_recv_buf[BUFFER_SIZE];
uint8_t *cd_com_send_buf;
DMAMEM uint8_t cd_com_recv_buf[BUFFER_SIZE];

/*
 * Completion callback for the asynchronous transfer on SPI (attached to
 * cd_data_event in setup()). Releases the SPI0 chip select and flags the
 * transfer as finished so loop() can process the received bytes.
 */
void cd_data_event_responder(EventResponderRef event_responder)
{
  digitalWriteFast(SPI0_CS_PIN, HIGH);
  cd_data_transfer_complete = true;
}

/*
 * Completion callback for the asynchronous transfer on SPI1 (attached to
 * cd_com_event in setup()). Releases the SPI1 chip select, flags the transfer
 * as finished, and clears cd_com_transfer_active so loop() may start the next
 * SPI1 transfer.
 */
void cd_com_event_responder(EventResponderRef event_responder)
{
  digitalWriteFast(SPI1_CS_PIN, HIGH);
  cd_com_transfer_complete = true;
  cd_com_transfer_active = false;
}

/* Interrupt handler for a rising edge on CD_DATA_READY_INT: asks loop() to start a data transfer. */
void cd_data_interrupt_func() {
  cd_data_ready = true;
}

/* Interrupt handler for a rising edge on CD_COM_READY_INT: asks loop() to start a COM transfer. */
void cd_com_interrupt_func() {
  cd_com_ready = true;
}

/*
 * One-time initialisation: serial console, both SPI buses, chip-select and
 * "ready" interrupt pins, the SPI send buffers and the completion events.
 * Finishes by sending the single command byte 0x01 to the FPGA over SPI1.
 */
void setup() {
  Serial.begin(2000000);
  // start the SPI library:
  SPI.begin();
  SPI1.begin();
  /* NOTE(review): both transactions are opened here and never ended in this sketch. */
  SPI.beginTransaction(SPISettings(SPI_SPEED, MSBFIRST, SPI_MODE0));
  SPI1.beginTransaction(SPISettings(SPI_SPEED, MSBFIRST, SPI_MODE0));

  /* Chip selects are driven by hand; idle level is HIGH. */
  pinMode(SPI0_CS_PIN, OUTPUT);
  pinMode(SPI1_CS_PIN, OUTPUT);
  digitalWriteFast(SPI0_CS_PIN, HIGH);
  digitalWriteFast(SPI1_CS_PIN, HIGH);

  /* A rising edge on either "ready" pin sets the matching flag for loop(). */
  pinMode(CD_DATA_READY_INT, INPUT_PULLDOWN);
  pinMode(CD_COM_READY_INT, INPUT_PULLDOWN);
  attachInterrupt(digitalPinToInterrupt(CD_DATA_READY_INT), cd_data_interrupt_func, RISING);
  attachInterrupt(digitalPinToInterrupt(CD_COM_READY_INT), cd_com_interrupt_func, RISING);

  cd_data_send_buf = (uint8_t *)malloc(BUFFER_SIZE);
  cd_com_send_buf = (uint8_t *)malloc(BUFFER_SIZE);
  if ((cd_data_send_buf == NULL) || (cd_com_send_buf == NULL))
  {
    /*
     * Without the send buffers every later memset/SPI transfer would write
     * through a null pointer, so report the failure and stop here.
     */
    Serial.println("Failed to allocate SPI send buffers");
    while (1)
      delay(1000);
  }

  memset(cd_data_send_buf, 0, BUFFER_SIZE);
  memset(cd_com_send_buf, 0, BUFFER_SIZE);

  /* Initial data packet: command byte 0x03 followed by 12 bytes of 0xff. */
  cd_data_send_buf[0] = 0x03;
  memset(&cd_data_send_buf[1], 0xff, 12);
  /* Initial COM packet: command byte 0x02 (the same byte loop() uses to fetch current values). */
  cd_com_send_buf[0] = 0x02;

  /* attachImmediate: the responders are called directly when the transfer completes. */
  cd_data_event.attachImmediate(&cd_data_event_responder);
  cd_com_event.attachImmediate(&cd_com_event_responder);
  cd_data_ready = 0;
  cd_com_ready = 0;
  cd_data_transfer_complete = 0;
  cd_com_transfer_complete = 0;

  digitalWriteFast(SPI1_CS_PIN, LOW);

  /* Tell FPGA to just transfer Saturn status data for now. */
  SPI1.transfer(0x01);

  digitalWriteFast(SPI1_CS_PIN, HIGH);

  delay(100);
}

/* Latest 13-byte command and status packets, copied out of cd_com_recv_buf in loop(). */
byte command_data[13];
byte status_data[13];
/* 0 = pass-through, 1 = set by check_seek_to_sec_ring(), 2 = set by check_read_at_sec_ring(). */
int mod_state = 0;
/* LBA currently being reported/emulated; advanced once per 1176 channels in mod_state 2. */
uint32_t cur_lba = 0;
/* Next channel index (0..1175) to generate within the current sector. */
uint32_t cur_channel = 0;

/*
 * Returns the 24-bit value stored big-endian in bytes 1..3 of the most
 * recently received command packet (command_data).
 */
static uint32_t get_current_lba()
{
  const uint32_t high = command_data[1] << 16;
  const uint32_t mid = command_data[2] << 8;
  const uint32_t low = command_data[3];

  return high | mid | low;
}

/* Packs a value into BCD form: tens digit in the high nibble, ones digit in the low nibble. */
static uint8_t to_bcd(uint8_t value)
{
  return (uint8_t)(((value / 10) << 4) + (value % 10));
}

/*
 * Splits an LBA into minute / second / frame and stores each field as BCD.
 *
 * lba:   frame count to convert (75 frames per second, 60 seconds per minute).
 * min:   receives the BCD minute.
 * sec:   receives the BCD second (0x00..0x59).
 * frame: receives the BCD frame (0x00..0x74).
 *
 * No lead-in offset is applied; LBA 0 maps to 00:00:00.
 */
static void lba_to_bcd_msf(uint32_t lba, uint8_t *min, uint8_t *sec, uint8_t *frame)
{
  const uint32_t frames_per_sec = 75;
  const uint32_t frames_per_min = 60 * frames_per_sec;   /* 4500 */
  const uint32_t in_minute = lba % frames_per_min;

  *min = to_bcd((uint8_t)(lba / frames_per_min));
  *sec = to_bcd((uint8_t)(in_minute / frames_per_sec));
  *frame = to_bcd((uint8_t)(in_minute % frames_per_sec));
}

/*
 *  Produces the three bytes sent for one channel of a security ring sector:
 *  buf[0] is the subcode byte (always 0x00) and buf[1], buf[2] are the two
 *  data bytes of the channel.
 *
 *  A frame is 24 bytes: 12 bytes for the left channel, 12 for the right.
 *  98 frames make up a sector, so a sector is 98 x 24 = 2352 bytes. Data is
 *  addressed by 2-byte channel here, giving 2352 / 2 = 1176 channels per
 *  sector (channel_cnt 0..1175).
 *
 *  Channels 6 and 7 carry the current position (cur_lba as BCD MSF) and the
 *  mode byte; every other channel is a fixed pattern.
 */
static void get_sec_ring_data_channel(uint32_t channel_cnt, uint8_t *buf)
{
  uint8_t m, s, f;
  uint8_t first, second;

  if (channel_cnt == 0)
  {
    first = 0xff;
    second = 0x00;
  }
  else if (channel_cnt <= 4)
  {
    first = 0xff;
    second = 0xff;
  }
  else if (channel_cnt == 5)
  {
    first = 0x00;
    second = 0xff;
  }
  else if (channel_cnt == 6)
  {
    lba_to_bcd_msf(cur_lba, &m, &s, &f);
    first = s ^ 0x80;
    second = m ^ 0x01;
  }
  else if (channel_cnt == 7)
  {
    lba_to_bcd_msf(cur_lba, &m, &s, &f);
    first = 0x02 ^ 0x60; /* CD mode 2, XOR against 0x60. */
    second = f;
  }
  else if (channel_cnt == 8)
  {
    first = 0x28;
    second = 0x00;
  }
  else if (channel_cnt == 9)
  {
    first = 0x1e;
    second = 0x28;
  }
  else if (channel_cnt == 10)
  {
    first = 0x08;
    second = 0x80;
  }
  else if (channel_cnt == 11)
  {
    first = 0x06;
    second = 0x48;
  }
  else if (channel_cnt <= 1173)
  {
    first = 0x59;
    second = 0xa8;
  }
  else if (channel_cnt == 1174)
  {
    first = 0xdd;
    second = 0x72;
  }
  else if (channel_cnt == 1175)
  {
    first = 0x99;
    second = 0xe5;
  }
  else
  {
    /* Out of range; shouldn't hit this. */
    first = 0x00;
    second = 0x00;
  }

  /* Subcode is apparently irrelevant, or ignored by Saturn. */
  buf[0] = 0x00;
  buf[1] = first;
  buf[2] = second;
}

/*
 * Fills buf with four consecutive channels starting at start_channel.
 * Each channel occupies 3 bytes, so 12 bytes of buf are written.
 */
static void get_sec_ring_data_channels(uint32_t start_channel, uint8_t *buf)
{
  uint8_t *out = buf;
  uint32_t n;

  for (n = 0; n < 4; n++, out += 3)
    get_sec_ring_data_channel(start_channel + n, out);
}

/*
 * Checks whether the latest command packet has command byte 0x02 with an LBA
 * of at least 0x3ef00 (handled here as a seek into the security ring).
 * On a match, records the LBA in cur_lba, requests a status update and
 * returns true; otherwise returns false and changes nothing.
 */
static bool check_seek_to_sec_ring()
{
  if (command_data[0] != 0x02)
    return false;

  const uint32_t target = get_current_lba();
  if (target < 0x3ef00)
    return false;

  cur_lba = target;
  cd_com_status_needs_update = true;
  Serial.println("mod_state = 1");
  return true;
}

/*
 * Checks whether the latest command packet has command byte 0x06 with an LBA
 * of at least 0x3df00 (handled here as a read inside the security ring).
 * On a match, positions cur_lba four blocks before the requested LBA, rewinds
 * cur_channel, requests a status update and returns true; otherwise returns
 * false and changes nothing.
 */
static bool check_read_at_sec_ring()
{
  if (command_data[0] != 0x06)
    return false;

  const uint32_t requested = get_current_lba();
  if (requested < 0x3df00)
    return false;

  /* Need to start 4 blocks before requested LBA. */
  cur_lba = requested - 4;
  cur_channel = 0;
  cd_com_status_needs_update = true;
  Serial.println("mod_state = 2");
  return true;
}

/*
 * Checks whether the latest command packet has a non-zero command byte and an
 * LBA below 0x3d000 (i.e. not in the range the security ring checks look at).
 * On a match, resets cur_lba to 0 and returns true; otherwise returns false.
 */
static bool check_normal_command()
{
  const bool has_command = (command_data[0] != 0x00);

  if (!has_command || (get_current_lba() >= 0x3d000))
    return false;

  cur_lba = 0;
  return true;
}

/*
 * Computes the check byte of a status packet: the bitwise complement of the
 * 8-bit (wrapping) sum of buf[0]..buf[10]. Bytes from index 11 on are ignored.
 */
static uint8_t get_status_parity(uint8_t *buf)
{
  const uint8_t *end = buf + 11;
  const uint8_t *p;
  uint8_t sum = 0;

  for (p = buf; p != end; p++)
    sum += *p;

  return (uint8_t)~sum;
}

/*
 * Builds the 13-byte status packet sent while in mod_state 1 (seek to the
 * security ring): a fixed 3-byte header, cur_lba as three big-endian bytes,
 * four bytes of 9, a zero, the parity of bytes 0..10, and a trailing zero.
 */
static void get_seek_to_sec_ring_status(uint8_t *buf)
{
  const uint32_t lba = cur_lba;
  uint8_t k;

  buf[0] = 0xb6;
  buf[1] = 0x44;
  buf[2] = 0xf1;

  /* 24-bit LBA, most significant byte first. */
  buf[3] = (uint8_t)(lba >> 16);
  buf[4] = (uint8_t)(lba >> 8);
  buf[5] = (uint8_t)lba;

  for (k = 6; k < 10; k++)
    buf[k] = 9;
  buf[10] = 0;

  /* Parity covers bytes 0..10, so it must be computed after they are filled in. */
  buf[11] = get_status_parity(buf);
  buf[12] = 0;
}

/*
 * Builds the 13-byte status packet sent while in mod_state 2 (reading the
 * security ring): a fixed 4-byte header, cur_lba as BCD minute/second/frame,
 * the byte 4, the same BCD MSF again, the parity of bytes 0..10, and a
 * trailing zero.
 */
static void get_read_at_sec_ring_status(uint8_t *buf)
{
  uint8_t msf[3];
  uint8_t k;

  lba_to_bcd_msf(cur_lba, &msf[0], &msf[1], &msf[2]);

  buf[0] = 0x36;
  buf[1] = 0x01;
  buf[2] = 0xaa;
  buf[3] = 0x01;
  buf[7] = 4;

  /* The MSF triple appears twice: at bytes 4..6 and again at bytes 8..10. */
  for (k = 0; k < 3; k++)
  {
    buf[4 + k] = msf[k];
    buf[8 + k] = msf[k];
  }

  /* Parity covers bytes 0..10, so it must be computed after they are filled in. */
  buf[11] = get_status_parity(buf);
  buf[12] = 0;
}

/*
 * Main loop. Each pass:
 *   1. starts an asynchronous 13-byte data transfer on SPI if the FPGA raised
 *      the data-ready line;
 *   2. pushes a pending status packet over SPI1 (14 bytes) if no SPI1 transfer
 *      is in flight;
 *   3. fetches the current status/command packets over SPI1 (27 bytes) if the
 *      FPGA raised the com-ready line and no SPI1 transfer is in flight;
 *   4. processes at most one completed transfer (data first, then COM).
 *
 * The completion flags are set by the EventResponder callbacks, which also
 * release the chip-select lines.
 */
void loop() {

  int i;

  if (cd_data_ready)
  {
      cd_data_transfer_complete = false;
      digitalWriteFast(SPI0_CS_PIN, LOW);
      SPI.transfer(cd_data_send_buf, cd_data_recv_buf, 13, cd_data_event);
      /* NOTE(review): a rising edge arriving before this line is lost; confirm the FPGA cannot re-signal that quickly. */
      cd_data_ready = false;
  }

  if (cd_com_status_needs_update && !cd_com_transfer_active)
  {
      cd_com_transfer_complete = false;
      /* Command byte 0x03 followed by the 13-byte status packet for the current state. */
      cd_com_send_buf[0] = 0x03;
      if (mod_state == 1)
        get_seek_to_sec_ring_status(&cd_com_send_buf[1]);
      else if (mod_state == 2)
        get_read_at_sec_ring_status(&cd_com_send_buf[1]);

      /* Need to set send buffer with correct status. */
      cd_com_transfer_type = COM_TRANSFER_SET_STATUS;
      cd_com_transfer_active = true;
      digitalWriteFast(SPI1_CS_PIN, LOW);
      SPI1.transfer(cd_com_send_buf, cd_com_recv_buf, 14, cd_com_event);
      cd_com_status_needs_update = false;
  }

  if (cd_com_ready && !cd_com_transfer_active)
  {
      cd_com_transfer_complete = false;
      digitalWriteFast(SPI1_CS_PIN, LOW);
      memset(cd_com_recv_buf, 0, 26);
      memset(&cd_com_send_buf[1], 0, 26);
      /* Command byte 0x02: read back 13 status bytes followed by 13 command bytes. */
      cd_com_send_buf[0] = 0x02;

      /*
       * Record the transfer type and mark the bus busy BEFORE starting the
       * transfer. The completion callback clears cd_com_transfer_active; if
       * it ran before these assignments, the flag would be left stuck at true
       * and no further SPI1 transfer would ever be started.
       */
      cd_com_transfer_type = COM_TRANSFER_GET_CURRENT_VALS;
      cd_com_transfer_active = true;
      SPI1.transfer(cd_com_send_buf, cd_com_recv_buf, 27, cd_com_event);
      cd_com_ready = false;
  }
  
  if (cd_data_transfer_complete || cd_com_transfer_complete)
  {
      if (cd_data_transfer_complete)
      {
          /* Debug dump of the received data bytes while the last status packet starts with 0xb6. */
          if (status_data[0] == 0xb6)
          {
              Serial.println("data");
              for (i = 0; i < 12; i++)
              {
                  Serial.println(cd_data_recv_buf[i + 1], HEX);
              }
              Serial.println("");
          }

          /* if cur_channel == 1176, update lba/msf, update status, loop back around. */
          if (mod_state == 2)
          {
            /* Replace the next outgoing 12 bytes with generated security ring data. */
            get_sec_ring_data_channels(cur_channel, &cd_data_send_buf[1]);
            cur_channel += 4;
            if (cur_channel == 1176)
            {
              /* Update LBA and status. */
              cur_lba++;
              cd_com_status_needs_update = true;
              cur_channel = 0;
            }
          }
          else
            /* Pass-through: echo the bytes just received back out on the next transfer. */
            memcpy(&cd_data_send_buf[1], &cd_data_recv_buf[1], 12);
          cd_data_transfer_complete = false;
      }
      else if (cd_com_transfer_complete && (cd_com_transfer_type == COM_TRANSFER_GET_CURRENT_VALS))
      {
          Serial.println("com");
          for (i = 0; i < 26; i++)
          {
              Serial.print(cd_com_recv_buf[i + 1], HEX);
              Serial.print(" ");
              if (i == 12)
                Serial.println("");
          }
          Serial.println("");
          Serial.println("");
          /* Received layout: [0] unused, [1..13] status packet, [14..26] command packet. */
          memcpy(status_data, &cd_com_recv_buf[1], 13);
          memcpy(command_data, &cd_com_recv_buf[14], 13);
          if (check_seek_to_sec_ring())
            mod_state = 1;
          else if (check_read_at_sec_ring())
            mod_state = 2;
          else if (check_normal_command())
          {
            if (mod_state != 0)
            {             
              /* Leaving the security ring states: send 0x01, the same byte setup() sends. */
              digitalWriteFast(SPI1_CS_PIN, LOW);
              SPI1.transfer(0x01);
              digitalWriteFast(SPI1_CS_PIN, HIGH);
            }

            mod_state = 0;
          }
          
          cd_com_transfer_complete = false;
      }
      else if (cd_com_transfer_complete && (cd_com_transfer_type == COM_TRANSFER_SET_STATUS))
      {
          Serial.println("com status updated");
          for (i = 0; i < 13; i++)
          {
              Serial.print(cd_com_send_buf[i + 1], HEX);
              Serial.print(" ");

          }
          Serial.println("");
          Serial.println("");
          cd_com_transfer_complete = false;
      }
  }
}

So, now the next step is to begin transferring data from an SD card to the Saturn. I'd like to do this in a non-blocking way, just like I'm doing currently with the two SPI interfaces. I'm not having much luck finding example code that uses the SD card interface on the Teensy 4.1 in a non-blocking manner, and was hoping someone could point me to some if it exists.

I'm also curious if there will be a limit as to how many non-blocking transfers I can have going at once. I'm already using two SPI interfaces in this way, so I'm hoping I'll be able to have the SD card working too. I can likely cut that down to just one SPI interface without too much trouble if there's a limitation there.

Thanks for reading. :)
 
Well, doing some digging through the forum, I now see why I can't find any examples of this sort of thing: https://forum.pjrc.com/threads/7004...for-teensy-4-1?p=304278&viewfull=1#post304278

In an ideal world of infinite programming hours in every day, we'd have a SD library with non-blocking API, so you could begin a write to the card and later get a function called when it completes, or call a function to check whether the write is still in progress. But with SD & SdFat, writes are blocking. So to get sustained speed, you need interrupts or DMA to collect the incoming data into a fairly large buffer, so you don't lose data while the SD library is waiting on the card to complete writing.

So, it sounds like I'll need to implement this sort of thing myself. I'm probably going to give that a shot and report back here with how it goes. :)
 
Took me a while to get some free time to try to get this working, but I've spent the last week hacking on this, and managed to implement asynchronous file reading in SdFat for Fat32/16/12 using ADMA2. The code needs a bit of cleanup, but once I feel like it's not too ugly I'll push the branch to my GitHub repository.

There are a couple tricky parts here that will need to be solved for this to be fully usable:

1. Fragmented files. If a read/write is requested on a file, and the range requested to read/write on the file is on a set of clusters that aren't contiguous sectors on the SD card, it will need to be split up across multiple ADMA2 transfers. Apparently ADMA3 solves this (it allows you to specify an SD card command alongside ADMA2 descriptors) but since we've only got ADMA2, we'll need a way to extend the API to account for this. My idea is, use the status value in the event responder to indicate whether or not a continue is needed. I.e call something like:

Code:
triggerEvent(FILE_TRANSFER_NEEDS_CONTINUE);

and then the user would call a method that continues the transfer, something like

Code:
file.continueTransfer();

until the event responder gives a status of transfer complete.

2. Ideally, we'd keep the old synchronous methods of either FIFO/DMA, and asynchronous reads would be an extension using a separate method. I feel like this would allow for the best of both worlds, FIFO SDIO which should be fast for small, blocking transfers (which we need to discover which clusters/sectors to read on the SD card for ADMA) and then asynchronous methods using ADMA. For these to coexist, we'll need some way to make sure they don't interfere with each other.

Anyways, hopefully I'll get some more free time next month to get this cleaned up. Posting this here in the off chance anyone has any thoughts on this. :)
 
Really interested in this, hope someday it can become part of the SD / SdFat library for everyone.

Right now I have several other projects going, so can't do much for a while... other than say I'm really interested. :)
 
@Conmanx360 Amazing that you got this working. I am in the process of figuring this out for reads and writes. I totally get the messy code commit problem, but would you be willing to post some snippets of how you set up and trigger the ADMA transfers?
 
Whoops, sorry for the late reply, I guess I didn't have email notifications set up for this thread. :) Uploaded a branch with my hacky work here: https://github.com/Conmanx360/SdFat/tree/WIP/async-read-v1

So far it only supports FAT32 and asynchronous reading, it also won't work for fragmented files right now. But, it works with both blocking FIFO and blocking DMA, and allows asynchronous reads on both setups. Here's the benchmarks from my modified TeensySdioDemo:

Code:
FIFO SDIO mode, asynchronous reads.

size,write,read
bytes,KB/sec,KB/sec
512,20479.80,933.48
1024,20495.06,1711.77
2048,20706.94,3144.24
4096,20648.58,4556.46
8192,20462.61,7171.67
16384,20648.27,11371.42
32768,20660.32,14161.65
65536,20260.97,17690.02
131072,20549.33,20243.32

totalMicros  25458973
yieldMicros  305391
yieldCalls   208
yieldMaxUsec 8974
Done

FIFO SDIO mode, synchronous reads.

size,write,read
bytes,KB/sec,KB/sec
512,20064.74,22333.29
1024,19865.23,22541.96
2048,20727.10,22699.15
4096,20796.62,22752.53
8192,20495.71,22777.24
16384,20721.31,22788.32
32768,20768.66,22792.22
65536,20390.89,22786.40
131072,20666.89,22784.17

totalMicros  7010495
yieldMicros  310492
yieldCalls   208
yieldMaxUsec 9337
Done

DMA SDIO mode, asynchronous reads.

size,write,read
bytes,KB/sec,KB/sec
512,538.89,936.18
1024,783.09,2183.26
2048,1633.95,3967.37
4096,4546.67,5080.45
8192,6102.41,11530.75
16384,8362.93,12872.34
32768,7329.16,13503.42
65536,7413.00,17327.72
131072,7779.26,20071.37

totalMicros  58460370
yieldMicros  21893651
yieldCalls   33435
yieldMaxUsec 11083
Done

DMA SDIO mode, synchronous reads.

size,write,read
bytes,KB/sec,KB/sec
512,425.77,1695.76
1024,923.66,2252.27
2048,1879.31,3953.39
4096,3547.92,6839.38
8192,5714.22,10722.49
16384,13146.31,13364.21
32768,6844.80,13579.69
65536,6934.56,13432.73
131072,7343.37,13579.14

totalMicros  56585764
yieldMicros  41248223
yieldCalls   66459
yieldMaxUsec 28351
Done

Asynchronous reads start to catch up to synchronous FIFO reads once the transfer gets pretty large.

It seems to work well enough for what I want to use it for, but I would like to continue to clean this up so it's not so hacky and more usable by others. On my list of things to do:
-Add support for asynchronous writes.
-Add support for asynchronous reads/writes on fragmented files.
-Add support for exFat.
 
Oh, and a few benchmarks when reading from my CD file:

Code:
Starting read at 1b90, reading 174048 bytes
Starting sequential
Took 0.009705 seconds
Starting asynchronous transfer
Took 0.007942 seconds

Starting read at 2c370, reading 174048 bytes
Starting sequential
Took 0.008420 seconds
Starting asynchronous transfer
Took 0.007948 seconds

Starting read at 56b50, reading 174048 bytes
Starting sequential
Took 0.008420 seconds
Starting asynchronous transfer
Took 0.007949 seconds

These are comparing FIFO blocking reads to ADMA2 non-blocking ones, ADMA2 seems to be consistently faster although not by a crazy amount.
 
Super impressive work on the ADMA2 stuff and in general! I am pretty sure I could not have figured this out even after reading the manual on ADMA2. Not straightforward at all and requires understanding the SD card data formats as well. Even with your read code I am not sure I could extend it to writes, but I might take a stab at it.

Thanks so much for sharing :) Your benchmarks confirm that async read/write will solve the problems I am having, but my application needs the writes so I will definitely be on the lookout for any updates to your fork.
 
Thanks, and no problem. :)

The ADMA2 stuff is actually not so bad, and to be honest after I came to understand it more we could probably do just fine without it. From what I can tell, it's mainly designed to help out in cases where you have multiple memory pages that are non-contiguous, like you would in a system with an MMU. That's not the case here, we have contiguous blocks of memory allocated, so we could get away with the SDMA system just fine probably. The one advantage ADMA2 gives is that we can add extra descriptors to start reading data mid-sector, i.e if the file data we want starts at byte 256 of a 512 byte sector, we can just add an initial descriptor that reads 256 bytes into a scratch buffer and begin reading mid-sector. This really only saves 1 or 2 sector reads, assuming we start and end our read/write mid-sector, so even then it's probably not a crazy speedup. ADMA3 would be nice though, since it'd solve the file fragmentation issue completely.

Like you said, the SD card data format stuff is where things actually get tricky. Figuring out the actual sector on the SD card where the data you want is located takes some work, and then figuring out how your data will fit into those sectors/file system clusters is also a bit of work. In the case of writes, if you want to start a write in the middle of an SD card sector, you're first going to have to read the sector in order to get the first half of the data that you don't want to overwrite. That would be stored into the scratch buffer we're using for ADMA2 alignment. I think it won't be too hard to add this, but I guess I'll see if/when I get to it.

All of this is technically feasible, the only thing I still haven't worked out is API design. I'll probably start working on that more once the underlying functionality is all working.
 
Really intrigued by this, I might have a good use for this in my video playing app, which currently does blocking reads and blocking buffer transfers to a screen. However, my cards are 64Gb+ and formatted exFAT. Is there a limitation or a reason that this wouldn't work on exFAT, or just that you didn't have the need to implement?
 
As far as I know, there shouldn't be any limitation or reason this same method wouldn't work on exFAT. The only reason I started with Fat32 was that I assumed it would be simpler to do first and it's what the SD card I've been working with is formatted as. Just from doing some surface level research, it might even be easier since it seems like exFAT has features that make fragmented files much less likely.
 
@Conmanx360 - I did some testing, and this works so well, got some great improvements in my application!

Quick background - I pre-encode video files to have a repeating set of video frame data in the format my screen buffer takes, followed by the audio data, repeated per frame. Then it's just a process of a loop per frame, which is a loop of reading chunks of video data into a partial screen buffer and flushing to screen, until that frame is complete, then read the audio data for the frame, then sit and wait for a period (frame_rate_time - read_time - buffer_time) before going on to the next frame, to maintain the encoded frame rate. The SDcard reads and buffer flush are blocking.

I found a 32Gb microSD card, moved some of my smaller (<4Gb) video files to it, modified my code to use SdFs and FsFile (instead of SdExFat and ExFile), then made a quick and dirty set of changes to support (mostly) async reads for the video frame data, and did some timing......

Video resolution 800x420, bit depth 24bpp, encoded frame rate 23.98fps
[table="width: 100%", align: left]
[tr]
[td]Regular[/td]
[td]AVG: frame: 40562µS (24.65fps), read: 24090µS + 723µS (40.9MB/Sec), buffer: 14639µS, house: 1022µS, wait: 1121µS, asyncWait: 0µS[/td]
[/tr]
[tr]
[td]Async blocking[/td]
[td]AVG: frame: 45877µS (21.80fps), read: 29155µS + 709µS (37.0MB/Sec), buffer: 14628µS, house: 1030µS, wait: -4258µS, asyncWait: 27165µS[/td]
[/tr]
[tr]
[td]Async[/td]
[td]AVG: frame: 32261µS (31.00fps), read: 28816µS + 724µS (37.1MB/Sec), buffer: 14677µS, house: 1013µS, wait: 9483µS, asyncWait: 12307µS[/td]
[/tr]
[/table]

Video resolution 800x310, bit depth 16bpp, encoded frame rate 23.98fps:
[table="width: 100%", align: left]
[tr]
[td]Regular[/td]
[td]AVG: frame: 25536µS (39.16fps), read: 12099µS + 706µS (39.3MB/Sec), buffer: 11622µS, house: 1021µS, wait: 16124µS, asyncWait: 0µS[/td]
[/tr]
[tr]
[td]Async blocking[/td]
[td]AVG: frame: 27791µS (35.98fps), read: 14279µS + 752µS (36.7MB/Sec), buffer: 11538µS, house: 1014µS, wait: 13872µS, asyncWait: 13692µS[/td]
[/tr]
[tr]
[td]Async[/td]
[td]AVG: frame: 18387µS (54.39fps), read: 14238µS + 711µS (36.5MB/Sec), buffer: 11597µS, house: 1001µS, wait: 23294µS, asyncWait: 3472µS[/td]
[/tr]
[/table]

"Regular" is my original blocking read code, "Async blocking" is a drop-in of the asyncRead immediately followed by a wait for the read to complete (to give a comparison of read speed), and then "Async" is where all but the first chunk of each frame is read asynchronously while the buffer flushes the previous one.....

the key metric is the first part of the stats, eg - frame: 18387µS (54.39fps) - which is the time spent doing work and the effective frame rate possible. The "wait:" metric is how long the frame loop spends waiting at the end before it's time to start the next frame, so when this is higher, it means the frame processing is faster.

The numbers pretty much "add up" - the asyncRead is slightly slower than a regular read for the buffer size (approx 96k), and the decrease in frame processing time for "Async" tests is mostly because the read happened in parallel to the blocking buffer flush

I can improve upon your already improved numbers because it was quicker (coding wise) to read the first chunk of a frame with a blocking read to have data in "hand" for the buffer flush. I can restructure my code more extensively to avoid this, but I'm not sure how the blocking audio read at the end of the video data reads will work if there is a queued async read for the first chunk of the next video frame. I also didn't use a double-buffer, the buffer flush from the buffer is faster than the next read into the same buffer, so there was very little corruption and was sufficient for the test as it won't impact the overall performance when implemented correctly.

So, all in all, this is really excellent, and I hope you keep working on this and bring it to the exFAT world, which I need due to the 4Gb file limit and general need for larger cards due to aforementioned bloated files :) Happy to answer any questions and potentially do more testing if I can help!

P.S. the data rates reported are higher than typical because I ran my Teensy CPU at 816Mhz and bump the SDclock to 99'000 Khz. My screen driver is 8 bit parallel, necessary for such a large resolution screen (800*480) and 24bpp. I can do 16 bit parallel too, but the gating factor is still the read, not the buffer transfer (in the stats above, the asyncWait time is how long the code waits for the previously triggered read to complete, after the buffer is flushed).
 
Update - I eliminated the single blocking read at the start of each frame, which gave a boost to FPS of 1-3fps, depending on the bitdepth of the file read. I also implemented a double buffer, which removed the video corruption, so everything works perfectly, just at ~50% higher potential frame rate :)

The only downside from the double buffer is DMA memory space - I don't have room in my app for a second buffer of equal size to my current one, alongside all the other needs it has for DMA memory outside the video player. I tried reducing the original buffer by half (insignificant performance hit under my previous way) to allow for two but, alas, as the asyncRead performance drops with smaller buffers, this doesn't help.

But still, tremendously happy with the potential here, I hope you can see this through to completion, at least on the read side!
 
@beermat - Very cool to hear it's working well for you, I just pushed a patch for ExFat async read to my github branch. ExFat seems to be pretty similar to Fat12/16/32, only big difference being the introduction of 64 bit values. I ran some local tests on files larger than 4GB and things seem to be working, but I also wouldn't be surprised if there are bugs to be found. :)

The question of how a blocking SD card read will behave if there's currently an ongoing async transfer is an API design thing I haven't really sorted out yet. My gut feeling is to just say "don't do that", but there's also the potential to block until the async transfer is done, then begin the blocking transfer. Or, return some error value to indicate the read didn't fail, but an async transfer is currently in progress. Not sure what's best there.

I probably won't get around to implementing writes until next month or so, not sure when. I've done a bit of preliminary research into it, and I don't think it'll be too hard, but it's one of those things I can't do over a weekend. Theoretically it's the same sequence of behaviors as reads, except we'll need to potentially allocate new clusters if we end up writing past the current end of file, but there's probably little caveats to that I'll need to handle. Anyways, glad to hear other people are finding this useful. :)
 
@Conmanx360 - Just dropped in your library, switched my FS back to ExFat (and flipped in my bigger exFAT card) and - everything seems to be working swimmingly!! Same ~50% effective frame rate improvement on video playing, due to being able to have the read for the next chunk of data async with my blocking frame buffer flush to the screen. The largest file I have is a hair over 10Gb - 4m:30s music video, pre-encoded at 800*420 resolution and 3 bytes (RGB) per pixel @ 23.98fps, worked like a charm playing at an (effective) frame rate of ~34fps -- with blocking reads, I could only barely make the required frame-rate, with some overclocking, so this is a big plus!

For now, my blocking audio data read, in the spirit of "don't do that", immediately follows the completion of the last async read for the video data for a frame, so I don't have any overlap there. It's a small read, in the stats above that show "read: AµS + BµS", the B is the audio read and is approx. 700µS, so no big deal.

Thanks very much, again, for this! I'm sure you will get a lot of requests for this, especially for async writes. For me, that'd be a nice to have, I have one area of my app that would benefit, but I know for others, it'd be of big benefit for their particular apps, as writing to the SDcard is on their critical performance path. Keep up the great work! If I stumble on any issues, I'll let you know, but so far, so good!
 
Hello,

We are using a Teensy 4.1 for our project and were wondering if you have had any chance to implement non-blocking write functionality.
Non blocking read works like a charm. Thanks for all the effort.
 
Back
Top