MarcelBIenvu
New member
Hi everybody,
Not sure whether this is the right place to post this...
I've been trying to patch SdFat, by Bill Greiman, to add ADMA capabilities and non-blocking read and write operations. Looking at the manual, there is room for improvement, such as SDR104 mode speeds (which require some tuning...).
I took bits from the NXP SDK for the RT1062 platform, but it *should* also work with the Teensy 3.6. I'm looking to modify objects from the Audio Library to work with these modifications, but I can't figure out why it works with the little benchmark program I made, yet freezes when I try to play with the play_sd_raw object.
Here is the benchmark program
And the results with a Kingston Canvas Go Plus:
kHzSdClk = 100000 in SdioTeensy.cpp
kHzSdClk = 200000 (writing is done with simple DMA at this speed)
As you can see, the time taken to send the command is around 3 us, so it can be relevant for audio processing (FrankB's Teensy-WavePlayer, or teensy-variable-playback for interpolation) and other interrupt-related situations. The 106 us for the 1-block read is due to CMD17 still using the blocking method, as the filesystem functions seem to rely on the classic transfer methods to avoid crashing. (I don't understand it fully.)
Here is the bit of code added at the end of SdioTeensy.cpp:
The rdWrSectors function is modified to route the read/write operations depending on the card speed set with kHzSdClk. At 100000, I can read and write, but at 200000 only read operations are possible, so you need to comment out CMD24 and CMD25 in the if() statement.
the isBusy function has been modified as well:
The way the wait for Transfer Complete is handled is surely a little fiddly, but I cannot figure out how to call sd.isBusy() another way.
And the related variable and function added among the declarations at the beginning of the file:
I can post the whole sources (PlatformIO format) if someone is interested in testing the modifications (a little help?), or just wants to look at them.
Sorry for the weird code or formatting; I'm definitely no developer, just trying to understand and apply.
Have a great evening!
Not sure whether this is the right place to post this...
I've been trying to patch SdFat, by Bill Greiman, to add ADMA capabilities and non-blocking read and write operations. Looking at the manual, there is room for improvement, such as SDR104 mode speeds (which require some tuning...).
I took bits from the NXP SDK for the RT1062 platform, but it *should* also work with the Teensy 3.6. I'm looking to modify objects from the Audio Library to work with these modifications, but I can't figure out why it works with the little benchmark program I made, yet freezes when I try to play with the play_sd_raw object.
Here is the benchmark program
Code:
#include <Entropy.h>
#include "SdFat.h"
// Size in bytes of each benchmark buffer; runTest() writes and reads back
// exactly this much data per pass.
const size_t BUF_DIM = 131072;
// SdFat volume object used for all card and filesystem access.
SdFs sd;
// Handle of the temporary benchmark file.
FsFile file;
// Reference pattern written to the card.
uint8_t toWrite[BUF_DIM];
// Destination for the read-back, compared against toWrite.
uint8_t toRead[BUF_DIM];
// Reports whether the SD card driver still considers the card busy.
// The benchmark polls this after each read/write call to time the
// completion of the transfer.
bool sdBusy() {
  auto* card = sd.card();
  const bool busy = card->isBusy();
  return busy;
}
void runTest() {
file.preAllocate(BUF_DIM);
Serial.println("Preallocate OK");
if (!file.open("teensyTest.bin", O_RDWR | O_CREAT)) Serial.println("open failed");
Serial.println("Open OK");
for(uint32_t i = 0; i < BUF_DIM; i++) {
toWrite[i] = random(255);
}
Serial.print("\n------------------------------------------------------------------/ WRITING /------------------------------------------------------------------\n\n");
Serial.printf("sectors\t\toctets\t\tcmd\t\twrite\t\tspeed\n");
uint8_t* bufW = (uint8_t*)toWrite;
for(uint32_t ns = 1; ns < 512; ns *= 2) {
uint32_t nbWr = BUF_DIM / (ns * 512);
int32_t cmdTime, writeTime, wr = 0;
int32_t cmdTot = 0, writeTot = 0;
file.rewind();
bufW = toWrite;
for(uint16_t j = 0; j < nbWr; j++) {
cmdTime = writeTime = micros();
wr = file.write(bufW, ns * 512);
cmdTot += micros() - cmdTime;
while(sdBusy()) {}
writeTot += micros() - writeTime;
bufW += ns * 512;
}
writeTot /= nbWr;
cmdTot /= nbWr;
float speed = (float)(ns * 512 / writeTot);
Serial.printf("%u\t\t%1.3f Ko\t%u us\t\t%u us\t\t%1.2f Mo/s\n", wr / 512, wr / 1000.0, cmdTot, writeTot, speed);
}
Serial.print("\n------------------------------------------------------------------/ READING /------------------------------------------------------------------\n\n");
uint8_t* buf = (uint8_t*)toRead;
Serial.printf("sectors\t\toctets\t\tcmd\t\tread\t\tspeed\t\t\terrors\t\tproofs\n");
int64_t errors = 0;
for(uint32_t ns = 1; ns < 512; ns *= 2) {
uint32_t nbRd = BUF_DIM / (ns * 512);
int32_t cmdTime, readTime, rd = 0;
int32_t cmdTot = 0, readTot = 0;
file.rewind();
buf = toRead;
memset(toRead, 0, BUF_DIM);
for(uint16_t j = 0; j < nbRd; j++) {
cmdTime = readTime = micros();
rd = file.read(buf, ns * 512);
cmdTot += micros() - cmdTime;
while(sdBusy()) {}
readTot += micros() - readTime;
buf += ns * 512;
}
errors = memcmp(toWrite, toRead, BUF_DIM);
readTot /= nbRd;
cmdTot /= nbRd;
float speed = (float)(ns * 512 / readTot);
Serial.printf("%u\t\t%1.3f Ko\t", rd / 512, rd / 1000.0);
Serial.printf("%u us\t\t%u us\t", cmdTot, readTot);
Serial.printf("\t%1.2f Mo/s\t\t%u", speed, errors);
Serial.printf("\t\t[%u - %u | %u - %u | %u - %u]\n", toWrite[ns], toRead[ns], toWrite[ns + 64], toRead[ns + 64], toWrite[ns+256], toRead[ns+256]);
}
file.close();
sd.remove("teensyTest.bin");
sd.ls();
Serial.print("\nOver and out...");
}
// Arduino entry point: waits for the serial monitor and a keypress,
// initialises the SD card in SDIO DMA mode, then runs the benchmark once.
void setup() {
  Serial.begin(9600);
  while (!Serial) {}
  Serial.println("Type to begin tests...");
  while (!Serial.available()) {}
  if (!sd.begin(SdioConfig(DMA_SDIO))) {
    Serial.println("begin failed");
    return;  // No card/volume: running the benchmark would only produce noise.
  }
  Serial.println("Init OK");
  Entropy.Initialize();
  runTest();
}
// Nothing to do repeatedly: the benchmark is run once from setup().
void loop() {}
And the results with a Kingston Canvas Go Plus:
kHzSdClk = 100000 in SdioTeensy.cpp
Code:
------------------------------------------------------------------/ WRITING /------------------------------------------------------------------
sectors octets cmd write speed
1 0.512 Ko 4 us 493 us 1.00 Mo/s
2 1.024 Ko 358 us 520 us 1.00 Mo/s
4 2.048 Ko 366 us 582 us 3.00 Mo/s
8 4.096 Ko 338 us 606 us 6.00 Mo/s
16 8.192 Ko 381 us 840 us 9.00 Mo/s
32 16.384 Ko 259 us 684 us 23.00 Mo/s
64 32.768 Ko 375 us 2031 us 16.00 Mo/s
128 65.536 Ko 443 us 1920 us 34.00 Mo/s
256 131.072 Ko 527 us 6935 us 18.00 Mo/s
------------------------------------------------------------------/ READING /------------------------------------------------------------------
sectors octets cmd read speed errors proofs
1 0.512 Ko 139 us 141 us 3.00 Mo/s 0 [21 - 21 | 238 - 238 | 195 - 195]
2 1.024 Ko 4 us 198 us 5.00 Mo/s 0 [140 - 140 | 23 - 23 | 189 - 189]
4 2.048 Ko 4 us 196 us 10.00 Mo/s 0 [181 - 181 | 226 - 226 | 34 - 34]
8 4.096 Ko 4 us 213 us 19.00 Mo/s 0 [158 - 158 | 177 - 177 | 166 - 166]
16 8.192 Ko 4 us 461 us 17.00 Mo/s 0 [149 - 149 | 78 - 78 | 50 - 50]
32 16.384 Ko 4 us 645 us 25.00 Mo/s 0 [139 - 139 | 201 - 201 | 47 - 47]
64 32.768 Ko 4 us 960 us 34.00 Mo/s 0 [7 - 7 | 24 - 24 | 86 - 86]
128 65.536 Ko 4 us 1640 us 39.00 Mo/s 0 [24 - 24 | 208 - 208 | 122 - 122]
256 131.072 Ko 4 us 3008 us 43.00 Mo/s 0 [47 - 47 | 86 - 86 | 34 - 34]
kHzSdClk = 200000 (writing is done with simple DMA at this speed)
Code:
------------------------------------------------------------------/ READING /------------------------------------------------------------------
sectors octets cmd read speed errors proofs
1 0.512 Ko 106 us 108 us 4.00 Mo/s 0 [21 - 21 | 238 - 238 | 195 - 195]
2 1.024 Ko 3 us 141 us 7.00 Mo/s 0 [140 - 140 | 23 - 23 | 189 - 189]
4 2.048 Ko 3 us 157 us 13.00 Mo/s 0 [181 - 181 | 226 - 226 | 34 - 34]
8 4.096 Ko 3 us 152 us 26.00 Mo/s 0 [158 - 158 | 177 - 177 | 166 - 166]
16 8.192 Ko 3 us 302 us 27.00 Mo/s 0 [149 - 149 | 78 - 78 | 50 - 50]
32 16.384 Ko 3 us 414 us 39.00 Mo/s 0 [139 - 139 | 201 - 201 | 47 - 47]
64 32.768 Ko 3 us 620 us 52.00 Mo/s 0 [7 - 7 | 24 - 24 | 86 - 86]
128 65.536 Ko 3 us 967 us 67.00 Mo/s 0 [24 - 24 | 208 - 208 | 122 - 122]
256 131.072 Ko 3 us 1649 us 79.00 Mo/s 0 [47 - 47 | 86 - 86 | 34 - 34]
As you can see, the time taken to send the command is around 3 us, so it can be relevant for audio processing (FrankB's Teensy-WavePlayer, or teensy-variable-playback for interpolation) and other interrupt-related situations. The 106 us for the 1-block read is due to CMD17 still using the blocking method, as the filesystem functions seem to rely on the classic transfer methods to avoid crashing. (I don't understand it fully.)
Here is the bit of code added at the end of SdioTeensy.cpp:
Code:
// ADMA2 descriptor layout constants.
// Attribute word: bit 0 = Valid, bit 1 = End, bit 2 = Int, bit 4 = Act1,
// bit 5 = Act2, upper 16 bits = transfer length in bytes.
// Every expression is parenthesised so the macros expand safely inside
// larger expressions.
#define ADMA2_DESCRIPTOR_LENGTH_SHIFT (16U)            /* position of the 16-bit length field */
#define ADMA2_DESCRIPTOR_MAX_LENGTH_PER_ENTRY (0xFFFFUL) /* largest value of the length field */
#define ADMA2_DESCRIPTOR_BUFFER_SIZE (32U)             /* descriptor table size, in 32-bit words */
#define ADMA2_DESCRIPTOR_BUFFER_ALIGN_SIZE (4U)        /* alignment of the descriptor table, in bytes */
#define ADMA2_DESCRIPTOR_END_FLAG (1U << 1U)           /* End bit: last descriptor of the table */
#define ADMA2_DESCRIPTOR_ACTIVITY2_FLAG (0x20)         /* Act2 bit */
#define ADMA2_DESCRIPTOR_TRANSFER_FLAG (0x21)          /* Act2 | Valid */
#define ADMA2_LENGTH_ALIGN (4U)                        /* required alignment of data address/length */
#define SDHC_PROCTL_DMAS_MASK (0x300U)                 /* DMA select field of SDHC_PROCTL */
// One entry of the ADMA2 descriptor table as filled in by
// SetAdma2TableConfig(): an attribute/length word followed by the address
// of the data the entry covers.
// NOTE(review): the table code assumes this struct is 64 bits, which holds
// only where pointers are 32 bits wide - confirm for the target.
struct Sdhc_Adma2_Descriptor {
uint32_t attribute; /*!< The control and status field */
const uint32_t *address; /*!< The address field */
};
// Backing storage for the ADMA2 descriptor table: ADMA2_DESCRIPTOR_BUFFER_SIZE
// 32-bit words, aligned to ADMA2_DESCRIPTOR_BUFFER_ALIGN_SIZE bytes.
// Passed as the table to SetAdma2TableConfig() for every non-blocking transfer.
static uint32_t HostDmaBuffer[ADMA2_DESCRIPTOR_BUFFER_SIZE] __attribute__((aligned(ADMA2_DESCRIPTOR_BUFFER_ALIGN_SIZE)));
bool SetAdma2TableConfig(uint32_t *table, uint32_t tableWords, const uint32_t *data, uint32_t dataBytes) {
const uint32_t *startAddress = data;
uint32_t entries;
uint32_t i;
Sdhc_Adma2_Descriptor *adma2EntryAddress;
if (table == NULL || tableWords == 0UL || data == NULL || dataBytes == 0UL) {
return false;
}
else if (((uint32_t)startAddress % ADMA2_LENGTH_ALIGN) != 0UL) {
return false;
} else {
if (dataBytes % sizeof(uint32_t) != 0U) {
dataBytes += sizeof(uint32_t) - (dataBytes % sizeof(uint32_t)); /* make the data length as word-aligned */
}
entries = ((dataBytes / ADMA2_DESCRIPTOR_MAX_LENGTH_PER_ENTRY) + 1U);
if (entries > ((tableWords * sizeof(uint32_t)) / sizeof(Sdhc_Adma2_Descriptor))) { // TODO sizeof() !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
return false;
} else {
adma2EntryAddress = (Sdhc_Adma2_Descriptor*)(uint32_t)(table);
for (i = 0; i < entries; i++) {
/* Each descriptor for ADMA2 is 64-bit in length */
if ((dataBytes - ((uint32_t)startAddress - (uint32_t)data)) <= ADMA2_DESCRIPTOR_MAX_LENGTH_PER_ENTRY) {
/* The last piece of data, setting end flag in descriptor */
adma2EntryAddress[i].address = startAddress;
adma2EntryAddress[i].attribute = ((dataBytes - ((uint32_t)startAddress - (uint32_t)data)) << ADMA2_DESCRIPTOR_LENGTH_SHIFT);
adma2EntryAddress[i].attribute |= ADMA2_DESCRIPTOR_TRANSFER_FLAG | ADMA2_DESCRIPTOR_END_FLAG;
} else {
adma2EntryAddress[i].address = startAddress;
adma2EntryAddress[i].attribute = (((ADMA2_DESCRIPTOR_MAX_LENGTH_PER_ENTRY / sizeof(uint32_t)) * sizeof(uint32_t)) << ADMA2_DESCRIPTOR_LENGTH_SHIFT);
adma2EntryAddress[i].attribute |= ADMA2_DESCRIPTOR_TRANSFER_FLAG;
startAddress += (ADMA2_DESCRIPTOR_MAX_LENGTH_PER_ENTRY / sizeof(uint32_t));
}
//Serial.printf("\t\t________________________________________________________________\n");
//Serial.printf("\t\t| Attribute field | Length field | Address field |\n");
//Serial.printf("\t\t|Valid|End|Int|x|Act 1|Act 2| 16bit length | 32-bit length |\n");
//Serial.printf("\t\t| %u | %u | %u |x| %u | %u |", bitRead(adma2EntryAddress[i].attribute, 0), bitRead(adma2EntryAddress[i].attribute, 1), bitRead(adma2EntryAddress[i].attribute, 2), bitRead(adma2EntryAddress[i].attribute, 4), bitRead(adma2EntryAddress[i].attribute, 5));
//Serial.printf(" %u | %u |\n", adma2EntryAddress[i].attribute >> 16, adma2EntryAddress[i].address);
}
SDHC_DSADDR = 0U;
SDHC_ADSADDR = (uint32_t)table;
}
}
return true;
}
// Selects the DMA engine used by the SDHC controller.
// enable == true  : DMAS field set to 2 (the value this file uses for ADMA2).
// enable == false : DMAS field cleared (the setting used by the classic
//                   simple-DMA path).
// The field is always cleared first so a previously selected mode can never
// be OR-ed together with the new one.
static void admaEnable(bool enable) {
  uint32_t proctl = SDHC_PROCTL & ~SDHC_PROCTL_DMAS_MASK;
  if (enable) {
    proctl |= SDHC_PROCTL_DMAS(2);
  }
  SDHC_PROCTL = proctl;
}
static bool sectorsReadWriteNonBlocking(uint32_t xfertyp, uint32_t sector, uint8_t* buf, size_t n) {
if (waitTimeout(isBusyCommandInhibit)) {
return false; // Caller will set errorCode.
}
uint32_t* temp = (uint32_t *)(uint32_t)buf; // 4 Bytes aligned pointer
if(!SetAdma2TableConfig(HostDmaBuffer, ADMA2_DESCRIPTOR_BUFFER_SIZE, temp, (n * 512))) return false; // Configure ADMA descriptor(s)
admaEnable(true); // Enable and select Adma
SDHC_IRQSIGEN |= SDHC_IRQSIGEN_MASK; // Enable interrupts Signal
SDHC_BLKATTR = SDHC_BLKATTR_BLKSIZE(512) | SDHC_BLKATTR_BLKCNT(n);
if(!cardCommand(xfertyp, sector)) return false;
return true;
}
/**
 * Read or write @p n sectors using DMA.
 *
 * CMD17/CMD18/CMD24/CMD25 are routed through the non-blocking ADMA2 path;
 * any other transfer type uses the classic blocking simple-DMA path.
 * CMD17 (single-sector read) is still waited for before returning so that
 * common filesystem functions keep working.
 *
 * @param xfertyp Transfer-type/command word.
 * @param sector  First sector of the transfer.
 * @param buf     Data buffer; must be 4-byte aligned.
 * @param n       Number of 512-byte sectors; must be non-zero.
 * @return true on success (for non-blocking transfers: the command was
 *         started), false on failure.
 */
static bool rdWrSectors(uint32_t xfertyp, uint32_t sector, uint8_t* buf, size_t n) {
  if ((3 & reinterpret_cast<uintptr_t>(buf)) || n == 0) {
    return sdError(SD_CARD_ERROR_DMA);
  }

  // A previous non-blocking transfer may still be running. Wait for it once:
  // retrying in a loop would spin forever if that transfer ended in error or
  // timed out, because waitDmaStatus() would then keep reporting failure.
  if (isAdma) {
    const bool previousOk = waitDmaStatus();
    setIsAdma(false);
    if (!previousOk) {
      return sdError(SD_CARD_ERROR_DMA);
    }
  }

  if (yieldTimeout(isBusyCMD13)) {
    return sdError(SD_CARD_ERROR_CMD13);
  }
  enableDmaIrs();

  const bool useAdma = xfertyp == CMD17_DMA_XFERTYP || xfertyp == CMD18_DMA_XFERTYP ||
                       xfertyp == CMD25_DMA_XFERTYP || xfertyp == CMD24_DMA_XFERTYP;
  if (!useAdma) {
    // Classic blocking simple-DMA transfer.
    admaEnable(false);
    SDHC_DSADDR = (uint32_t)buf;
    SDHC_BLKATTR = SDHC_BLKATTR_BLKCNT(n) | SDHC_BLKATTR_BLKSIZE(512);
    SDHC_IRQSIGEN = SDHC_IRQSIGEN_MASK;
    if (!cardCommand(xfertyp, m_highCapacity ? sector : 512*sector)) {
      return false;
    }
    return waitDmaStatus();
  }

  if (!sectorsReadWriteNonBlocking(xfertyp, sector, buf, n)) {
    return false;
  }
  setIsAdma(true);

  if (xfertyp == CMD17_DMA_XFERTYP) {
    // Blocking: ensures that common filesystem functions work. The result of
    // the wait is returned so a failed read is reported instead of retried
    // forever.
    const bool ok = waitDmaStatus();
    setIsAdma(false);
    return ok;
  }
  return true;
}
The rdWrSectors function is modified to route the read/write operations depending on the card speed set with kHzSdClk. At 100000, I can read and write, but at 200000 only read operations are possible, so you need to comment out CMD24 and CMD25 in the if() statement.
the isBusy function has been modified as well:
Code:
if (isAdma) {
if(m_dmaBusy && !(SDHC_IRQSTAT & (SDHC_IRQSTAT_TC | SDHC_IRQSTAT_ERROR))) return true;
setIsAdma(false);
return false;
}
if (m_sdioConfig.useDma()) {
return m_busyFcn ? m_busyFcn() : m_initDone && isBusyCMD13();
} else {
if (m_transferActive) {
if (isBusyTransferComplete()) {
return true;
} ...
And the related variable and function added among the declarations at the beginning of the file:
Code:
//==============================================================================
// Set while a non-blocking ADMA transfer has been started and its
// completion has not yet been observed.
bool isAdma = false;

// Updates the isAdma flag.
static void setIsAdma(bool enable) {
  isAdma = enable;
}
//==============================================================================
Sorry for the weird code or formatting; I'm definitely no developer, just trying to understand and apply.
Have a great evening!