Thanks everyone for the helpful suggestions. I dug in a bit more to understand the performance of different access patterns and to understand how data gets fetched from PSRAM.
It seems data is fetched from the PSRAM in 32-byte bursts. This lines up with the datasheet for one of the compatible PSRAM chips. You can observe this by looking at the timing of sequential reads: every 32nd read takes about 350 cycles (per ARM_DWT_CYCCNT), while each of the following 31 reads takes only about 11 cycles.
Looking at random-access reads, I expected a worst case of about 350 cycles per read, since such reads would rarely hit the cache. Instead, I frequently saw up to double that. I don't know what explains this. Any thoughts?
In my quest for the fastest worst-case random-access read, I ended up handwriting code that drives the IP Command interface manually (Section 27.5.9). This gave me the most consistent timing for random reads, at around 330 cycles. Almost all of that time (~290 cycles) is spent spin-waiting for FLEXSPI_INTR_IPRXWA. There's obviously a correlation between that time and the PSRAM clock. Any suggestions for improving this or speeding it up would be appreciated.
This is sadly still too slow for my application of using the Teensy to emulate a banked memory controller clocked at 1MHz. I may need a different approach.
My full code is included below. (timing code and the manual FlexSPI code)
Code:
#include <Arduino.h>
// 0x10000-byte (64 KiB) buffer that setup() fills and hands to the timing tests.
// NOTE(review): EXTMEM presumably places this buffer in the external PSRAM — confirm against the Teensy core.
EXTMEM uint8_t extmem_data[0x10000];
// Issues one command through the FLEXSPI2 IP-command registers for address
// `addr`, busy-waits for the response, and returns the low 8 bits of RFDR0.
// NOTE(review): `addr` is written to IPCR0 unchanged, so it is presumably an
// offset within the FlexSPI2 device rather than a CPU pointer — confirm.
inline static uint8_t flexspi2_read(uint32_t addr) {
// Address the command operates on.
FLEXSPI2_IPCR0 = addr;
// Select sequence index 5; no other IPCR1 fields are set.
// NOTE(review): index 5 is presumably the PSRAM read sequence configured
// elsewhere (not visible in this file) — confirm against the LUT setup.
FLEXSPI2_IPCR1 = FLEXSPI_IPCR1_ISEQID(5);
// Trigger the command.
FLEXSPI2_IPCMD = FLEXSPI_IPCMD_TRG;
// Spin until the IPRXWA flag is set in the interrupt register; this wait has
// no timeout.
while (!(FLEXSPI2_INTR & FLEXSPI_INTR_IPRXWA)) ;
// Fetch the first 32-bit word of the response.
uint32_t data = FLEXSPI2_RFDR0;
// Write the IPCMDDONE and IPRXWA bits back to INTR (presumably
// write-1-to-clear, acknowledging both flags — confirm in the reference manual).
FLEXSPI2_INTR = FLEXSPI_INTR_IPCMDDONE | FLEXSPI_INTR_IPRXWA;
// Implicitly truncated to uint8_t by the return type.
return data;
}
/**
 * Times `num_iters` single-byte reads issued through flexspi2_read() at random
 * addresses in [0, 0x10000) and prints one line per read.
 *
 * Each line has the form "<iteration>: (<address>) <cycles>", where <cycles>
 * is the ARM_DWT_CYCCNT delta measured across the read. Interrupts are
 * disabled only around the timed read itself.
 *
 * @param num_iters Number of reads to time; values <= 0 time nothing.
 * @return The byte held in slot 5 of the 10-entry result ring (0 if that slot
 *         was never written, or if the index buffer could not be allocated).
 */
uint32_t spi_test(int num_iters) {
  // Zero-initialised so result[5] is well defined even when num_iters < 6.
  uint8_t result[10] = {};

  // One uint32_t per iteration: the size must be scaled by the element size,
  // otherwise the fill loop below writes past the end of the allocation.
  uint32_t *idxes = nullptr;
  if (num_iters > 0) {
    idxes = static_cast<uint32_t *>(
        malloc(static_cast<size_t>(num_iters) * sizeof(uint32_t)));
    if (idxes == nullptr) {
      Serial.println("spi_test: out of memory");
      return 0;
    }
  }

  // fill in random indices to access
  for (int i = 0; i < num_iters; i++) {
    idxes[i] = static_cast<uint32_t>(random(0x10000));
  }

  for (int i = 0; i < num_iters; i++) {
    // Fetch the address up front so only the read itself is timed.
    const uint32_t addr = idxes[i];

    cli();
    const uint32_t start = ARM_DWT_CYCCNT;
    result[i % 10] = flexspi2_read(addr);
    const uint32_t elapsed = ARM_DWT_CYCCNT - start;
    sei();

    // Print with interrupts enabled again; the measurement is already taken,
    // and serial output should not run inside the interrupts-off window.
    Serial.printf("%d: (%lu) %lu\n", i, static_cast<unsigned long>(addr),
                  static_cast<unsigned long>(elapsed));
  }

  free(idxes);
  Serial.println("Done!");
  return result[5];
}
/**
 * Times `num_iters` single-byte reads from `data` at random offsets in
 * [0, 0x10000) and prints one line per read.
 *
 * Each line has the form "<iteration>: (<offset>) <cycles>", where <cycles>
 * is the ARM_DWT_CYCCNT delta measured across the read. Interrupts are
 * disabled only around the timed read itself.
 *
 * @param data      Buffer to read from; must be at least 0x10000 bytes long,
 *                  because offsets are drawn from random(0x10000).
 * @param num_iters Number of reads to time; values <= 0 time nothing.
 * @return The byte held in slot 5 of the 10-entry result ring (0 if that slot
 *         was never written, or if the index buffer could not be allocated).
 */
uint32_t rand_test(uint8_t *data, int num_iters) {
  // Zero-initialised so result[5] is well defined even when num_iters < 6.
  uint8_t result[10] = {};

  // One uint32_t per iteration: the size must be scaled by the element size,
  // otherwise the fill loop below writes past the end of the allocation.
  uint32_t *idxes = nullptr;
  if (num_iters > 0) {
    idxes = static_cast<uint32_t *>(
        malloc(static_cast<size_t>(num_iters) * sizeof(uint32_t)));
    if (idxes == nullptr) {
      Serial.println("rand_test: out of memory");
      return 0;
    }
  }

  // fill in random indices to access
  for (int i = 0; i < num_iters; i++) {
    idxes[i] = static_cast<uint32_t>(random(0x10000));
  }

  // Read through a volatile pointer so the compiler cannot move the timed
  // load outside the two cycle-counter reads or drop it.
  const volatile uint8_t *timed_src = data;

  for (int i = 0; i < num_iters; i++) {
    // Fetch the offset up front so only the read itself is timed.
    const uint32_t offset = idxes[i];

    cli();
    const uint32_t start = ARM_DWT_CYCCNT;
    result[i % 10] = timed_src[offset];
    const uint32_t elapsed = ARM_DWT_CYCCNT - start;
    sei();

    // Print with interrupts enabled again; the measurement is already taken,
    // and serial output should not run inside the interrupts-off window.
    Serial.printf("%d: (%lu) %lu\n", i, static_cast<unsigned long>(offset),
                  static_cast<unsigned long>(elapsed));
  }

  free(idxes);
  Serial.println("Done!");
  return result[5];
}
/**
 * Times `num_iters` single-byte reads from `data` at consecutive offsets
 * 0, 1, 2, ... and prints one line per read.
 *
 * Each line has the form "<offset>: <cycles>", where <cycles> is the
 * ARM_DWT_CYCCNT delta measured across the read. Interrupts are disabled only
 * around the timed read itself.
 *
 * @param data      Buffer to read from; must be at least `num_iters` bytes.
 * @param num_iters Number of reads to time; values <= 0 time nothing.
 * @return The byte held in slot 5 of the 10-entry result ring (0 if that slot
 *         was never written).
 */
uint32_t seq_test(uint8_t *data, int num_iters) {
  // Zero-initialised so result[5] is well defined even when num_iters < 6.
  uint8_t result[10] = {};

  // Read through a volatile pointer so the compiler cannot move the timed
  // load outside the two cycle-counter reads or drop it.
  const volatile uint8_t *timed_src = data;

  for (int i = 0; i < num_iters; i++) {
    cli();
    const uint32_t start = ARM_DWT_CYCCNT;
    result[i % 10] = timed_src[i];
    const uint32_t elapsed = ARM_DWT_CYCCNT - start;
    sei();

    // Print with interrupts enabled again; the measurement is already taken,
    // and serial output should not run inside the interrupts-off window.
    Serial.printf("%d: %lu\n", i, static_cast<unsigned long>(elapsed));
  }

  Serial.println("Done!");
  return result[5];
}
/**
 * Arduino entry point. Waits for Serial, prints the clock derived from
 * CCM_CBCMR, fills extmem_data with a repeating 0..255 byte pattern, then runs
 * the sequential, random and manual-FlexSPI timing tests (0x100 iterations
 * each), flushing and invalidating the data cache over the buffer before each.
 */
void setup() {
  // Number of bytes in extmem_data touched by the fill and the cache flushes.
  const uint32_t kBufferBytes = 0x10000;
  // Iterations passed to every timing test.
  const int kIterations = 0x100;

  // Spin until Serial evaluates true.
  while (!Serial) {
  }

  // CCM_CBCMR bits 9:8 pick one of four source clocks (MHz); bits 31:29 hold
  // the divider minus one.
  // NOTE(review): presumably this is the FlexSPI2 / PSRAM clock — confirm.
  const float clocks[4] = {396.0f, 720.0f, 664.62f, 528.0f};
  const float source_mhz = clocks[(CCM_CBCMR >> 8) & 3];
  const float divider = (float)(((CCM_CBCMR >> 29) & 7) + 1);
  const float frequency = source_mhz / divider;
  Serial.printf("CCM_CBCMR=%08X (%.1f MHz)\n", CCM_CBCMR, frequency);

  // Each byte receives the low 8 bits of its own offset.
  for (uint32_t offset = 0; offset != kBufferBytes; ++offset) {
    extmem_data[offset] = static_cast<uint8_t>(offset);
  }

  Serial.println("Seq Test");
  arm_dcache_flush_delete(static_cast<void *>(extmem_data), kBufferBytes);
  seq_test(extmem_data, kIterations);

  Serial.println("Rand Test");
  arm_dcache_flush_delete(static_cast<void *>(extmem_data), kBufferBytes);
  rand_test(extmem_data, kIterations);

  Serial.println("SPI TEST");
  arm_dcache_flush_delete(static_cast<void *>(extmem_data), kBufferBytes);
  spi_test(kIterations);
}
// Intentionally empty: all of the sketch's work happens once, in setup().
void loop() {
}