Onboard flash can be faster...

Back to the topic at hand: making flash writes not block interrupts...
I've replaced the latter half of eeprom.c (everything after the eeprom_write_block function) like so:
Code:
// Helpers for filling FlexSPI LUT registers. LUT0() builds one instruction in place;
// LUT1() builds the same kind of instruction shifted left by 16 so the two can be
// OR-ed together into a single LUT register write.
#define LUT0(opcode, pads, operand) (FLEXSPI_LUT_INSTRUCTION((opcode), (pads), (operand)))
#define LUT1(opcode, pads, operand) (FLEXSPI_LUT_INSTRUCTION((opcode), (pads), (operand)) << 16)
// Short aliases for the FLEXSPI_LUT_OPCODE_* values used in the sequences
#define CMD_SDR         FLEXSPI_LUT_OPCODE_CMD_SDR
#define ADDR_SDR        FLEXSPI_LUT_OPCODE_RADDR_SDR
#define READ_SDR        FLEXSPI_LUT_OPCODE_READ_SDR
#define WRITE_SDR       FLEXSPI_LUT_OPCODE_WRITE_SDR
#define MODE8_SDR       FLEXSPI_LUT_OPCODE_MODE8_SDR
#define DUMMY_SDR       FLEXSPI_LUT_OPCODE_DUMMY_SDR
#define JMP_ON_CS       FLEXSPI_LUT_OPCODE_JMP_ON_CS
// Short aliases for the FLEXSPI_LUT_NUM_PADS_* values (1 or 4 pads)
#define PINS1           FLEXSPI_LUT_NUM_PADS_1
#define PINS4           FLEXSPI_LUT_NUM_PADS_4

// Programs the FlexSPI look-up table (LUT) with the command sequences used by the
// flash routines in this file and selects sequence 0 (ReadContinuous) for AHB reads.
// Each sequence occupies four consecutive LUT registers, so the sequence labelled
// "LUT n" below starts at register FLEXSPI_LUT(4*n); unused registers of a sequence
// are written with 0.
// Interrupts are disabled while the table is rewritten and are unconditionally
// re-enabled before returning.
void setup_flexspi() {
  __disable_irq();

  // unlock the LUT
  FLEXSPI_LUTKEY = FLEXSPI_LUTKEY_VALUE;
  FLEXSPI_LUTCR = FLEXSPI_LUTCR_UNLOCK;
  asm volatile("dmb");

  // LUT 0: ReadContinuous
  FLEXSPI_LUT0 = LUT0(CMD_SDR, PINS1, 0xEB) | LUT1(ADDR_SDR, PINS4, 24);
  FLEXSPI_LUT1 = LUT0(MODE8_SDR, PINS4, 0x20) | LUT1(DUMMY_SDR, PINS4, 4);
  FLEXSPI_LUT2 = LUT0(READ_SDR, PINS4, 4) | LUT1(JMP_ON_CS, 0, 1);
  FLEXSPI_LUT3 = 0;
  // LUT 1: Read Status Register 1
  FLEXSPI_LUT4 = LUT0(CMD_SDR, PINS1, 0x05) | LUT1(READ_SDR, PINS1, 1);
  FLEXSPI_LUT5 = \
  FLEXSPI_LUT6 = \
  FLEXSPI_LUT7 = 0;
  // LUT 2: Read Status Register 2
  FLEXSPI_LUT8 = LUT0(CMD_SDR, PINS1, 0x35) | LUT1(READ_SDR, PINS1, 1);
  FLEXSPI_LUT9 = \
  FLEXSPI_LUT10 = \
  FLEXSPI_LUT11 = 0;
  // LUT 3: Disable continuous read
  FLEXSPI_LUT12 = LUT0(CMD_SDR, PINS1, 0xFF);
  FLEXSPI_LUT13 = \
  FLEXSPI_LUT14 = \
  FLEXSPI_LUT15 = 0;
  // LUT 4: WriteEnable
  FLEXSPI_LUT16 = LUT0(CMD_SDR, PINS1, 0x06);
  FLEXSPI_LUT17 = \
  FLEXSPI_LUT18 = \
  FLEXSPI_LUT19 = 0;
  // LUT 5: EraseSector (4K)
  FLEXSPI_LUT20 = LUT0(CMD_SDR, PINS1, 0x20) | LUT1(ADDR_SDR, PINS1, 24);
  FLEXSPI_LUT21 = \
  FLEXSPI_LUT22 = \
  FLEXSPI_LUT23 = 0;
  // LUT 6: Erase 32K
  FLEXSPI_LUT24 = LUT0(CMD_SDR, PINS1, 0x52) | LUT1(ADDR_SDR, PINS1, 24);
  FLEXSPI_LUT25 = \
  FLEXSPI_LUT26 = \
  FLEXSPI_LUT27 = 0;
  // (sequence 7, registers 28-31, is not written by this function)
  // LUT 8: Erase 64K
  FLEXSPI_LUT32 = LUT0(CMD_SDR, PINS1, 0xD8) | LUT1(ADDR_SDR, PINS1, 24);
  FLEXSPI_LUT33 = \
  FLEXSPI_LUT34 = \
  FLEXSPI_LUT35 = 0;
  // LUT 9: PageProgram (command and address on 1 pad, data written on 4 pads)
  FLEXSPI_LUT36 = LUT0(CMD_SDR, PINS1, 0x32) | LUT1(ADDR_SDR, PINS1, 24);
  FLEXSPI_LUT37 = LUT0(WRITE_SDR, PINS4, 1);
  FLEXSPI_LUT38 = \
  FLEXSPI_LUT39 = 0;
  // (sequences 10 and 11, registers 40-47, are not written by this function)
  // LUT 12: Erase/Program Suspend
  // Sequences 12, 13 and 14 are consecutive on purpose: flash_wait() points the AHB
  // read at sequence 12 with ARDSEQNUM(2) to run suspend -> read -> resume.
  FLEXSPI_LUT48 = LUT0(CMD_SDR, PINS1, 0x75);
  FLEXSPI_LUT49 = \
  FLEXSPI_LUT50 = \
  FLEXSPI_LUT51 = 0;
  // LUT 13: Read (non-continuous)
  // Same as sequence 0 except for the MODE8 operand (0xFF instead of 0x20) and
  // the missing JMP_ON_CS instruction.
  FLEXSPI_LUT52 = LUT0(CMD_SDR, PINS1, 0xEB) | LUT1(ADDR_SDR, PINS4, 24);
  FLEXSPI_LUT53 = LUT0(MODE8_SDR, PINS4, 0xFF) | LUT1(DUMMY_SDR, PINS4, 4);
  FLEXSPI_LUT54 = LUT0(READ_SDR, PINS4, 4);
  FLEXSPI_LUT55 = 0;
  // LUT 14: Erase/Program Resume
  FLEXSPI_LUT56 = LUT0(CMD_SDR, PINS1, 0x7A);
  FLEXSPI_LUT57 = \
  FLEXSPI_LUT58 = \
  FLEXSPI_LUT59 = 0;

  // set AHB read command to ReadContinuous
  FLEXSPI_FLSHA1CR2 = FLEXSPI_FLSHCR2_ARDSEQID(0);

  // re-lock the LUT to prevent changes
  FLEXSPI_LUTKEY = FLEXSPI_LUTKEY_VALUE;
  FLEXSPI_LUTCR = FLEXSPI_LUTCR_LOCK;
  asm volatile("dmb");

  __enable_irq();
}

// disable/remove when not testing
// With "#if 1" the function is only declared here and must be defined elsewhere;
// flash_wait() calls it while interrupts are enabled. With "#if 0" the call in
// flash_wait() expands to nothing.
#if 1
void nonblocking_flash_test(void);
#else
#define nonblocking_flash_test(...)
#endif

// Waits for the erase/program operation that the caller has just started to finish.
//
// Expected to be entered with interrupts disabled (every caller in this file calls
// __disable_irq() first). While waiting, interrupts are re-enabled and the AHB read
// sequence is switched to suspend -> read -> resume (LUT sequences 12..14). The
// status registers are then polled until BUSY is clear and no operation is left
// suspended. Finally the AHB read sequence and the chip-select interval are put
// back and FlexSPI is soft-reset. Interrupts are enabled on return.
static void flash_wait()
{
  // Remember the current chip-select interval so it can really be restored below.
  // (Previously the new value was OR-ed into the field and the field was zeroed at
  // the end, which is only correct if the field was 0 to begin with.)
  const uint32_t saved_csinterval = FLEXSPI_FLSHA1CR1 & FLEXSPI_FLSHCR1_CSINTERVAL_MASK;

  // tSUS = ~22.5us @ 133MHZ = 3000 cycles
  FLEXSPI_FLSHA1CR1 = (FLEXSPI_FLSHA1CR1 & ~FLEXSPI_FLSHCR1_CSINTERVAL_MASK)
                    | FLEXSPI_FLSHCR1_CSINTERVAL(3000);
  // change AHB read sequence to suspend->read->resume
  FLEXSPI_FLSHA1CR2 = FLEXSPI_FLSHCR2_ARDSEQNUM(2) | FLEXSPI_FLSHCR2_ARDSEQID(12);
  asm volatile("dmb");
  // changing CSINTERVAL won't work without a reset
  FLEXSPI_MCR0 |= FLEXSPI_MCR0_SWRESET;
  while (FLEXSPI_MCR0 & FLEXSPI_MCR0_SWRESET) ; // wait
  __enable_irq();

  // optional test hook (compiles to nothing when testing is disabled)
  nonblocking_flash_test();

  FLEXSPI_IPCR0 = 0;
  uint16_t status;
  while (1) {
    // for whatever reason, these commands cannot be sent as one sequence (second command doesn't finish)
    FLEXSPI_IPRXFCR = FLEXSPI_IPRXFCR_CLRIPRXF;
    // sequence 2: Read Status Register 2
    FLEXSPI_IPCR1 = FLEXSPI_IPCR1_ISEQID(2);
    FLEXSPI_IPCMD = FLEXSPI_IPCMD_TRG;
    while (!(FLEXSPI_INTR & FLEXSPI_INTR_IPCMDDONE));
    FLEXSPI_INTR = FLEXSPI_INTR_IPCMDDONE;
    // sequence 1: Read Status Register 1
    FLEXSPI_IPCR1 = FLEXSPI_IPCR1_ISEQID(1);
    FLEXSPI_IPCMD = FLEXSPI_IPCMD_TRG;
    while (!(FLEXSPI_INTR & FLEXSPI_INTR_IPCMDDONE));
    FLEXSPI_INTR = FLEXSPI_INTR_IPCMDDONE;
    // status = (first read << 8) | (second read): register 2 in the high byte,
    // register 1 in the low byte.
    // NOTE(review): the second byte is taken from RFDR2 rather than RFDR1,
    // presumably because each read lands in its own RX FIFO slot - confirm.
    status = FLEXSPI_RFDR0 << 8;
    status |= (uint8_t)FLEXSPI_RFDR2;
    // continue if BUSY is set
    if (status & 1) continue;
    // finish if there's no suspended operation
    if (!(status & 0x8000)) break;
    // otherwise manually trigger resume since apparently it doesn't always get triggered by AHB sequence...
    FLEXSPI_IPCR1 = FLEXSPI_IPCR1_ISEQID(14);
    FLEXSPI_IPCMD = FLEXSPI_IPCMD_TRG;
    while (!(FLEXSPI_INTR & FLEXSPI_INTR_IPCMDDONE));
    FLEXSPI_INTR = FLEXSPI_INTR_IPCMDDONE;
  }

  __disable_irq();
  // restore old values (only the CSINTERVAL field of CR1 is touched)
  FLEXSPI_FLSHA1CR1 = (FLEXSPI_FLSHA1CR1 & ~FLEXSPI_FLSHCR1_CSINTERVAL_MASK) | saved_csinterval;
  FLEXSPI_FLSHA1CR2 = FLEXSPI_FLSHCR2_ARDSEQID(0);

  // purge stale data from FlexSPI's AHB FIFO
  FLEXSPI_MCR0 |= FLEXSPI_MCR0_SWRESET;
  while (FLEXSPI_MCR0 & FLEXSPI_MCR0_SWRESET) ; // wait
  __enable_irq();
}

// Starts a flash-modifying operation: masks interrupts and kicks off an IP command
// that runs LUT sequence 3 followed by the next one (leave continuous-read mode,
// then write-enable). Returns without waiting; the caller must poll and clear
// FLEXSPI_INTR_IPCMDDONE before issuing the next IP command.
static void flash_begin()
{
  // two consecutive sequences starting at index 3
  const uint32_t leave_continuous_then_wren =
      FLEXSPI_IPCR1_ISEQID(3) | FLEXSPI_IPCR1_ISEQNUM(1);

  __disable_irq();
  FLEXSPI_IPCR0 = 0; // neither command takes an address
  FLEXSPI_IPCR1 = leave_continuous_then_wren;
  FLEXSPI_IPCMD = FLEXSPI_IPCMD_TRG;
}

// Program `len` bytes from `data` into flash at `addr`. The target area must
// already be erased to 0xFF.
//   addr - destination; only its low 24 bits are sent as the flash address
//   data - source buffer
//   len  - byte count, passed to the controller as the IP data size
// Blocks until flash_wait() reports the operation finished.
void eepromemu_flash_write(void *addr, const void *data, uint32_t len)
{
  const uint8_t *cursor = (const uint8_t *)data;
  uint32_t flags;

  flash_begin();
  // drop the cached copy of the destination while the write-enable command runs
  arm_dcache_delete(addr, len);
  while ((FLEXSPI_INTR & FLEXSPI_INTR_IPCMDDONE) == 0) {
    // wait for flash_begin()'s command
  }
  FLEXSPI_INTR = FLEXSPI_INTR_IPCMDDONE;

  // empty the TX FIFO, then launch sequence 9 (PageProgram)
  FLEXSPI_IPTXFCR = FLEXSPI_IPTXFCR_CLRIPTXF;
  FLEXSPI_IPCR0 = (uint32_t)addr & 0x00FFFFFF;
  FLEXSPI_IPCR1 = FLEXSPI_IPCR1_ISEQID(9) | FLEXSPI_IPCR1_IDATSZ(len);
  FLEXSPI_IPCMD = FLEXSPI_IPCMD_TRG;

  // feed the TX FIFO, at most 8 bytes per request, until the command completes
  for (;;) {
    flags = FLEXSPI_INTR;
    if (flags & FLEXSPI_INTR_IPCMDDONE) {
      break;
    }
    if (len == 0 || (flags & FLEXSPI_INTR_IPTXWE) == 0) {
      continue; // nothing left to send, or FIFO not asking for data
    }
    uint32_t chunk = (len > 8) ? 8 : len;
    memcpy((void *)&FLEXSPI_TFDR0, cursor, chunk);
    cursor += chunk;
    len -= chunk;
    FLEXSPI_INTR = FLEXSPI_INTR_IPTXWE; // acknowledge the request
  }
  FLEXSPI_INTR = FLEXSPI_INTR_IPCMDDONE | FLEXSPI_INTR_IPTXWE;
  flash_wait();
}

// erase a 4K sector
void eepromemu_flash_erase_sector(void *addr)
{
  // don't need flash_begin, all three commands are consecutive and can be issued as one sequence
  __disable_irq();
  FLEXSPI_IPCR0 = (uint32_t)addr & 0x00FFF000;
  // execute LUTs 3 + 4 + 5
  FLEXSPI_IPCR1 = FLEXSPI_IPCR1_ISEQNUM(2) | FLEXSPI_IPCR1_ISEQID(3);
  FLEXSPI_IPCMD = FLEXSPI_IPCMD_TRG;
  arm_dcache_delete((void*)((uint32_t)addr & 0xFFFFF000), 4096); // purge data from cache
  flash_wait();
}

// Erase the 32K block that contains `addr` (the address is rounded down to a 32K
// boundary). Blocks until flash_wait() reports the erase finished.
void eepromemu_flash_erase_32K_block(void *addr)
{
  const uint32_t cached_base = (uint32_t)addr & 0xFFFF8000;

  flash_begin();
  // invalidate the cached copy of the block while the write-enable command runs
  arm_dcache_delete((void *)cached_base, 32768);
  while ((FLEXSPI_INTR & FLEXSPI_INTR_IPCMDDONE) == 0) {
    // wait for flash_begin()'s command
  }
  FLEXSPI_INTR = FLEXSPI_INTR_IPCMDDONE;

  // sequence 6: Erase 32K
  FLEXSPI_IPCR0 = (uint32_t)addr & 0x00FF8000;
  FLEXSPI_IPCR1 = FLEXSPI_IPCR1_ISEQID(6);
  FLEXSPI_IPCMD = FLEXSPI_IPCMD_TRG;
  while ((FLEXSPI_INTR & FLEXSPI_INTR_IPCMDDONE) == 0) {
    // wait for the erase command to be issued
  }
  FLEXSPI_INTR = FLEXSPI_INTR_IPCMDDONE;

  flash_wait();
}

// Erase the 64K block that contains `addr` (the address is rounded down to a 64K
// boundary). Blocks until flash_wait() reports the erase finished.
void eepromemu_flash_erase_64K_block(void *addr)
{
  const uint32_t cached_base = (uint32_t)addr & 0xFFFF0000;

  flash_begin();
  // invalidate the cached copy of the block while the write-enable command runs
  arm_dcache_delete((void *)cached_base, 65536);
  while ((FLEXSPI_INTR & FLEXSPI_INTR_IPCMDDONE) == 0) {
    // wait for flash_begin()'s command
  }
  FLEXSPI_INTR = FLEXSPI_INTR_IPCMDDONE;

  // sequence 8: Erase 64K
  FLEXSPI_IPCR0 = (uint32_t)addr & 0x00FF0000;
  FLEXSPI_IPCR1 = FLEXSPI_IPCR1_ISEQID(8);
  FLEXSPI_IPCMD = FLEXSPI_IPCMD_TRG;
  while ((FLEXSPI_INTR & FLEXSPI_INTR_IPCMDDONE) == 0) {
    // wait for the erase command to be issued
  }
  FLEXSPI_INTR = FLEXSPI_INTR_IPCMDDONE;

  flash_wait();
}

And then we can make a simple test program that will access flash during programming, to test the suspend/resume commands:
Code:
#include <Arduino.h>
#include <EEPROM.h>

extern "C" {
// ideally this would be called by startup, soon after configure_cache() is done
void setup_flexspi();

// Installs the FlexSPI LUT by calling setup_flexspi().
// NOTE(review): presumably the core invokes this hook during startup - confirm.
void startup_middle_hook() {
  setup_flexspi();
}

// Test callback declared in the flash code: prints a message while an
// erase/program operation is in progress.
// NOTE(review): FLASHMEM and PSTR presumably place the code and the string in
// flash so that running this exercises a flash read during the operation - confirm.
FLASHMEM void nonblocking_flash_test(void) {
  puts(PSTR("flash erase/program is active"));
}
} // extern "C"

// Opens the serial port, waits until it is ready, then prints a start banner.
FLASHMEM void setup() {
  Serial.begin(0);
  while (!Serial);
  puts("Flash tester begin");
}

void loop() {
  static uint8_t eep_byte;

  delay(4000);
  auto old = EEPROM.read(eep_byte);
  printf("Writing eeprom index %u\n", eep_byte);
  EEPROM.write(eep_byte, ++old);
  if (EEPROM.read(eep_byte) != old) {
    puts("eeprom verification failed!");
    while(1);
  }
  ++eep_byte;
}
 
Any chance you could make a test case that shows this LittleFS slowness?
I have written an example that demonstrates the slowness:
Code:
#include <LittleFS.h>
#include <Entropy.h>
// NOTE: Entropy class defines "randomByte" and "randomWord" methods that aren't implemented!
// filesystem under test; setup() initializes it with begin(2 << 20) and formats it
LittleFS_Program progfs;
// time since the last write finished; loop() waits for it to reach 4000 before writing again
static elapsedMillis write_timer;
// Prepares the benchmark: waits for the serial port, initializes the entropy
// source, formats the filesystem and creates a "test" file of 2048 * 128 bytes.
// Halts forever (after printing a message) if the filesystem cannot be initialized
// or the test file cannot be created.
void setup() {
  Serial.begin(0);
  while (!Serial);
  Entropy.Initialize();
  if (!progfs.begin(2 << 20)) {
    Serial.println("Failed to initialize LittleFS");
    while(1);
  }
  progfs.format();
  Serial.println("FS formatted.");
  File f = progfs.open("test", FILE_WRITE);
  if (!f) {
    Serial.println("Failed to init testfile");
    // stop here instead of going on to use an unopened file (loop() does the same)
    while (1);
  }
  // size the file so that loop() can overwrite any of its 2048 slots of 128 bytes
  f.truncate(2048 * 128);
  f.close();
  Serial.println("Testfile initialized.");

  write_timer = 0;
}
void loop() {
  File f;
  uint8_t sector_buffer[128];
  // fill buffer with random data
  for (size_t i=0; i < sizeof(sector_buffer); i++) {
    sector_buffer[i] = Entropy.random(256);
  }
  // pick a random 128 byte sector to replace
  uint16_t sector = Entropy.random(2048);
  // wait until 4 seconds since last write finished
  while (write_timer < 4000);
  f = progfs.open("test", FILE_WRITE);
  if (!f) {
    Serial.println("Failed to open testfile");
    while (1);
  }
  Serial.print("Replacing sector ");
  Serial.print(sector);
  Serial.print("...");
  elapsedMicros measure;
  // set the position
  f.seek(sector * 128);
  // write
  f.write(sector_buffer, sizeof(sector_buffer));
  f.close();
  uint32_t usecs = measure;
  Serial.print("Done, duration: ");
  Serial.print(usecs);
  Serial.print(" usecs (");
  Serial.print(128 * 1000000 / usecs);
  Serial.println(" bytes/sec)");
 
  // reset update timer
  write_timer = 0;
}

Example output:
FS formatted.
Testfile initialized.
Replacing sector 1175...Done, duration: 274025 usecs (467 bytes/sec)
Replacing sector 18...Done, duration: 556007 usecs (230 bytes/sec)
Replacing sector 1152...Done, duration: 274986 usecs (465 bytes/sec)
Replacing sector 271...Done, duration: 548801 usecs (233 bytes/sec)
Replacing sector 1404...Done, duration: 265783 usecs (481 bytes/sec)
Replacing sector 409...Done, duration: 542935 usecs (235 bytes/sec)
Replacing sector 946...Done, duration: 402915 usecs (317 bytes/sec)
Replacing sector 302...Done, duration: 545897 usecs (234 bytes/sec)
Replacing sector 770...Done, duration: 407879 usecs (313 bytes/sec)
Replacing sector 214...Done, duration: 549767 usecs (232 bytes/sec)
Replacing sector 1207...Done, duration: 272842 usecs (469 bytes/sec)
Replacing sector 898...Done, duration: 403825 usecs (316 bytes/sec)
Replacing sector 623...Done, duration: 413808 usecs (309 bytes/sec)
Replacing sector 1056...Done, duration: 277788 usecs (460 bytes/sec)
The duration is very obviously affected by the position of the write (beginning of file = slower, end of file = faster).
I'm not sure the timing measurements are completely accurate either: micros() still relies on the systick interrupt updating systick_millis_count/systick_cycle_count, and interrupts are blocked during flash programming, so the real durations may actually be longer.
 
Last edited:
micros() is based on the LAST systick seen plus the ARM_DWT_CYCCNT cycles elapsed since then - as long as that systick isn't a whole (MILLI)-second away, it will properly count elapsed micros.

Quick rewrite just looks for 4 seconds of ARM_DWT_CYCCNT and gives similar results here:
edit: though I just noticed that the second timer (the elapsedMicros) was not altered
Code:
#include <LittleFS.h>
#include <Entropy.h>
// NOTE: Entropy class defines "randomByte" and "randomWord" methods that aren't implemented!
// filesystem under test; setup() initializes it with begin(2 << 20) and formats it
LittleFS_Program progfs;
// Four seconds expressed in CPU cycles, for comparison against ARM_DWT_CYCCNT deltas.
// Parenthesized so it is safe inside any expression, and computed as unsigned:
// a plain int 4 * F_CPU overflows once F_CPU exceeds about 536 MHz.
#define FOUR_SECS (4u * (uint32_t)(F_CPU))
// cycle-counter snapshot (takes the place of the earlier "static elapsedMillis write_timer")
static uint32_t write_timer;
// Prepares the benchmark: waits for the serial port, initializes the entropy
// source, formats the filesystem and creates a "test" file of 2048 * 128 bytes.
// Halts forever (after printing a message) if the filesystem cannot be initialized
// or the test file cannot be created.
void setup() {
  Serial.begin(0);
  while (!Serial);
  Entropy.Initialize();
  if (!progfs.begin(2 << 20)) {
    Serial.println("Failed to initialize LittleFS");
    while(1);
  }
  progfs.format();
  Serial.println("FS formatted.");
  File f = progfs.open("test", FILE_WRITE);
  if (!f) {
    Serial.println("Failed to init testfile");
    // stop here instead of going on to use an unopened file (loop() does the same)
    while (1);
  }
  // size the file so that loop() can overwrite any of its 2048 slots of 128 bytes
  f.truncate(2048 * 128);
  f.close();
  Serial.println("Testfile initialized.");
}
void loop() {
  File f;
  uint8_t sector_buffer[128];
  write_timer = ARM_DWT_CYCCNT;  // set update timer
  // fill buffer with random data
  for (size_t i=0; i < sizeof(sector_buffer); i++) {
    sector_buffer[i] = Entropy.random(256);
  }
  // pick a random 128 byte sector to replace
  uint16_t sector = Entropy.random(2048);
  // wait until 4 seconds since last write finished
  while (ARM_DWT_CYCCNT - write_timer < FOUR_SECS);
  f = progfs.open("test", FILE_WRITE);
  if (!f) {
    Serial.println("Failed to open testfile");
    while (1);
  }
  Serial.print("Replacing sector ");
  Serial.print(sector);
  Serial.print("...");
  elapsedMicros measure;
  // set the position
  f.seek(sector * 128);
  // write
  f.write(sector_buffer, sizeof(sector_buffer));
  f.close();
  uint32_t usecs = measure;
  Serial.print("Done, duration: ");
  Serial.print(usecs);
  Serial.print(" usecs (");
  Serial.print(128 * 1000000 / usecs);
  Serial.println(" bytes/sec)");
 }
 
Last edited:
as long as it isn't a whole second away it will properly count elapsed micros.
I think you mean millisecond, which is the period of the systick interrupt - and the flash procedures can easily disable interrupts for longer than that.
 
Suppose so - oops - lost track of units. As first written it would count ahead for missed milliseconds, but that went away with an edit for some issue that resulted in limiting it to 1 ms.
The way it should be done is that systick_isr (seen here) compares ARM_DWT_CYCCNT with CYCLES_PER_MILLISEC*systick_millis_count to work out what amount to add to systick_millis_count, instead of just adding 1. That way it wouldn't drop milliseconds if the systick interrupt was delayed for more than 1 millisecond (as long as it wasn't delayed long enough for ARM_DWT_CYCCNT to wrap past its old value).
 
I would argue that systick really doesn't need to be running at 1000Hz, 100Hz would have a lot less impact.
 
Back
Top