BUG in arm_dcache_delete

Looked like Kurt's would work to catch the beginning wrapping backwards for Delete. Delete not used often, but it is done 'inline' so smaller is better.

Also @Mike - re: Interesting note in p#45 - note the pointer address values in the example? They are not given pointers inline with source code lines.
> important to see where the items are stored to know it is showing what it seems to be showing.
> the 32B boundary shifts under those allocs - so the overlap/underlap will not affect those ptr sets the same
> also somewhere under 32KB alloc the HAB forces data on restart
>> Might be best to manually pick the pointers from fixed 32KB offset, rather than letting the build pick them
--> that's how I came to the 'ugly' but effective use case for LittleFS DMAMEM survival

And for ref - this 2nd line should have buffer2 not buffer:
Code:
  Serial.printf("\n-------------  Kurt's first complete function in post 46 -----------------\n");
  Serial.printf("%x %x %x\n", (uint32_t)&ptr5, (uint32_t)[B][U]buffer[/U][/B], (uint32_t)&ptr6);
 
Paul's comment is pertinent to understanding the context:

"Normally arm_dcache_delete() is used before receiving data via DMA or from bus-master peripherals which write to memory. You want to delete anything the cache may have stored, so your next read is certain to access the physical memory."

I expect it also creates more consistent timing.
 
@defragster
Made a few corrections to the sketch:
Code:
/**
  \brief   Invalidate a D-Cache region (kept under the name SCB_CleanDCache_Region)
  \details Writes the address of every 32-byte cache line that overlaps
           [addr, addr + dsize) to SCB_CACHE_DCIMVAC. Does nothing when dsize is 0.
  \note    Despite "Clean" in the name, the register written is DCIMVAC
           (invalidate by address), not DCCMVAC (clean by address): cached
           data in the touched lines is dropped, not written back to RAM.
  \param[in]   addr    address
  \param[in]   dsize   size of memory block (in number of bytes)
*/

__attribute__((always_inline, unused))
static inline void SCB_CleanDCache_Region(void *addr, uint32_t dsize)
{
  //#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
    if ( dsize > 0 ) {
      const uint32_t linesize = 32; /* in Cortex-M7 size of cache line is fixed to 8 words (32 bytes) */
      /* Round the start down to its cache line and grow the byte count by the
         same amount, so a partially covered first line is included. */
      uint32_t op_addr = ((uint32_t)addr) & ~(linesize - 1);
      int32_t  op_size = (int32_t)(dsize + (((uint32_t)addr) & (linesize - 1)));

        asm("dsb");

      /* One write per cache line; the last iteration covers a partial tail line. */
      do {
        SCB_CACHE_DCIMVAC  = op_addr;
        op_addr +=          linesize;
        op_size -= (int32_t)linesize;
      } while ( op_size > 0 );

        asm("dsb");
        asm("isb");
    }
  //#endif
}

// Kurt's variant of arm_dcache_delete(): delete [addr, addr + size) from the
// data cache, but first write back (DCCMVAC) the first and/or last cache line
// when the range does not start/end on a 32-byte boundary, so data sharing
// those lines is stored to memory before the lines are invalidated.
// NOTE: the do/while invalidates at least one line even when size is 0.
__attribute__((always_inline, unused))
static inline void larm_dcache_delete(void *addr, uint32_t size)
{
    uint32_t location = (uint32_t)addr & 0xFFFFFFE0;   // start rounded down to a cache line
    uint32_t end_addr = (uint32_t)addr + size;         // one past the last byte
    asm volatile("": : :"memory");
    asm("dsb");
    if (location != (uint32_t)addr) SCB_CACHE_DCCMVAC = location;  // make sure it is flushed first if unaligned
    if (end_addr & 0x1f) SCB_CACHE_DCCMVAC = (end_addr & 0xFFFFFFE0);  // make sure flush if end unaligned
    do {
      SCB_CACHE_DCIMVAC = location;
      location += 32;
    } while (location < end_addr);
    asm("dsb");
    asm("isb");
}

// Frank's variant of arm_dcache_delete(): the start is rounded UP and the end
// is rounded DOWN to 32-byte boundaries, so only cache lines that lie
// completely inside [addr, addr + size) are invalidated. Partially covered
// lines at either end are left untouched.
__attribute__((always_inline, unused))
static inline void farm_dcache_delete(void *addr, uint32_t size)
{
    const uintptr_t first = ((uintptr_t)addr + 31) & 0xFFFFFFE0;
    const uintptr_t limit = ((uintptr_t)addr + size) & 0xFFFFFFE0;

    asm volatile("": : :"memory");
    asm("dsb");
    for (uintptr_t line = first; line < limit; line += 32) {
        SCB_CACHE_DCIMVAC = line;
    }
    asm("dsb");
    asm("isb");
}

const int ledPin = 13;  // pin toggled by loop()

// One 100-byte buffer per test case, each declared between a pair of pointer
// variables. setup() prints the addresses of the pointers and the buffer,
// runs one cache-delete variant on the buffer, then prints the pointer
// values again to show whether they were disturbed.
// The declaration order below is part of the experiment - do not reorder.
DMAMEM uint8_t *ptr1;
DMAMEM uint8_t buffer[100];   // used with arm_dcache_delete
DMAMEM uint8_t *ptr2;
DMAMEM uint8_t *ptr3;
DMAMEM uint8_t buffer1[100];  // used with SCB_CleanDCache_Region
DMAMEM uint8_t *ptr4;
DMAMEM uint8_t *ptr5;
DMAMEM uint8_t buffer2[100];  // used with larm_dcache_delete
DMAMEM uint8_t *ptr6;
DMAMEM uint8_t *ptr7;
DMAMEM uint8_t buffer3[100];  // used with farm_dcache_delete
DMAMEM uint8_t *ptr8;

void setup() {
  // initialize the digital pin as an output.
  pinMode(ledPin, OUTPUT);

  while(!Serial);
  Serial.begin(115200);
  
  Serial.printf("\n-------------  arm_dcache_delete -----------------\n");
  Serial.printf("%x %x %x\n", (uint32_t)&ptr1, (uint32_t)buffer, (uint32_t)&ptr2);
  ptr1 = buffer;
  ptr2 = buffer;
  //arm_dcache_flush(buffer, sizeof(buffer));
  Serial.printf("%x %x\n", (uint32_t)ptr1, (uint32_t)ptr2);
  arm_dcache_delete(buffer, sizeof(buffer));
  Serial.printf("%x %x\n", (uint32_t)ptr1, (uint32_t)ptr2);

  Serial.printf("\n-------------  SCB_CleanDCache_Region -----------------\n");
  Serial.printf("%x %x %x\n", (uint32_t)&ptr3, (uint32_t)buffer1, (uint32_t)&ptr4);
  ptr3 = buffer1;
  ptr4 = buffer1;
  Serial.printf("%x %x\n", (uint32_t)ptr3, (uint32_t)ptr4);
  SCB_CleanDCache_Region(buffer1, sizeof(buffer1));
  Serial.printf("%x %x\n", (uint32_t)ptr1, (uint32_t)ptr2);

  Serial.printf("\n-------------  Kurt's first complete function in post 46 -----------------\n");
  Serial.printf("%x %x %x\n", (uint32_t)&ptr5, (uint32_t)buffer2, (uint32_t)&ptr6);
  ptr5 = buffer2;
  ptr6 = buffer2;
  Serial.printf("%x %x\n", (uint32_t)ptr5, (uint32_t)ptr6);
  larm_dcache_delete(buffer2, sizeof(buffer2));
  Serial.printf("%x %x\n", (uint32_t)ptr5, (uint32_t)ptr6);


  Serial.printf("\n-------------  Franks arm_dcache_delete in post 30-----------------\n");
  Serial.printf("%x %x %x\n", (uint32_t)&ptr7, (uint32_t)buffer, (uint32_t)&ptr8);
  ptr7 = buffer3;
  ptr8 = buffer3;
  //arm_dcache_flush(buffer, sizeof(buffer));
  Serial.printf("%x %x\n", (uint32_t)ptr7, (uint32_t)ptr8);
  farm_dcache_delete(buffer3, sizeof(buffer3));
  Serial.printf("%x %x\n", (uint32_t)ptr7, (uint32_t)ptr8);

  
}

// the loop() methor runs over and over again,
// as long as the board has power

void loop() {
  digitalWrite(ledPin, HIGH);   // set the LED on
  delay(1000);                  // wait for a second
  digitalWrite(ledPin, LOW);    // set the LED off
  delay(1000);                  // wait for a second
}

results the same:
Code:
-------------  arm_dcache_delete -----------------
20200064 20200000 20200068
20200000 20200000
9f237e51 8688cc5

-------------  SCB_CleanDCache_Region -----------------
202000d0 2020006c 202000d4
2020006c 2020006c
9f237e51 8688cc5

-------------  Kurt's first complete function in post 46 -----------------
2020013c 202000d8 20200140
202000d8 202000d8
202000d8 202000d8

-------------  Franks arm_dcache_delete in post 30-----------------
20200144 20200000 20200148
2020014c 2020014c
2020014c 2020014c
Running them individually as I did in the first test case pretty much showed the same results - Kurt's method worked as well as Franks.

EDIT: Ok I am done for the night getting late in NY now, at least for me.
 
@defragster
Made a few corrections to the sketch:

...
Running them individually as I did in the first test case pretty much showed the same results - Kurt's method worked as well as Franks.

EDIT: Ok I am done for the night getting late in NY now, at least for me.

It looked like the ptr 5 & 6 jumped after buffer2 - but wasn't clear for sure what &buffer2 was - also where it stacked up in the 32 byte offset series.

Another recent tool might give a clearer read on the end result. Here is the example - not including the alternates in recent examples at this time.

MemoryHexDump gives a clear view of memory changes across a range. And ptrD - jumping the 32KB and also allows for 'known' unchanging results where offsets can be calculated in known 32 byte offsets for parallel tests.

Code:
#include <MemoryHexDump.h>  // https://github.com/KurtE/MemoryHexDump
// Fixed, hand-picked address of the test region (not placed by the build).
uint8_t *ptrD = (uint8_t *)0x20208000;

void setup() {
  Serial.begin(115200);
  while(!Serial);
  if ( CrashReport) Serial.print(CrashReport);
  Serial.printf("\nptrD== %x\n", (uint32_t)ptrD);
  for (int ii=0; ii<2048; ii++ ) ptrD[ii]=0x1b;
  MemoryHexDump(Serial, ptrD , 2048, true, "*** Just in Cache ***\n", 20);

  arm_dcache_delete(&ptrD[10], 100);
  MemoryHexDump(Serial, ptrD , 2048, true, "*** Cache delete 10,100 ***\n", 20);
}
void loop()
{
  // Intentionally empty: the whole test runs once in setup().
}

Shows this:
Code:
ptrD== 20208000
*** Just in Cache ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 126 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
*** Cache delete 10,100 ***
20208000 - 1D 5A 4C 2B 18 AE B8 CA  87 D2 9A CF 17 E6 99 84  : .ZL+.... ........
20208010 - B7 18 84 01 8D 09 21 41  A8 3C 38 8F 32 8A 0E B1  : ......!A .<8.2...
20208020 - 81 31 33 13 A2 93 2A EA  3D 26 13 58 B3 EA F5 8E  : .13...*. =&.X....
20208030 - 0C 84 44 FB 20 1B E3 BF  20 C7 A4 85 9E 85 AB 09  : ..D. ...  .......
20208040 - CE 18 23 C3 87 3F 5B 36  66 9D DB C3 A5 85 E3 9F  : ..#..?[6 f.......
20208050 - 33 86 BC 4F 26 CD 51 CC  6F 96 D8 FC 95 39 84 4A  : 3..O&.Q. o....9.J
20208060 - B4 39 E0 C1 62 18 87 3A  A0 CD F2 AB 76 91 B0 BF  : .9..b..: ....v...
20208070 - 1F 8C 4C 36 A1 0A 40 BF  B1 B0 CE 49 DE AF CE A9  : ..L6..@. ...I....
20208080 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 118 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........

<note>: could have picked better numbers than 10,100, like 36,50 ... but that was first pass before dinner

Using :: arm_dcache_delete(&ptrD[36], 50);
Code:
*** Cache delete 36,50 ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208010 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208020 - 81 31 33 13 A2 93 2A EA  3D 26 13 58 B3 EA F5 8E  : .13...*. =&.X....
20208030 - 0C 84 44 FB 20 1B E3 BF  20 C7 A4 85 9E 85 AB 09  : ..D. ...  .......
20208040 - CE 18 23 C3 87 3F 5B 36  66 9D DB C3 A5 85 E3 9F  : ..#..?[6 f.......
20208050 - 33 86 BC 4F 26 CD 51 CC  6F 96 D8 FC 95 39 84 4A  : 3..O&.Q. o....9.J
20208060 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 120 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
 
@mjs513 sounds like some more fun reading.

I wanted to next test doing a DMA operation to that memory to see what makes it through. But so far I am not having luck with DMA memory to memory...
Code:
#include <DMAChannel.h>
DMAChannel dma;              // channel used for the memory-to-memory copy in setup()
// dmaBuffer is declared between two pointers; setup() prints the pointer
// values before and after the cache delete on the buffer.
DMAMEM char *ptr1;
DMAMEM char dmaBuffer[100];  // DMA destination
DMAMEM char *ptr2;

char low_buffer[100];        // DMA source (declared without DMAMEM)

// Deletes dmaBuffer from the data cache with the stock arm_dcache_delete(),
// then copies low_buffer into dmaBuffer by DMA and prints both buffers.
void setup() {
  while (!Serial);
  Serial.begin(115200);
  // Addresses of the two guard pointers and of the buffer between them.
  Serial.printf("%x %x %x\n", (uint32_t)&ptr1, (uint32_t)dmaBuffer, (uint32_t)&ptr2);
  ptr1 = dmaBuffer;
  ptr2 = dmaBuffer;
  // Pointer values before and after the cache delete; a difference means
  // the delete also affected the pointers next to the buffer.
  Serial.printf("%x %x\n", (uint32_t)ptr1, (uint32_t)ptr2);
  arm_dcache_delete(dmaBuffer, sizeof(dmaBuffer));
  Serial.printf("%x %x\n", (uint32_t)ptr1, (uint32_t)ptr2);

  // 98 characters plus the terminating NUL: fits in the 100-byte source buffer.
  strcpy(low_buffer, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz0123456789");
  dma.sourceBuffer((uint8_t*)low_buffer, sizeof(low_buffer));
  dma.destinationBuffer((uint8_t*)dmaBuffer, sizeof(dmaBuffer));
  dma.transferCount(sizeof(dmaBuffer));
  dma.disableOnCompletion();
  dma.enable();
  dma.clearComplete();
  dma.triggerContinuously();
  // NOTE(review): fixed wait; completion of the transfer is assumed, not checked.
  delay(100);
  Serial.println((char*)low_buffer);
  Serial.println((char*)dmaBuffer);
}
void loop()
{
  // Intentionally empty: the whole test runs once in setup().
}
Probably something obvious... One thing I noticed is the triggerContinuously() is not implemented... Other ways people do this?

Kurt

Follow on to this:
I tried a first pass at implementing triggerContinuously()...
Code:
	void triggerContinuously(void) {
		// TODO: update this for IMXRT.  On Kinetis, a small handful
		// of DMAMUX slots were dedicated to "always on".  On IMXRT,
		// all of them can work as "always on" by setting their
		// DMAMUX_CHCFG_A_ON bit.
		volatile uint32_t *mux = &DMAMUX_CHCFG0 + channel;
		//mux = (volatile uint32_t *)&(DMAMUX_CHCFG0) + channel;
		*mux = 0;
		*mux = DMAMUX_CHCFG_A_ON | DMAMUX_CHCFG_ENBL;
	}
Does this look right?

Note: Without this update, when I printed out the two lines, only the first character looked like it had been transferred.

With this change it looks like: most of it...
Code:
20200064 20200000 20200068
20200000 20200000
9f5a8ac6 2133d8f3
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz0123456789
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz01234567[COLOR="#FF0000"]ANb�ƊZ���3![/COLOR]
But the end is garbage...
Like the last 3 bytes 8 9 and NULL.

This is still using the arm_dcache_delete that is currently in the build.

Edit: I am assuming it would transfer the 100 bytes in 100ms... Could/should test to see if completed bit is set...

Edit2: Trying to keep it simple until I get the simple cases to work.
 
Follow on: using my later version (fixing the wrong ; on while statement)
Code:
#include <DMAChannel.h>
DMAChannel dma;              // channel used for the memory-to-memory copy in setup()
// dmaBuffer is declared between two pointers; setup() prints the pointer
// values before and after the cache delete on the buffer.
DMAMEM char *ptr1;
DMAMEM char dmaBuffer[100];  // DMA destination
DMAMEM char *ptr2;

char low_buffer[100];        // DMA source (declared without DMAMEM)

// Delete [addr, addr + size) from the data cache without discarding data that
// merely shares a 32-byte cache line with the range:
//  - a first or last line that is only partly covered is written back to
//    memory and deleted (DCCIMVAC),
//  - lines lying completely inside the range are deleted only (DCIMVAC).
__attribute__((always_inline, unused))
static inline void safer_arm_dcache_delete(void *addr, uint32_t size)
{
  uint32_t location = (uint32_t)addr & 0xFFFFFFE0;  // start rounded down to a cache line
  uint32_t end_addr = (uint32_t)addr + size;        // one past the last byte
  asm volatile("": : :"memory");
  asm("dsb");
  if (location != (uint32_t)addr) {
    SCB_CACHE_DCCIMVAC = location;  // make sure it is flushed and delete it
    location += 32;  // don't process this one again.
  }
  if (end_addr & 0x1f) {
    end_addr &= 0xFFFFFFE0;
    // The partial last line still needs handling unless it is the very line
    // already processed above (then end_addr == location - 32).
    // ">=" rather than ">": when end_addr == location the line has not been
    // touched yet (aligned start with a short range, or an end line directly
    // after the partial first line) and the loop below will not reach it.
    if (end_addr >= location) SCB_CACHE_DCCIMVAC = end_addr;  // flush and delete it
  }
  while (location < end_addr)
  {
    SCB_CACHE_DCIMVAC = location;
    location += 32;
  }
  asm("dsb");
  asm("isb");
}
// Same DMA test as before, but dmaBuffer is deleted from the data cache with
// safer_arm_dcache_delete() instead of the stock arm_dcache_delete().
void setup() {
  while (!Serial);
  Serial.begin(115200);
  if (CrashReport) {
    Serial.print(CrashReport);
  }

  // Addresses of the two guard pointers and of the buffer between them.
  Serial.printf("%x %x %x\n", (uint32_t)&ptr1, (uint32_t)dmaBuffer, (uint32_t)&ptr2);
  ptr1 = dmaBuffer;
  ptr2 = dmaBuffer;
  // Pointer values before and after the cache delete; a difference means
  // the delete also affected the pointers next to the buffer.
  Serial.printf("%x %x\n", (uint32_t)ptr1, (uint32_t)ptr2);
  //arm_dcache_delete(dmaBuffer, sizeof(dmaBuffer));
  safer_arm_dcache_delete(dmaBuffer, sizeof(dmaBuffer));
  Serial.printf("%x %x\n", (uint32_t)ptr1, (uint32_t)ptr2);

  // 98 characters plus the terminating NUL: fits in the 100-byte source buffer.
  strcpy(low_buffer, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz0123456789");
  dma.sourceBuffer((uint8_t*)low_buffer, sizeof(low_buffer));
  dma.destinationBuffer((uint8_t*)dmaBuffer, sizeof(dmaBuffer));
  dma.transferCount(sizeof(dmaBuffer));
  dma.disableOnCompletion();
  dma.enable();
  dma.clearComplete();
  dma.triggerContinuously();
  // NOTE(review): fixed wait; completion of the transfer is assumed, not checked.
  delay(100);
  Serial.println((char*)low_buffer);
  Serial.println((char*)dmaBuffer);
}
void loop()
{
  // Intentionally empty: the whole test runs once in setup().
}

Output looks correct:
Code:
20200064 20200000 20200068
20200000 20200000
20200000 20200000
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz0123456789
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz0123456789
So the pointers at the end were not mucked with and the DMA transfer happened and we got the right bytes... So cache deleted...
 
Working on sketch using the above, just updated with Kurt's Safer:

IGNORE THIS POST : example improved in following post
 
Last edited:
NEW better more orthogonal test case with Kurt's 'Safer'.

Changed test to FLUSH known "0x2D" to RAM - then FILL cache with "0x1B" and re-use the same memory space so the addresses align and are easier to read.

Each tested in sequence with the same PARAMS - then repeat the PJRC current arm_dcache_delete() to show it does the right thing when the others fail.

Results in turn as commented: >> output Presented in clearer fashion below ... code here updated
Code:
ptrD== 20208000

*** Just in Cache ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 126 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........

[B]*** ARM Cache delete 36,50 ***[/B]
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
[B]20208010 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........  // EXPECTED RESULTS
20208020 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
...	 2 duplicate line(s) removed.
20208050 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208060 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
[/B]...	 120 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
Code:
*** Just in Cache ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 126 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........

[B]*** SCB_CleanDCache_Region 36,50 ***[/B]
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
[B]20208010 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........  // EXPECTED RESULTS
20208020 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
...	 2 duplicate line(s) removed.
20208050 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208060 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
[/B]...	 120 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
Code:
*** Just in Cache ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 126 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........

[B]*** Kurt's safer_arm_dcache_delete 36,50 ***[/B]
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 126 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
Code:
*** Just in Cache ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 126 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........

[B]*** Frank's farm_dcache_delete 36,50 ***[/B]
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 126 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
Code:
*** Just in Cache ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 126 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........

[B]*** ARM Cache delete 36,50 ***[/B]
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
[B]20208010 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........  // EXPECTED RESULTS
20208020 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
...	 2 duplicate line(s) removed.
20208050 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208060 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........[/B]
...	 120 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........


AND the code for review: { UPDATED to match p#59 output }
Code:
#include <MemoryHexDump.h>  // https://github.com/KurtE/MemoryHexDump
// Fixed, hand-picked address of the test region (not placed by the build).
uint8_t *ptrD = (uint8_t *)0x20208000;

/**
  \brief   D-Cache maintenance over a region, one write per cache line
  \details Writes the address of every 32-byte cache line overlapping
           [addr, addr + dsize) to SCB_CACHE_DCIMVAC (the invalidate-by-address
           register, despite "Clean" in this function's name). A dsize of 0
           is a no-op.
  \param[in]   addr    address
  \param[in]   dsize   size of memory block (in number of bytes)
*/

__attribute__((always_inline, unused))
static inline void SCB_CleanDCache_Region(void *addr, uint32_t dsize)
{
  if (dsize == 0) {
    return;
  }

  /* Cortex-M7 cache lines are fixed at 8 words (32 bytes). */
  const uint32_t line_bytes = 32;
  const uint32_t start      = (uint32_t)addr;
  const uint32_t lead       = start & (line_bytes - 1);   /* offset of addr inside its line */

  uint32_t line      = start & ~(line_bytes - 1);
  int32_t  remaining = (int32_t)(dsize + lead);

  asm("dsb");

  do {
    SCB_CACHE_DCIMVAC = line;
    line      += line_bytes;
    remaining -= (int32_t)line_bytes;
  } while (remaining > 0);

  asm("dsb");
  asm("isb");
}

// Delete [addr, addr + size) from the data cache without discarding data that
// merely shares a 32-byte cache line with the range:
//  - a first or last line that is only partly covered is written back to
//    memory and deleted (DCCIMVAC),
//  - lines lying completely inside the range are deleted only (DCIMVAC).
__attribute__((always_inline, unused))
static inline void safer_arm_dcache_delete(void *addr, uint32_t size)
{
  uint32_t location = (uint32_t)addr & 0xFFFFFFE0;  // start rounded down to a cache line
  uint32_t end_addr = (uint32_t)addr + size;        // one past the last byte
  asm volatile("": : :"memory");
  asm("dsb");
  if (location != (uint32_t)addr) {
    SCB_CACHE_DCCIMVAC = location;  // make sure it is flushed and delete it
    location += 32;  // don't process this one again.
  }
  if (end_addr & 0x1f) {
    end_addr &= 0xFFFFFFE0;
    // The partial last line still needs handling unless it is the very line
    // already processed above (then end_addr == location - 32).
    // ">=" rather than ">": when end_addr == location the line has not been
    // touched yet (aligned start with a short range, or an end line directly
    // after the partial first line) and the loop below will not reach it.
    if (end_addr >= location) SCB_CACHE_DCCIMVAC = end_addr;  // flush and delete it
  }
  while (location < end_addr)
  {
    SCB_CACHE_DCIMVAC = location;
    location += 32;
  }
  asm("dsb");
  asm("isb");
}

// Frank's variant of arm_dcache_delete(): the start is rounded UP and the end
// is rounded DOWN to 32-byte boundaries, so only cache lines that lie
// completely inside [addr, addr + size) are invalidated. Partially covered
// lines at either end are left untouched.
__attribute__((always_inline, unused))
static inline void farm_dcache_delete(void *addr, uint32_t size)
{
  const uintptr_t first = ((uintptr_t)addr + 31) & 0xFFFFFFE0;
  const uintptr_t limit = ((uintptr_t)addr + size) & 0xFFFFFFE0;

  asm volatile("": : :"memory");
  asm("dsb");
  for (uintptr_t line = first; line < limit; line += 32) {
    SCB_CACHE_DCIMVAC = line;
  }
  asm("dsb");
  asm("isb");
}

// Puts the 2048-byte test region into a known state before each test:
// 0x2d is written and flushed, then 0x1b is written without a flush, and
// the region is dumped.
void ramCacheReset() {
  const int kRegionBytes = 2048;
  int idx;

  for (idx = 0; idx != kRegionBytes; ++idx) {
    ptrD[idx] = 0x2d;                     // value that gets flushed out
  }
  arm_dcache_flush(ptrD, 2048);           // PUT KNOWN VALUES IN RAM

  for (idx = 0; idx != kRegionBytes; ++idx) {
    ptrD[idx] = 0x1b;                     // newer value, written but not flushed
  }
  MemoryHexDump(Serial, ptrD , 2048, true, "\n*** Just in Cache ***\n", 20);
}


void setup() {
  pinMode(LED_BUILTIN, OUTPUT);
  Serial.begin(115200);
  while (!Serial);
  if ( CrashReport) Serial.print(CrashReport);
  Serial.printf("\nptrD== %x\n", (uint32_t)ptrD);

  ramCacheReset();
  arm_dcache_delete(&ptrD[36], 74);
  MemoryHexDump(Serial, ptrD , 160, false, "\n*** ARM Cache delete 36,74 ***\n");

  ramCacheReset();
  SCB_CleanDCache_Region(&ptrD[36], 74);
  MemoryHexDump(Serial, ptrD , 160, false, "\n*** SCB_CleanDCache_Region 36,74 ***\n");

  ramCacheReset();
  safer_arm_dcache_delete(&ptrD[36], 74);
  MemoryHexDump(Serial, ptrD , 160, false, "\n*** Kurt's safer_arm_dcache_delete 36,74 ***\n");

  ramCacheReset();
  farm_dcache_delete(&ptrD[36], 74);
  MemoryHexDump(Serial, ptrD , 160, false, "\n*** Frank's farm_dcache_delete 36,74 ***\n");

  ramCacheReset();
  arm_dcache_delete(&ptrD[36], 74);
  MemoryHexDump(Serial, ptrD , 160, false, "\n*** ARM Cache delete 36,74 ***\n");
}

void loop() {
  digitalWrite(LED_BUILTIN, HIGH);   // set the LED on
  delay(1000);                  // wait for a second
  digitalWrite(LED_BUILTIN, LOW);    // set the LED off
  delay(1000);                  // wait for a second
}
 
Last edited:
Edit p#58 to NOT REMOVE dupes and limit the output to the area at hand.
>> CODE above p#58 updated

Changed to fixed 160 byte dump:: MemoryHexDump(Serial, ptrD , 160, false, "\n*** ARM Cache delete 36,74 ***\n");

Looks like they are both doing a FLUSH on PRE and POST 32 byte regions as intended.
** Lines are of course 16 bytes.

Code:
ptrD== 20208000

*** Just in Cache ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 126 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........

*** ARM Cache delete 36,74 ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208010 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208020 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208030 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208040 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208050 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208060 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208070 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208080 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208090 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
Code:
*** Just in Cache ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 126 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........

*** SCB_CleanDCache_Region 36,74 ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208010 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208020 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208030 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208040 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208050 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208060 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208070 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208080 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208090 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
Code:
*** Just in Cache ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 126 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........

[B]*** Kurt's safer_arm_dcache_delete 36,74 ***[/B]    [U]// DESIRED BEHAVIOR[/U]
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208010 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208020 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208030 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208040 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208050 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208060 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208070 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208080 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208090 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
Code:
*** Just in Cache ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 126 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........

[B]*** Frank's farm_dcache_delete 36,74 ***[/B]    [U]// DESIRED BEHAVIOR[/U]
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208010 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208020 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208030 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208040 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208050 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208060 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208070 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208080 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208090 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
Code:
*** Just in Cache ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
...	 126 duplicate line(s) removed.
202087F0 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........

*** ARM Cache delete 36,74 ***
20208000 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208010 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208020 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208030 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208040 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208050 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208060 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208070 - 2D 2D 2D 2D 2D 2D 2D 2D  2D 2D 2D 2D 2D 2D 2D 2D  : -------- --------
20208080 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
20208090 - 1B 1B 1B 1B 1B 1B 1B 1B  1B 1B 1B 1B 1B 1B 1B 1B  : ........ ........
 
Last edited:
Good morning all including(@PaulStoffregen, @Frank B, @defragster, @mjs513.....)

Paul, at this point, I personally believe there is enough stuff here for you to wade through when you get time. So until then I think I will go back to playing with some other projects, like
the MTP integration.

My quick summary of my takes of it up till now include:

1) Frank B - You found a real bug and potentially a reasonable fix.

2) Paul - There are at least a few ways you can address this:
a) Just say no (document) - say if your data does not start at 32 byte boundary or the count is not a multiple of 32, don't use this
b) Put simple test at start of this method to reinforce this...
c) Use one of the possible fixes, including Franks, or the simple test first/last areas...

3) Not sure how many people this issue impacts? As there is almost 0 documentation on it. The product pages I don't believe mention it... So unclear to me what the impact would be to
fix or not fix... That is if current people understand it is dangerous and setup to deal with it and slowing this down would be detrimental... Or if others don't realize this and have been
scratching their head trying to figure out why something is not working the way they think it is supposed to.

4) DMA operations from memory to memory are fickle! I put in what I think is a valid implementation for: dma.triggerContinuously();
Currently it does nothing... And sometimes it works, sometimes it does not.

That is, in the test case I was playing with, I put the DMAMEM stuff into a struct, such that I (not the compiler) decided where the items went...
Code:
// Buffer and both pointers grouped into one struct, so the member order
// (not the build) decides where they sit relative to each other.
typedef struct {
  char dmaBuffer[100];  // first member: starts at the beginning of the struct
  char *ptr1;
  char *ptr2;
} data_both_ends_t;
DMAChannel dma;

DMAMEM data_both_ends_t dmam;
So in the code when I think I am setting up the DMA operation...
Code:
  strcpy(low_buffer, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz0123456789");
  dma.sourceBuffer((uint8_t*)low_buffer, sizeof(low_buffer));
  dma.destinationBuffer((uint8_t*)dmam.dmaBuffer, sizeof(dmam.dmaBuffer));
  dma.transferCount(sizeof(dmam.dmaBuffer));
  dma.disableOnCompletion();
  dma.enable();
  dma.clearComplete();
  dma.triggerContinuously();
  delay(100);
This appears to mostly work to transfer the data...

However if I change the structure to be:
Code:
// Variant layout: a pointer is placed in front of the buffer, so dmaBuffer
// no longer starts at the beginning of the struct.
typedef struct {
  char *ptr1;
  char dmaBuffer[100];
  char *ptr2;
} data_both_ends_t;
Then the DMA operation appears to fail and nothing is transferred. So it wants the destination buffer to be 32 byte aligned...

So again Paul, let me know after wading through all of this, if you need any additional information.

Now back to playing!
 
@Paul, @all : Not sure where this fits in the list of p#60 choices:

> Document the ARM Cache delete() 32 byte boundary behavior on the 1062.
> Fix malloc() code to flush() internal linked list data structures when they are changed in DMAMEM
-> this could also help with 'warm restart' behavior for use or debug analysis
> Document DMAMEM cache behavior. It is a recurring issue since first 1062 Beta when QSPI was perceived to be working, thanks to cache support.
 
Sorry about the delay on this. This week I've been slowly going through the many messages on the first 1.55 beta thread. Just now got to msg #365 on that thread, which points here. I'm still about 9 days behind, but catching up. Seems a lot can happen in 9 days. :(

FWIW, I also saw this thread several days ago. At the time I only saved a bookmark to read it later, because I was still working through the huge 1.55-beta1 thread...

About this specific issue, for the upcoming 1.55 release I'm going to "fix" this with better documentation in the code comments.

https://github.com/PaulStoffregen/cores/commit/c9638be5059c5a65164890b001f0354a515403d5

https://github.com/PaulStoffregen/cores/commit/7798d24cb6d7dcadbddb4bceee04f55f96318299

Code:
// Delete data from the cache, without touching memory
//
// WARNING: This function is DANGEROUS!!  The address must be
// 32 byte aligned and the size must be a multiple of 32 bytes.
//
// DO NOT USE this function with arbitrarily aligned data,
// especially pointers from malloc() or C++ new.  The ARM cache
// can only delete with granularity of 32 byte cache rows.  If
// you attempt to delete improperly aligned data, any other
// cached variables shared within the same 32 byte cache row(s)
// will become collateral damage!
//
// If you wish to assure some variable or array or other data
// is not cached, use arm_dcache_flush_delete().  This
// arm_dcache_delete() should only be used for very special
// cases like DMA buffers or hardware testing & benchmarks.
//
// See this forum thread for more detail:
// https://forum.pjrc.com/threads/68100-BUG-in-arm_dcache_delete
//
// Normally arm_dcache_delete() is used before receiving data via
// DMA or from bus-master peripherals which write to memory.  You
// want to delete anything the cache may have stored, so your next
// read is certain to access the physical memory.


To specifically answer this "why not just fix the bug" question:

Of course it works with aligned data.
The problem is unaligned data.
Malloc returns unaligned data.
Yes you can do workarounds in your code.
3rd party code does not have Teensy workarounds.

So why not just fix the bug?

The reason why is the Cortex-M7 cache hardware works in 32 byte rows. The hardware simply does not support deleting data from the cache with finer granularity. The logical AND within the code isn't a bug, it's simply a reflection of the reality regarding how the hardware actually works.

Regarding code like shown in msg #9, I would say the real bug is a matter of choosing which cache function to call. Assuming the intention was to make sure the freshly allocated memory wasn't in the cache, arm_dcache_flush_delete() should have been used. Hopefully these new comments in imxrt.h will help avoid this problem in the future.
 
4) DMA operations from memory to memory are fickle! I put in what I think a valid implementation for: dma.triggerContinuously();
Currently it does nothing... And sometimes it works, sometimes it does not.

I'm having a deja vu moment. Was this also mentioned on another thread?

(please understand I'm still going through the two 1.55 beta threads thoroughly in chronological order, after having skimmed them only slightly while the messages were posted as I was working on MacOS & Windows 7 support and all 3 OS-specific code to avoid infinite loops of re-detecting unworkable locked chips - which still mostly lock up the GUI, but I will use this to fix those cases in beta3)
 
Just to chime in a last time:

I doubt you read this thread or understood the problem.
To mention (again...) the other issue of the existing code: Call it with a (somewhere calculated) length of 0.
Every other (non - arduino) code does nothing with length = 0.

By the way, 0 is a multiple of 32.
Would be so easy to fix by changing the loop, no extra code needed.

I'd really like to hear a comprehensible reason why this behavior is intended.
 
Honestly, I never even thought about anyone passing zero length to those functions.

Can probably just add a check for zero. They're inline functions and the length is almost always a constant in practical use cases, so the compiler should just optimize it away.
 
I always figured it was deliberate, as a change to a while loop would fix it:

Edit: sorry tablet typing before:

Meant to say change:
Code:
// Delete (invalidate) the data cache rows covering [addr, addr + size)
// by writing each row address to SCB_CACHE_DCIMVAC.
//
//   addr  start of the region; rounded DOWN to a 32 byte boundary
//   size  length of the region in bytes
//
// Because the start is rounded down and the loop runs until location
// reaches end_addr, rows only partly covered by the region are
// invalidated in full.
//
// NOTE: this is a do/while loop, so the body always executes at least
// once.  Even with size == 0, one 32 byte row (the one containing addr)
// is written to SCB_CACHE_DCIMVAC.
__attribute__((always_inline, unused))
static inline void arm_dcache_delete(void *addr, uint32_t size)
{
	// Clear the low 5 bits: start of the 32 byte row containing addr.
	uint32_t location = (uint32_t)addr & 0xFFFFFFE0;
	// First byte past the region (not rounded).
	uint32_t end_addr = (uint32_t)addr + size;
	// Empty asm with a "memory" clobber: compiler-level barrier only.
	asm volatile("": : :"memory");
	// Barrier instruction before the cache maintenance writes.
	asm("dsb");
	do {
		SCB_CACHE_DCIMVAC = location;
		location += 32;	// advance one cache row
	} while (location < end_addr);
	// Barrier instructions after the cache maintenance writes.
	asm("dsb");
	asm("isb");
}
to:
Code:
// Delete (invalidate) the data cache rows covering [addr, addr + size)
// by writing each row address to SCB_CACHE_DCIMVAC.
//
//   addr  start of the region; rounded DOWN to a 32 byte boundary
//   size  length of the region in bytes
//
// The condition is tested before the first iteration, so when addr is
// 32 byte aligned and size == 0 (location == end_addr) no row is
// touched.
//
// NOTE: if addr is NOT 32 byte aligned, the rounded-down location is
// still below end_addr even for size == 0, so one row is invalidated
// in that case.
__attribute__((always_inline, unused))
static inline void arm_dcache_delete(void *addr, uint32_t size)
{
	// Clear the low 5 bits: start of the 32 byte row containing addr.
	uint32_t location = (uint32_t)addr & 0xFFFFFFE0;
	// First byte past the region (not rounded).
	uint32_t end_addr = (uint32_t)addr + size;
	// Empty asm with a "memory" clobber: compiler-level barrier only.
	asm volatile("": : :"memory");
	// Barrier instruction before the cache maintenance writes.
	asm("dsb");
	while (location < end_addr) {
		SCB_CACHE_DCIMVAC = location;
		location += 32;	// advance one cache row
	}
	// Barrier instructions after the cache maintenance writes.
	asm("dsb");
	asm("isb");
}
 
Last edited:
Changing the while loop would be consistent with the CMSIS-CORE implementation of SCB_InvalidateDCache_by_Addr(). But IMO, a ">0 multiple of 32" comment is OK.
 
Ok,

that was some time ago now. It doesn't look like anything will change in this core function.

For those who want to use this to tell the cache that data is no longer needed: this can speed up _other_ parts of the program drastically, because otherwise unnecessary data occupies the small cache and more important things may get slower access.
I recommend to use this replacement.

It also has the advantage that it doesn't mess up when size = 0.

Code:
/*
 * Invalidate only the 32 byte cache rows that lie entirely inside
 * [addr, addr + size).
 *
 *   addr  start of the region; the first row used is addr rounded UP
 *         to a 32 byte boundary
 *   size  length of the region in bytes; the end is rounded DOWN to a
 *         32 byte boundary
 *
 * Rows only partly covered at either end are left alone.  When the
 * rounded start is not below the rounded end (for example size == 0),
 * no row is written at all.
 */
__attribute__((always_inline, unused))
static inline void better_arm_dcache_delete(void *addr, uint32_t size)
{
    const uintptr_t base = (uintptr_t)addr;
    /* Exclusive upper bound: region end rounded down to a row boundary. */
    const uintptr_t stop = (base + size) & 0xFFFFFFE0;
    uintptr_t row;

    /* Barrier instruction, also acting as a compiler memory barrier. */
    asm volatile("dsb": : :"memory");
    /* Start at the first row boundary at or above addr. */
    for (row = (base + 31) & 0xFFFFFFE0; row < stop; row += 32) {
        SCB_CACHE_DCIMVAC = row;
    }
    asm("dsb");
    asm("isb");
}

Note: if the region is not aligned, it may delete up to 2 cache lines fewer. So you don't have to align your memory (unless that is needed for other things). IF it is aligned (which is always the case with DMA), it works as before and is not slower.
I have NO idea what the advantage is to keep a buggy version.
That was it from me here so far.

Have fun,
Frank
 
Last edited:
Back
Top