What does my program need in order to use PSRAM on the Teensy 4.1?

luke baja

New member
If I have a Teensy 4.1 with 2 extra PSRAM chips soldered on the back, will variables automatically be stored there when built in memory is full, or do I have to write code specifically to store data there? I know I can use static allocation with EXTMEM, but would prefer not to mess with that and just use dynamic allocation if possible. This might be an obvious question but I am a JavaScript person so I've never had to deal with memory allocation or any low level interaction with hardware.
 
Either statically allocate variables in EXTMEM, or statically allocate a buffer in EXTMEM and port any malloc implementation that can work on a caller-supplied buffer. That way you allocate memory more or less as in standard C/C++, but the memory ends up in EXTMEM.

Be careful: EXTMEM is slow, especially if you access it randomly. If you access, for example, integers at odd addresses, you will get a real failure. In normal memory that still works, with a huge slowdown, but in EXTMEM it is really forbidden.
 
...statically allocate a buffer in EXTMEM and port any malloc realization that works with buffer...

Teensyduino contains the smalloc library. You can use it to dynamically allocate memory from the PSRAM chips (EXTMEM)

This:
Code:
#include "smalloc.h"
//....

byte* buf = (byte*) sm_malloc(1024 * 1024);

will dynamically allocate a 1MB buffer on the PSRAM chip.
 
There is a 32KB processor cache on the PSRAM dataspace as well, so speed for localized access can be fast.

Also, IIRC this was tested during the beta and there is no problem or limit on word-boundary access. Accesses of 1, 2, or 4 bytes all work just fine AFAIK.

As noted there is support for dynamic malloc access to PSRAM data space. And also compile time EXTMEM 'reservation' and allocation. Though there is no initialization done on PSRAM memory space. So it cannot be assigned data values at compile time.
 
Also IIRC tested in Beta and there is no problem/limit or problem on word boundary access.

Please find below a simple test program that fails on unaligned (boundary) access.

CrashReport:
A problem occurred at (system time) 22:17:52
Code was executing from address 0x108
CFSR: 1000000
(UNALIGNED) Unaligned access UsageFault
Temperature inside the chip was 42.03 °C
Startup CPU clock speed is 600MHz
Reboot was caused by auto reboot after fault or bad interrupt detected

Code:
#pragma GCC push_options
#pragma GCC optimize ("Ofast")

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <Arduino.h>


// Test buffers, one per memory region that loop() passes to TestData().
// DmaMem is placed with the DMAMEM attribute (presumably the second on-chip RAM bank - confirm against the Teensy core).
unsigned char  DMAMEM DmaMem[480*1024];
// ExtMem is placed with the EXTMEM attribute, i.e. in the external PSRAM; 16 MB presumably requires both PSRAM chips to be fitted.
unsigned char  EXTMEM ExtMem[16*1024*1024];
// GenMem carries no placement attribute, so it goes wherever ordinary globals go.
unsigned char         GenMem[300*1024];

// each class is intended to be 8 bytes (NOTE: cTestData3's bit-fields may not pack into 8 bytes - check sizeof)
// Test record made of eight single-byte fields.
struct cTestData1
{
  unsigned char A1;
  unsigned char A2;
  unsigned char A3;
  unsigned char A4;
  unsigned char A5;
  unsigned char A6;
  unsigned char A7;
  unsigned char A8;
};


// Test record made of four 16-bit fields.
struct cTestData2
{
  unsigned short A1;
  unsigned short A2;
  unsigned short A3;
  unsigned short A4;
};


// Test record made of four bit-fields of 24, 20, 13 and 7 bits (64 bits of payload).
// NOTE(review): with unsigned int as the declared type a bit-field normally may not
// straddle a 32-bit storage unit, so this record probably occupies 12 bytes rather
// than 8 - confirm with sizeof(cTestData3).
struct cTestData3
{
  unsigned int A1 : 24;
  unsigned int A2 : 20;
  unsigned int A3 : 13;
  unsigned int A4 : 7;
};


// Test record made of two unsigned int fields.
struct cTestData4
{
  unsigned int A1;
  unsigned int A2;
};


// Benchmark: copies records between two memory windows and prints how many
// CPU cycles each copy loop took.
//
// p1, p2 - start addresses of the two windows; the pointers are reinterpreted
//          as arrays of cTestData1..4 records or of plain unsigned ints.
//
// Six loops run back to back. ARM_DWT_CYCCNT is sampled before and after each
// loop and the difference is printed; the six values share one output line.
// The first five loops process 8192 records, the last one 16384 unsigned ints.
//
// NOTE(review): the casts below are only valid when p1/p2 satisfy the alignment
// of the target types. TestData() also passes addresses offset by 1, 2 and 3
// bytes; dereferencing those is undefined behaviour in C++ and is presumably
// what ends in the reported UNALIGNED UsageFault (the optimizer may choose
// multi-word load/store instructions that cannot access unaligned addresses) -
// confirm in the disassembly.
// NOTE(review): cTestData3 probably occupies more than 8 bytes (see its
// bit-field widths), in which case the loops using pT3 walk past the 64K
// window that starts at p1 - verify with sizeof(cTestData3).
void TestDataS(unsigned char *p1, unsigned char *p2)
{ unsigned long begin, end;
  cTestData1 *pT1;
  cTestData2 *pT2;
  cTestData3 *pT3;
  cTestData4 *pT4;
  cTestData4 *pT41;
  unsigned int *pI1, *pI2;

  // Loop 1: byte fields at p1 -> 16-bit fields at p2 (A4..A8 are summed into A4).
  pT1=(cTestData1*)p1;
  pT2=(cTestData2*)p2;
  begin=ARM_DWT_CYCCNT;
  for(int i=0; i<8192; i++)
  { pT2[i].A1=pT1[i].A1;
    pT2[i].A2=pT1[i].A2;
    pT2[i].A3=pT1[i].A3;
    pT2[i].A4=pT1[i].A4+pT1[i].A5+pT1[i].A6+pT1[i].A7+pT1[i].A8;
  }
  end=ARM_DWT_CYCCNT;
  Serial.print(end-begin); Serial.print(" ");

  // Loop 2: 16-bit fields at p2 -> bit-fields at p1.
  pT2=(cTestData2*)p2;
  pT3=(cTestData3*)p1;
  begin=ARM_DWT_CYCCNT;
  for(int i=0; i<8192; i++)
  { pT3[i].A1=pT2[i].A1;
    pT3[i].A2=pT2[i].A2;
    pT3[i].A3=pT2[i].A3;
    pT3[i].A4=pT2[i].A4;
  }
  end=ARM_DWT_CYCCNT;
  Serial.print(end-begin); Serial.print(" ");

  // Loop 3: bit-fields at p1 -> two unsigned ints at p2 (A2..A4 are summed into A2).
  pT3=(cTestData3*)p1;
  pT4=(cTestData4*)p2;
  begin=ARM_DWT_CYCCNT;
  for(int i=0; i<8192; i++)
  { pT4[i].A1=pT3[i].A1;
    pT4[i].A2=pT3[i].A2+pT3[i].A3+pT3[i].A4;
  }
  end=ARM_DWT_CYCCNT;
  Serial.print(end-begin); Serial.print(" ");

  // Loop 4: two unsigned ints at p2 -> byte fields at p1 (each int is stored,
  // truncated to a byte, into four consecutive byte fields).
  pT4=(cTestData4*)p2;
  pT1=(cTestData1*)p1;
  begin=ARM_DWT_CYCCNT;
  for(int i=0; i<8192; i++)
  { pT1[i].A1=pT4[i].A1;
    pT1[i].A2=pT4[i].A1;
    pT1[i].A3=pT4[i].A1;
    pT1[i].A4=pT4[i].A1;

    pT1[i].A5=pT4[i].A2;
    pT1[i].A6=pT4[i].A2;
    pT1[i].A7=pT4[i].A2;
    pT1[i].A8=pT4[i].A2;
  }
  end=ARM_DWT_CYCCNT;
  Serial.print(end-begin); Serial.print(" ");

  // Loop 5: record-wise copy of two unsigned ints, p1 -> p2.
  pT4=(cTestData4*)p1;
  pT41=(cTestData4*)p2;
  begin=ARM_DWT_CYCCNT;
  for(int i=0; i<8192; i++)
  { pT41[i].A1=pT4[i].A1;
    pT41[i].A2=pT4[i].A2;
  }
  end=ARM_DWT_CYCCNT;

  Serial.print(end-begin); Serial.print(" ");

  // Loop 6: plain unsigned int copy, p2 -> p1.
  pI1=(unsigned int*)p2;
  pI2=(unsigned int*)p1;
  begin=ARM_DWT_CYCCNT;
  for(int i=0; i<16384; i++)
    pI2[i]=pI1[i];
  end=ARM_DWT_CYCCNT;

  // The last value terminates the output line.
  Serial.println(end-begin);
}


// Byte offsets that TestData() adds to the start of a memory region before each
// TestDataS() run. Multiples of 4 keep 32-bit accesses aligned; the entries
// 1, 2 and 3 misalign them.
int ShiftList1[] = {0, 4, 8, 12, 16, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
// NOTE(review): ShiftList2 is not referenced by any code shown here.
int ShiftList2[] = {0, 4, 8, 12, 16, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};


// Runs the TestDataS() benchmark on one memory region, once per entry of ShiftList1.
//
// p - start of the region under test. Each run uses the window at p + shift as
//     the first argument and the window at p + 65536 + shift as the second, so
//     the region has to be at least 128 KiB plus the largest shift long.
//
// The loop index is unsigned so that it is compared against the (unsigned)
// element count without a signed/unsigned mismatch.
void TestData(unsigned char *p)
{ const unsigned int shiftCount = sizeof(ShiftList1)/sizeof(ShiftList1[0]);
  Serial.printf("Testing memory of %p\n", p);
  for(unsigned int i=0; i<shiftCount; i++)
  { Serial.printf("%p(%d) to %p(%d) started\n", p+ShiftList1[i], ShiftList1[i], p+65536+ShiftList1[i], 65536+ShiftList1[i]);
    TestDataS(p+ShiftList1[i], p+65536+ShiftList1[i]);
  }
}


// Arduino start-up hook: waits for the serial port, then reports a stored
// crash record if there is one.
void setup()
{
  // Spin until the Serial object evaluates to true.
  while (!Serial)
  {
  }

  if (CrashReport)
  {
    Serial.print(CrashReport);
    // Pause for 10 s after printing the report.
    delay(10000);
  }
}


// Arduino main loop: runs the benchmark on the three buffers, in the order
// ordinary RAM, DMAMEM buffer, EXTMEM buffer.
// NOTE(review): because the tests are started from loop(), they start again
// after setup() has printed a CrashReport following a fault-triggered reboot.
void loop()
{
  TestData(GenMem);
  TestData(DmaMem);
  TestData(ExtMem);
}

#pragma GCC pop_options
 
@was-ja : sorry just seeing you replied to this.

Attempting to pinpoint which instruction was causing the fault it was modified as below.
NOTE: The code below initially tested on a T_4.1 with only 8MB PSRAM, running on a second with 16MB works the same with the edit
See: unsigned char EXTMEM ExtMem[16 * 1024 * 1024]; versus unsigned char EXTMEM ExtMem[8 * 1024 * 1024];

There are TWO BOLD RED LINES - one is active - the other can be swapped in using TeensyDuino 1.57 and with either of them active the code runs to "DONE!" completion.
NOTE: other commented lines in that area of code were first attempt to locate the error line : all then commented out to find the problematic one that 'hides/removes' the error.

The alternate code is where it faults, right after printing " A ", when the #if 1 is changed to #if 0.

Using the altered #if 1 adding that code between instructions eliminates the fault. The "if ( i == 0 )" can be any number in the range that doesn't get optimized away.

This leaves open the question as to EXACTLY why the code as written faults.

It acts the same with or without the top and bottom #pragma changes

Code:
#pragma GCC push_options
#pragma GCC optimize ("Ofast")

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <Arduino.h>


// Test buffers, one per memory region that setup() passes to TestData().
// DmaMem is placed with the DMAMEM attribute (presumably the second on-chip RAM bank - confirm against the Teensy core).
unsigned char  DMAMEM DmaMem[480 * 1024];
// ExtMem is placed with the EXTMEM attribute, i.e. in the external PSRAM; sized to 8 MB here so it fits a board with a single PSRAM chip.
unsigned char  EXTMEM ExtMem[8 * 1024 * 1024];
// GenMem carries no placement attribute, so it goes wherever ordinary globals go.
unsigned char         GenMem[300 * 1024];

// each class is intended to be 8 bytes (NOTE: cTestData3's bit-fields may not pack into 8 bytes - check sizeof)
// Test record made of eight single-byte fields.
struct cTestData1
{
  unsigned char A1;
  unsigned char A2;
  unsigned char A3;
  unsigned char A4;
  unsigned char A5;
  unsigned char A6;
  unsigned char A7;
  unsigned char A8;
};


// Test record made of four 16-bit fields.
struct cTestData2
{
  unsigned short A1;
  unsigned short A2;
  unsigned short A3;
  unsigned short A4;
};


// Test record made of four bit-fields of 24, 20, 13 and 7 bits (64 bits of payload).
// NOTE(review): with unsigned int as the declared type a bit-field normally may not
// straddle a 32-bit storage unit, so this record probably occupies 12 bytes rather
// than 8 - confirm with sizeof(cTestData3).
struct cTestData3
{
  unsigned int A1 : 24;
  unsigned int A2 : 20;
  unsigned int A3 : 13;
  unsigned int A4 : 7;
};


// Test record made of two unsigned int fields.
struct cTestData4
{
  unsigned int A1;
  unsigned int A2;
};


// Benchmark: copies records between two memory windows and prints how many
// CPU cycles each copy loop took.
//
// p1, p2 - start addresses of the two windows; the pointers are reinterpreted
//          as arrays of cTestData1..4 records or of plain unsigned ints.
//
// Six loops run back to back. ARM_DWT_CYCCNT is sampled before and after each
// loop and the difference is printed, followed by a marker letter (A..E) and a
// flush so that the output already sent identifies the loop that faulted.
//
// The statement order inside the loops is kept exactly as posted: the fault
// under investigation appears or disappears depending on the generated code.
// The only code change versus the posted text is the removal of forum markup
// around the two highlighted lines, which was not valid C++.
//
// NOTE(review): the casts below are only valid when p1/p2 satisfy the alignment
// of the target types. TestData() also passes addresses offset by 1, 2 and 3
// bytes; dereferencing those is undefined behaviour in C++ and is presumably
// what ends in the reported UNALIGNED UsageFault (the optimizer may choose
// multi-word load/store instructions that cannot access unaligned addresses) -
// confirm in the disassembly.
// NOTE(review): cTestData3 probably occupies more than 8 bytes (see its
// bit-field widths), in which case the loops using pT3 walk past the 64K
// window that starts at p1 - verify with sizeof(cTestData3).
void TestDataS(unsigned char *p1, unsigned char *p2)
{ unsigned long begin, end;
  cTestData1 *pT1;
  cTestData2 *pT2;
  cTestData3 *pT3;
  cTestData4 *pT4;
  cTestData4 *pT41;
  unsigned int *pI1, *pI2;

  // Loop A: byte fields at p1 -> 16-bit fields at p2 (A4..A8 are summed into A4).
  pT1 = (cTestData1*)p1;
  pT2 = (cTestData2*)p2;
  begin = ARM_DWT_CYCCNT;
  for (int i = 0; i < 8192; i++)
  { pT2[i].A1 = pT1[i].A1;
    pT2[i].A2 = pT1[i].A2;
    pT2[i].A3 = pT1[i].A3;
    pT2[i].A4 = pT1[i].A4 + pT1[i].A5 + pT1[i].A6 + pT1[i].A7 + pT1[i].A8;
  }
  end = ARM_DWT_CYCCNT;
  Serial.print(end - begin); Serial.print(" A "); Serial.flush();

  // Loop B: 16-bit fields at p2 -> bit-fields at p1. This is the loop in which
  // the fault was observed; "#if 1" selects the variant with the workaround,
  // "#if 0" the loop as originally written.
  pT2 = (cTestData2*)p2;
  pT3 = (cTestData3*)p1;
  begin = ARM_DWT_CYCCNT;
#if 1
  for (int i = 0; i < 8192; i++)
  { 
//    CrashReport.breadcrumb( 1, i );
//    CrashReport.breadcrumb( 2, 1 );
    pT3[i].A1 = pT2[i].A1;
    // Highlighted line 1 of 2 in the original post (alternative workaround, kept disabled; REQ TD_1.57):
//    CrashReport.breadcrumb( 2, 2 ); // Adding this line alone allows completion without error : REQ TD_1.57
    // Highlighted line 2 of 2 in the original post (the workaround that is active):
    if ( i == 0 ) Serial.print("\t A2 \t"); // OR: Adding this line alone allows completion without error
    pT3[i].A2 = pT2[i].A2;
//    CrashReport.breadcrumb( 2, 3 );
    pT3[i].A3 = pT2[i].A3;
//    CrashReport.breadcrumb( 2, 4 );
    pT3[i].A4 = pT2[i].A4;
//    CrashReport.breadcrumb( 2, 5 );
  }
#else 
  for (int i = 0; i < 8192; i++)
  { 
    pT3[i].A1 = pT2[i].A1;
    pT3[i].A2 = pT2[i].A2;
    pT3[i].A3 = pT2[i].A3;
    pT3[i].A4 = pT2[i].A4;
  }
#endif
  end = ARM_DWT_CYCCNT;
  Serial.print(end - begin); Serial.print(" B "); Serial.flush();

  // Loop C: bit-fields at p1 -> two unsigned ints at p2 (A2..A4 are summed into A2).
  pT3 = (cTestData3*)p1;
  pT4 = (cTestData4*)p2;
  begin = ARM_DWT_CYCCNT;
  for (int i = 0; i < 8192; i++)
  { pT4[i].A1 = pT3[i].A1;
    pT4[i].A2 = pT3[i].A2 + pT3[i].A3 + pT3[i].A4;
  }
  end = ARM_DWT_CYCCNT;
  Serial.print(end - begin); Serial.print(" C "); Serial.flush();

  // Loop D: two unsigned ints at p2 -> byte fields at p1 (each int is stored,
  // truncated to a byte, into four consecutive byte fields).
  pT4 = (cTestData4*)p2;
  pT1 = (cTestData1*)p1;
  begin = ARM_DWT_CYCCNT;
  for (int i = 0; i < 8192; i++)
  { pT1[i].A1 = pT4[i].A1;
    pT1[i].A2 = pT4[i].A1;
    pT1[i].A3 = pT4[i].A1;
    pT1[i].A4 = pT4[i].A1;

    pT1[i].A5 = pT4[i].A2;
    pT1[i].A6 = pT4[i].A2;
    pT1[i].A7 = pT4[i].A2;
    pT1[i].A8 = pT4[i].A2;
  }
  end = ARM_DWT_CYCCNT;
  Serial.print(end - begin); Serial.print(" D "); Serial.flush();

  // Loop E: record-wise copy of two unsigned ints, p1 -> p2.
  pT4 = (cTestData4*)p1;
  pT41 = (cTestData4*)p2;
  begin = ARM_DWT_CYCCNT;
  for (int i = 0; i < 8192; i++)
  { pT41[i].A1 = pT4[i].A1;
    pT41[i].A2 = pT4[i].A2;
  }
  end = ARM_DWT_CYCCNT;

  Serial.print(end - begin); Serial.print(" E "); Serial.flush();

  // Final loop: plain unsigned int copy, p2 -> p1; its count ends the output line.
  pI1 = (unsigned int*)p2;
  pI2 = (unsigned int*)p1;
  begin = ARM_DWT_CYCCNT;
  for (int i = 0; i < 16384; i++)
    pI2[i] = pI1[i];
  end = ARM_DWT_CYCCNT;

  Serial.println(end - begin); Serial.flush();
  Serial.println(" DONE! "); Serial.flush();
}


// Byte offsets that TestData() adds to the start of a memory region before each
// TestDataS() run. Multiples of 4 keep 32-bit accesses aligned; the entries
// 1, 2 and 3 misalign them.
int ShiftList1[] = {0, 4, 8, 12, 16, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
// NOTE(review): ShiftList2 is not referenced by any code shown here.
int ShiftList2[] = {0, 4, 8, 12, 16, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};


// Runs the TestDataS() benchmark on one memory region, once per entry of ShiftList1.
//
// p - start of the region under test. Each run uses the window at p + shift as
//     the first argument and the window at p + 65536 + shift as the second.
//
// Every message is flushed right away so that the serial log is complete up to
// the point of a fault.
void TestData(unsigned char *p)
{
  const uint32_t shiftCount = sizeof(ShiftList1) / sizeof(ShiftList1[0]);

  Serial.printf("Testing memory of %p\n", p);
  Serial.flush();

  // Debug aid from the original post: start idx at 8 to skip the aligned cases.
  for (uint32_t idx = 0; idx < shiftCount; idx++)
  {
    const int shift = ShiftList1[idx];
    unsigned char *firstWindow = p + shift;
    unsigned char *secondWindow = p + 65536 + shift;

    Serial.printf("#i=%u %p(%d) to %p(%d) started\n", idx, firstWindow, shift, secondWindow, 65536 + shift);
    Serial.flush();
    TestDataS(firstWindow, secondWindow);
  }
}


// Arduino start-up hook. Opens the serial port and prints a build banner, then
// either reports a stored crash record or runs the benchmark once on each of
// the three buffers. Tests are skipped whenever a crash record is present.
void setup()
{
  Serial.begin(115200);
  // Spin until the Serial object evaluates to true.
  while (!Serial)
  {
  }
  Serial.println("\n" __FILE__ " " __DATE__ " " __TIME__);
  Serial.flush();

  if (CrashReport)
  {
    Serial.print(CrashReport);
    delay(10000);
    return;
  }

  TestData(GenMem);
  TestData(DmaMem);
  TestData(ExtMem);
}


// Arduino main loop: nothing to do here, setup() already ran the tests once.
void loop()
{
}

#pragma GCC pop_options


NO ERROR Result when run above with line edit to: #if 1
NOTE: ALL the CYCLE COUNTS take MUCH LONGER
Code:
...
#i=8 0x70000001(1) to 0x70010001(65537) started
3788753 A 	 A2 	5106783 B 4550928 C 3834864 D 3904465 E 3804917
 DONE! 
...

Result when run above with line edit to: #if 0
Code:
C:\T_Drive\tCode\T4\PSRAMfault\PSRAMfault.ino Aug  3 2022 03:01:29
Testing memory of 0x20002064
#i=0 0x20002064(0) to 0x20012064(65536) started
98517 A 155834 B 82113 C 98485 D 32786 E 32799
 DONE! 
#i=1 0x20002068(4) to 0x20012068(65540) started
98543 A 155827 B 82109 C 98505 D 32784 E 32796
 DONE! 
#i=2 0x2000206c(8) to 0x2001206c(65544) started
98517 A 155862 B 82110 C 98485 D 32786 E 32799
 DONE! 
#i=3 0x20002070(12) to 0x20012070(65548) started
98519 A 155865 B 82114 C 98484 D 32784 E 32797
 DONE! 
#i=4 0x20002074(16) to 0x20012074(65552) started
98517 A 155835 B 82141 C 98485 D 32786 E 32799
 DONE! 
#i=5 0x20002064(0) to 0x20012064(65536) started
98515 A 155832 B 82110 C 98515 D 32786 E 32799
 DONE! 
#i=6 0x20002064(0) to 0x20012064(65536) started
98517 A 155831 B 82114 C 98485 D 32786 E 32829
 DONE! 
#i=7 0x20002064(0) to 0x20012064(65536) started
98517 A 155836 B 82110 C 98485 D 32786 E 32799
 DONE! 
#i=8 0x20002065(1) to 0x20012065(65537) started
123127 A 
C:\T_Drive\tCode\T4\PSRAMfault\PSRAMfault.ino Aug  3 2022 03:01:29
CrashReport:
  A problem occurred at (system time) 3:2:13
  Code was executing from address 0x10C
  CFSR: 1000000
	(UNALIGNED) Unaligned access UsageFault
  Temperature inside the chip was 44.38 °C
  Startup CPU clock speed is 600MHz
  Reboot was caused by auto reboot after fault or bad interrupt detected
 
Back
Top