Forum Rule: Always post complete source code & details to reproduce any issue!
Results 1 to 6 of 6

Thread: What does my program need in order to use PSRAM on the Teensy 4.1?

  1. #1
    Junior Member
    Join Date
    Jan 2022
    Posts
    1

    What does my program need in order to use PSRAM on the Teensy 4.1?

    If I have a Teensy 4.1 with 2 extra PSRAM chips soldered on the back, will variables automatically be stored there when the built-in memory is full, or do I have to write code specifically to store data there? I know I can use static allocation with EXTMEM, but I would prefer not to mess with that and just use dynamic allocation if possible. This might be an obvious question, but I am a JavaScript person, so I've never had to deal with memory allocation or any low-level interaction with hardware.

  2. #2
    Member
    Join Date
    Nov 2021
    Location
    Germany
    Posts
    48
    Either statically allocate variables in EXTMEM, or statically allocate a buffer in EXTMEM and port any malloc implementation that works on a supplied buffer, so that you allocate memory with practically standard C/C++ calls, but inside this EXTMEM buffer.

    Be careful: EXTMEM is slow, especially if you access it randomly. If you access, for example, integers at odd addresses, you will get a real failure. In normal memory such an access still works, with a huge slowdown, but in EXTMEM it is really forbidden.

  3. #3
    Senior Member
    Join Date
    Apr 2014
    Location
    Germany
    Posts
    1,710
    ...statically allocate a buffer in EXTMEM and port any malloc realization that works with buffer...
    Teensyduino contains the smalloc library. You can use it to dynamically allocate memory from the PSRAM chips (EXTMEM)

    This:
    Code:
    #include "smalloc.h"
    //....

    // Ask smalloc for a block of 1024 * 1024 bytes (1 MiB) and keep it as a byte pointer.
    // NOTE(review): confirm that the pool sm_malloc() draws from is the PSRAM pool on
    // this core version, and check the result for null before use — TODO verify.
    byte* buf = static_cast<byte*>(sm_malloc(1024 * 1024));
    will dynamically allocate a 1MB buffer on the PSRAM chip.

  4. #4
    Member
    Join Date
    Nov 2021
    Location
    Germany
    Posts
    48
    Quote Originally Posted by luni View Post
    Teensyduino contains the smalloc library.
    super, it is even simpler.

  5. #5
    Senior Member+ defragster's Avatar
    Join Date
    Feb 2015
    Posts
    15,553
    There is a 32KB processor cache on the PSRAM dataspace as well, so speed for localized access can be fast.

    Also, IIRC this was tested in beta and there is no problem or limit on word-boundary access. Access sizes of 1, 2, or 4 bytes all work just fine AFAIK.

    As noted there is support for dynamic malloc access to PSRAM data space. And also compile time EXTMEM 'reservation' and allocation. Though there is no initialization done on PSRAM memory space. So it cannot be assigned data values at compile time.

  6. #6
    Member
    Join Date
    Nov 2021
    Location
    Germany
    Posts
    48
    Quote Originally Posted by defragster View Post
    Also IIRC tested in Beta and there is no problem/limit or problem on word boundary access.
    Please find below a simple test program that fails on boundary access.

    CrashReport:
    A problem occurred at (system time) 22:17:52
    Code was executing from address 0x108
    CFSR: 1000000
    (UNALIGNED) Unaligned access UsageFault
    Temperature inside the chip was 42.03 C
    Startup CPU clock speed is 600MHz
    Reboot was caused by auto reboot after fault or bad interrupt detected
    Code:
    #pragma GCC push_options
    #pragma GCC optimize ("Ofast")
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <math.h>
    #include <Arduino.h>
    
    
    // Test buffers, one per memory region exercised by loop().
    // TestData() uses p+shift and p+65536+shift as the two blocks, so each buffer
    // has to be larger than 2 * 64 KiB plus the largest shift.

    // 480 KiB buffer tagged DMAMEM.
    // NOTE(review): presumably placed in the second on-chip RAM bank — confirm against the core's linker script.
    unsigned char  DMAMEM DmaMem[480*1024];
    // 16 MiB buffer tagged EXTMEM (the external PSRAM discussed in this thread).
    unsigned char  EXTMEM ExtMem[16*1024*1024];
    // 300 KiB buffer with no placement attribute (default data placement).
    unsigned char         GenMem[300*1024];
    
    // Every test record class is intended to occupy 8 bytes, so that 8192 records
    // span one 64 KiB block.
    // cTestData1: eight single-byte fields.
    class cTestData1
    {
    public:
      unsigned char A1;
      unsigned char A2;
      unsigned char A3;
      unsigned char A4;
      unsigned char A5;
      unsigned char A6;
      unsigned char A7;
      unsigned char A8;
    };
    
    
    // cTestData2: four unsigned short fields.
    class cTestData2
    {
    public:
      unsigned short A1;
      unsigned short A2;
      unsigned short A3;
      unsigned short A4;
    };
    
    
    // cTestData3: four bit-fields of 24 + 20 + 13 + 7 = 64 bits.
    //
    // The fields are declared on a 64-bit base type so that all four share a single
    // storage unit and the class is exactly 8 bytes. With `unsigned int` as the base
    // type a bit-field cannot straddle a 32-bit unit under the GCC/ARM layout rules:
    // A2 (20 bits) does not fit after A1 (24 bits) and A3 (13 bits) does not fit
    // after A2, so the class grows to three units = 12 bytes. 8192 records would
    // then cover 96 KiB and run past the 64 KiB block the benchmark assumes.
    //
    // Note: the base type also raises the class's alignment requirement to that of
    // unsigned long long.
    class cTestData3
    { public: unsigned long long A1:24, A2:20, A3:13, A4:7;
    };
    static_assert(sizeof(cTestData3)==8, "cTestData3 must be exactly 8 bytes");
    
    
    // cTestData4: two unsigned int fields.
    class cTestData4
    {
    public:
      unsigned int A1;
      unsigned int A2;
    };
    
    
    void TestDataS(unsigned char *p1, unsigned char *p2) // copy data from one test memory block to other and measure the total amount of CPU tacts for 64K transfer
    { unsigned long begin, end;
      cTestData1 *pT1;
      cTestData2 *pT2;
      cTestData3 *pT3;
      cTestData4 *pT4;
      cTestData4 *pT41;
      unsigned int *pI1, *pI2;
      pT1=(cTestData1*)p1;
      pT2=(cTestData2*)p2;
      begin=ARM_DWT_CYCCNT;
      for(int i=0; i<8192; i++)
      { pT2[i].A1=pT1[i].A1;
        pT2[i].A2=pT1[i].A2;
        pT2[i].A3=pT1[i].A3;
        pT2[i].A4=pT1[i].A4+pT1[i].A5+pT1[i].A6+pT1[i].A7+pT1[i].A8;
      }
      end=ARM_DWT_CYCCNT;
      Serial.print(end-begin); Serial.print(" ");
    
      pT2=(cTestData2*)p2;
      pT3=(cTestData3*)p1;
      begin=ARM_DWT_CYCCNT;
      for(int i=0; i<8192; i++)
      { pT3[i].A1=pT2[i].A1;
        pT3[i].A2=pT2[i].A2;
        pT3[i].A3=pT2[i].A3;
        pT3[i].A4=pT2[i].A4;
      }
      end=ARM_DWT_CYCCNT;
      Serial.print(end-begin); Serial.print(" ");
    
      pT3=(cTestData3*)p1;
      pT4=(cTestData4*)p2;
      begin=ARM_DWT_CYCCNT;
      for(int i=0; i<8192; i++)
      { pT4[i].A1=pT3[i].A1;
        pT4[i].A2=pT3[i].A2+pT3[i].A3+pT3[i].A4;
      }
      end=ARM_DWT_CYCCNT;
      Serial.print(end-begin); Serial.print(" ");
    
      pT4=(cTestData4*)p2;
      pT1=(cTestData1*)p1;
      begin=ARM_DWT_CYCCNT;
      for(int i=0; i<8192; i++)
      { pT1[i].A1=pT4[i].A1;
        pT1[i].A2=pT4[i].A1;
        pT1[i].A3=pT4[i].A1;
        pT1[i].A4=pT4[i].A1;
    
        pT1[i].A5=pT4[i].A2;
        pT1[i].A6=pT4[i].A2;
        pT1[i].A7=pT4[i].A2;
        pT1[i].A8=pT4[i].A2;
      }
      end=ARM_DWT_CYCCNT;
      Serial.print(end-begin); Serial.print(" ");
    
      pT4=(cTestData4*)p1;
      pT41=(cTestData4*)p2;
      begin=ARM_DWT_CYCCNT;
      for(int i=0; i<8192; i++)
      { pT41[i].A1=pT4[i].A1;
        pT41[i].A2=pT4[i].A2;
      }
      end=ARM_DWT_CYCCNT;
    
      Serial.print(end-begin); Serial.print(" ");
    
      pI1=(unsigned int*)p2;
      pI2=(unsigned int*)p1;
      begin=ARM_DWT_CYCCNT;
      for(int i=0; i<16384; i++)
        pI2[i]=pI1[i];
      end=ARM_DWT_CYCCNT;
    
      Serial.println(end-begin);
    }
    
    
    // Byte offsets added to the block pointers, one benchmark run per entry.
    // Entries that are not multiples of 4 make the blocks start off a word boundary.
    int ShiftList1[] = {
      0, 4, 8, 12, 16,
      0, 0, 0,
      1, 1, 1, 1,
      2, 2, 2, 2,
      3, 3, 3, 3
    };
    // Same table except that entries 5..7 are 1 instead of 0.
    // NOTE(review): not referenced by any code visible in this listing.
    int ShiftList2[] = {
      0, 4, 8, 12, 16,
      1, 1, 1,
      1, 1, 1, 1,
      2, 2, 2, 2,
      3, 3, 3, 3
    };
    
    
    // Runs the copy benchmark on one memory region.
    //
    // For every entry of ShiftList1 the two 64 KiB blocks are taken at p+shift and
    // p+65536+shift; the addresses and offsets are printed, then TestDataS() is called
    // on them.
    //
    //   p - start of the region; must be large enough for 2 * 65536 bytes plus the
    //       largest shift.
    //
    // NOTE(review): both blocks use the offset from ShiftList1; ShiftList2 is defined
    // next to it but never used — confirm whether the second block was meant to use it.
    void TestData(unsigned char *p)
    { Serial.printf("Testing memory of %p\n", p);
      // size_t index: the element count is an unsigned sizeof expression, so an int
      // index would be a signed/unsigned comparison.
      const size_t count = sizeof(ShiftList1)/sizeof(ShiftList1[0]);
      for(size_t i=0; i<count; i++)
      { const int shift = ShiftList1[i];
        unsigned char *first = p+shift;
        unsigned char *second = p+65536+shift;
        Serial.printf("%p(%d) to %p(%d) started\n", first, shift, second, 65536+shift);
        TestDataS(first, second);
      }
    }
    
    
    // One-time start-up: spin until Serial evaluates true, then, only if CrashReport
    // evaluates true, print it and call delay(10000) so it can be read before the
    // tests start.
    void setup()
    {
      while (!Serial)
      {
      }

      if (!CrashReport)
        return;

      Serial.print(CrashReport);
      delay(10000);
    }
    
    
    // Repeatedly benchmarks the three buffers, in the order GenMem, DmaMem, ExtMem.
    void loop()
    {
      unsigned char *const regions[] = { GenMem, DmaMem, ExtMem };
      const unsigned int regionCount = sizeof(regions)/sizeof(regions[0]);
      for (unsigned int r = 0; r < regionCount; ++r)
        TestData(regions[r]);
    }
    
    #pragma GCC pop_options

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •