teensy 3.0 memory to memory DMA -- help

Status
Not open for further replies.

manitou

Senior Member+
Well, I thought I'd try to get a DMA sketch working for teensy 3.0. I looked at the I2S DMA example and chapter 21 of the ARM manual -- but all those registers are overwhelming. I have developed similar memory to memory DMA sketches on Maple and DUE. Here is my non-working attempt on teensy

Code:
/**********************************************************************
 teensy  only 16KB RAM
 use DMA for memcpy memset  ch 21  pg 387
 I2S example uses DMA_MUX ??
 */

// Number of 32-bit words in each test buffer.
#define WORDS 1000

// Source and destination buffers used by the copy experiments.
uint32_t src[WORDS];
uint32_t dst[WORDS];

// Disabled buffer declarations (presumably left over from the I2S DMA example).
//int32_t DMAMEM _dma_Buffer_A[WORDS];
//int32_t DMAMEM _dma_Buffer_B[WORDS];

// Completion flag: set to 1 by dma_ch0_isr and read in setup().
// volatile because it is written from an interrupt handler.
volatile int DMAdone = 0;

// Bit definitions for the DMA registers used by this sketch.
// Function-like macros parenthesize their argument so that an expression
// argument (e.g. `a | b`) is masked as a whole instead of being split by
// operator precedence.

// CINT - DMA Clear Interrupt Request Register
#define DMA_CINT_CINT(n) ((uint8_t)((n) & 3)<<0) // Clear Interrupt Request (channel number, 0-3)
#define DMA_CINT_CAIR ((uint8_t)1<<6) // Clear All Interrupt Requests
#define DMA_CINT_NOP ((uint8_t)1<<7) // NOP

// CITER / BITER - current and beginning major loop iteration counts
#define DMA_CITER_MASK ((uint16_t)0x7FFF) // Loop count mask
#define DMA_CITER_ELINK ((uint16_t)1<<15) // Enable channel linking on minor-loop complete
#define DMA_BITER_MASK ((uint16_t)0x7FFF) // Loop count mask
#define DMA_BITER_ELINK ((uint16_t)1<<15) // Enable channel linking on minor-loop complete

// ERQ - DMA Enable Request Register
#define DMA_ERQ_ERQ0 ((uint32_t)1<<0) // Enable DMA Request 0
#define DMA_ERQ_ERQ1 ((uint32_t)1<<1) // Enable DMA Request 1
#define DMA_ERQ_ERQ2 ((uint32_t)1<<2) // Enable DMA Request 2
#define DMA_ERQ_ERQ3 ((uint32_t)1<<3) // Enable DMA Request 3

// SERQ - DMA Set Enable Request Register
#define DMA_SERQ_SERQ(n) ((uint8_t)((n) & 3)<<0) // Set Enable Request (channel number, 0-3)
#define DMA_SERQ_SAER ((uint8_t)1<<6) // Set All Enable Requests
#define DMA_SERQ_NOP ((uint8_t)1<<7) // NOP

// CR - DMA Control Register
#define DMA_CR_EMLM ((uint32_t)0x80) // Enable Minor Loop Mapping

// Interrupt handler for DMA channel 0 (its IRQ is enabled in dma_init).
// Records completion in DMAdone, then acknowledges the channel 0
// interrupt request so the handler is not re-entered for the same event.
void dma_ch0_isr(void)
{
  DMAdone=1; // completion flag checked by setup()
  DMA_CINT = DMA_CINT_CINT(0); // use the Clear Intr. Request register
}

// Copy src[] to dst[] (WORDS 32-bit words) with one software-started
// transfer on DMA channel 0. Completion is signalled through the channel 0
// interrupt, handled by dma_ch0_isr.
//
// A software start services exactly one minor loop. Describing the copy as
// WORDS major iterations of a 4-byte minor loop and starting it once moves
// only the first word and never reaches major-loop completion. The whole
// buffer is therefore described as a single minor loop (NBYTES = WORDS*4)
// with both iteration counts set to 1, so a single START moves everything
// and raises the completion interrupt.
void dma_init() {
    // Enable IRQ on the DMA channel 0
    // NVIC_ENABLE_IRQ(IRQ_DMA_ERROR);
    NVIC_ENABLE_IRQ(IRQ_DMA_CH0);

    // Control register: all options off. Minor loop mapping stays disabled
    // because the byte count below is written through the NBYTES_MLNO view.
    DMA_CR = 0;

    // fill the TCD regs
    DMA_TCD0_SADDR = (const volatile void *) src; // Source buffer
    DMA_TCD0_SOFF = 4; // advance the source address 4 bytes after each read
    DMA_TCD0_ATTR = DMA_TCD_ATTR_SMOD(0) // No source modulo
                            | DMA_TCD_ATTR_SSIZE(DMA_TCD_ATTR_SIZE_32BIT)
                            | DMA_TCD_ATTR_DMOD(0) // No destination modulo
                            | DMA_TCD_ATTR_DSIZE(DMA_TCD_ATTR_SIZE_32BIT);
    DMA_TCD0_NBYTES_MLNO = WORDS * 4; // the entire buffer in one minor loop
    DMA_TCD0_SLAST = 0; // no adjustment: the TCD is rewritten before each new start
    DMA_TCD0_DADDR = (volatile void *) dst; // Destination
    DMA_TCD0_DOFF = 4; // advance the destination address 4 bytes after each write
    DMA_TCD0_DLASTSGA = 0; // No scatter/gather
    DMA_TCD0_CITER_ELINKNO = 1; // a single major loop iteration
    DMA_TCD0_BITER_ELINKNO = 1; // must match CITER

    // Hardware requests (DMA_ERQ / DMA_SERQ) are intentionally left disabled:
    // no request source is routed to this channel, the transfer is started
    // by software only.

    // Write CSR last: interrupt on major loop completion, and set START to
    // launch the transfer.
    DMA_TCD0_CSR = DMA_TCD_CSR_INTMAJOR | DMA_TCD_CSR_START;
}

// Prepares the buffers, launches the DMA copy via dma_init(), waits one
// second, then reports the completion flag, one destination word and the
// channel 0 CSR register over Serial.
void setup(){
	Serial.begin(9600);
	while(!Serial) {
		// spin until Serial evaluates true
	}

	// dst starts zeroed, src holds its own index in every slot
	for (int k = 0; k < WORDS; k++) {
		dst[k] = 0;
		src[k] = k;
	}

	dma_init();
	delay(1000);

	if (DMAdone) {
		Serial.println("done");
	}
	Serial.println(dst[4]);
	Serial.println(DMA_TCD0_CSR, HEX);
}

// Times several ways of copying and filling the WORDS-word buffers, prints
// each duration (difference of two micros() readings) over Serial, then
// waits 3 seconds before the next round.
void loop(){
	int k, started, elapsed;

	// reset the buffers: dst zeroed, src[k] == k
	for (k = 0; k < WORDS; k++) {
		dst[k] = 0;
		src[k] = k;
	}
#if 0
	// disabled: memcpy32 / memset32 timings
	memcpy32(dst, src, WORDS);
	Serial.println(dst[3], DEC);
	memset32(dst, 45, WORDS);
	Serial.println(dst[3], DEC);

	started = micros();
	memcpy32(dst, src, WORDS);
	elapsed = micros() - started;
	Serial.print("memcpy32 ");
	Serial.println(elapsed, DEC);

	started = micros();
	memset32(dst, 66, WORDS);
	elapsed = micros() - started;
	Serial.print("memset32 ");
	Serial.println(elapsed, DEC);
#endif

	// word-by-word copy loop
	started = micros();
	for (k = 0; k < WORDS; k++) {
		dst[k] = src[k];
	}
	elapsed = micros() - started;
	Serial.print("loop ");
	Serial.println(elapsed, DEC);

	// library memcpy; dst[3] is overwritten first so the copy is visible
	dst[3] = 99;
	started = micros();
	memcpy(dst, src, 4*WORDS);
	elapsed = micros() - started;
	Serial.print("memcpy ");
	Serial.println(elapsed, DEC);
	Serial.println(dst[3], DEC);

	// library memset (byte fill)
	started = micros();
	memset(dst, 66, 4*WORDS);
	elapsed = micros() - started;
	Serial.print("memset ");
	Serial.println(elapsed, DEC);
	Serial.println(dst[3], HEX);

	// word-by-word fill loop
	started = micros();
	for (k = 0; k < WORDS; k++) {
		dst[k] = 66;
	}
	elapsed = micros() - started;
	Serial.print("set loop ");
	Serial.println(elapsed, DEC);
	Serial.println();

	delay(3000);
}

I have tried random changes to the various DMA regs to no avail ... I was hoping the dma_init would do a copy from src[] to dst[], then I was going to generalize to memcpy32()

Any hints would be appreciated.

thanks
 
Software triggered DMA is much easier. You don't need to mess with interrupts, unless you want asynchronous notification of completion. Normally it will complete very quickly... the time to move the TCD in and out of the DMA engine plus the raw bus speed (192 Mbyte/sec at 96 MHz), unless other DMA transfers are in operation. You do not need to use the request bits or the DMA request mux, since you'll manually trigger. Most of the complex DMA features are completely unnecessary.

Just write the transfer you want to the TCD registers. You really only care about the source and destination address and increment, and the attributes to configure for 32 bit bus operations. Set the minor loop size to do the entire operation, and the 2 iteration counts to 1, since you want to do the entire transfer upon a single trigger. The registers that adjust things after the transfer don't matter... unless you want to reuse the settings without writing the entire TCD again. To trigger the DMA, write the CSR last, setting the start bit to make it go. When it's done, the DONE bit will be set.

Like this....

Code:
// Words to be copied by the demonstration.
int src[3] = { 12345678, 9782742, -5829348 };
// Destination, pre-filled with different values so the copy is visible.
int dst[3] = { 8327, -14984320, 489823 };

// Prints both buffers, copies src to dst with memcpy32, then prints both
// buffers again so the effect of the copy can be compared.
void setup()
{
        const int words = 3;  // number of elements in src and dst

        while (!Serial) {
                // wait
        }
        delay(500);

        Serial.println("Before memcpy32:");
        print_buffer(src, words, "  src");
        print_buffer(dst, words, "  dst");

        memcpy32(dst, src, words);

        Serial.println("After memcpy32:");
        print_buffer(src, words, "  src");
        print_buffer(dst, words, "  dst");
}

// Nothing to repeat: all of the work happens once in setup().
void loop() {}

// Prints one line over Serial of the form "<name>: v0, v1, ..., vN".
//   buf  - values to print
//   num  - number of values in buf
//   name - label printed before the values
void print_buffer(const int *buf, int num, const char *name)
{
        Serial.print(name);
        Serial.print(": ");
        for (int idx = 0; idx < num; ++idx) {
                // separator goes between values, never after the last one
                if (idx > 0) {
                        Serial.print(", ");
                }
                Serial.print(buf[idx]);
        }
        Serial.println();
}

// Copies `count` 32-bit words from `src` to `dest` using a software-started
// transfer on DMA channel 1, and returns once the channel reports DONE.
//   dest  - destination buffer, at least `count` words
//   src   - source buffer, at least `count` words
//   count - number of 32-bit words to copy; 0 is a no-op
// The whole copy is one minor loop (NBYTES = count*4) with iteration counts
// of 1, so a single START moves everything.
void memcpy32(int *dest, const int *src, unsigned int count)
{
        // An NBYTES value of 0 does not mean "nothing to do": per the K20
        // reference manual the eDMA treats it as a 4 GB transfer. Bail out.
        if (count == 0) return;

        DMA_TCD1_SADDR = src;
        DMA_TCD1_SOFF = 4;                      // step source 4 bytes per read
        DMA_TCD1_ATTR = DMA_TCD_ATTR_SSIZE(2) | DMA_TCD_ATTR_DSIZE(2);  // 32-bit reads and writes
        DMA_TCD1_NBYTES_MLNO = count * 4;       // entire copy in one minor loop
        DMA_TCD1_SLAST = 0;                     // no post-transfer source adjustment
        DMA_TCD1_DADDR = dest;
        DMA_TCD1_DOFF = 4;                      // step destination 4 bytes per write
        DMA_TCD1_CITER_ELINKNO = 1;
        DMA_TCD1_DLASTSGA = 0;                  // no post-transfer destination adjustment
        DMA_TCD1_BITER_ELINKNO = 1;
        // CSR is written last: setting START launches the transfer.
        DMA_TCD1_CSR = DMA_TCD_CSR_START;
        while (!(DMA_TCD1_CSR & DMA_TCD_CSR_DONE)) /* wait */ ;
}
 
Last edited:
Excellent.

thanks

The DMA memcpy takes only 23us for 1000 4-byte words, compared with 95us for the newlib (v1.18) memcpy. The teensy was running at 96MHz. At 48MHz, the times are about the same, so I infer the memory runs at the bus speed (48MHz). The newlib memcpy() uses an unrolled loop (64) of ldr.w str.w. v1.20 of newlib actually has an ARM-specific memcpy.S that uses an unrolled loop (64) of ldrd/strd.

Modified the sketch to use ISR on DMA completion, and then started DMA memcpy32() followed by library memcpy(), using two different destination vectors. The DMA version completed first taking 27us, and the memcpy() took 105us -- total for both was 107us.

Similar tests were run on maple and DUE, see mem2mem.txt at
https://github.com/manitou48/DUEZoo
 
Last edited:
Status
Not open for further replies.
Back
Top