Large DMA transfers

xxxajk

Well-known member
I'm noticing that if I try to do a DMA transfer between SPI and DMAMEM, and the transfer is larger than 64K, I end up with garbage on the output, and I have no clue whether the input is corrupting anything.
So the question is, _WITHOUT_ using any ISRs, how can I do a continuous transfer of 200KB in the background using DMA?

Attaching pictures and sketch...

1MHz/2Mhz work as expected, but then, it's only looping 25000/50000 bytes
1MHz.png
2MHz.png


But 4MHz,8MHz, nup...
4MHz.png
8MHz.png


And here's the sketch:
C:
#include <Arduino.h>
#include <DMAChannel.h>
#include <SPI.h>

/*
 * Windowed SPI slave mode loopback using DMA
 *
 * Connect pin 23 to pin 13 for sampler clock
 * Connect pin 9  to 10 for !CS (spi frame reset)
 * Connect pin 11 to 12 for loopback
 *
 *
 * Circular single buffer copies back to self via DMA. RX should be behind TX, always.
 *
 */

/*
 *  Set the sample rate to one of 1,2,4,8
 * 
 *  Monitor pins 11/12 on oscilloscope in dropout mode.
 *  1MHz MAX_BUFFER_SIZE: 200000, window_bytes: 25000  use 1000nS for detecting dropouts WORKS
 *  2MHz MAX_BUFFER_SIZE: 200000, window_bytes: 50000  use  500nS for detecting dropouts WORKS
 *  4MHz MAX_BUFFER_SIZE: 200000, window_bytes: 100000 use  250nS for detecting dropouts BAD_OUTPUT
 *  8MHz MAX_BUFFER_SIZE: 200000, window_bytes: 200000 use  125nS for detecting dropouts BAD_OUTPUT
 * 
 */
// Requested sample clock in MHz. set_clock_freq() multiplies it by 1000000 to
// get the PWM frequency, and set_windowed_bytes() clamps it to 1..8 before
// sizing the window.
uint32_t sample_rate = 1; // MHz


// SPI port object used for the slave side of the loopback.
#define SPIS SPI
// Chip-select input pin (wired to SPIS_CSO, see the header comment).
#define SPIS_CSI 10
// Chip-select output pin driven by start_clock()/stop_clock().
#define SPIS_CSO 9
// PWM pin that generates the sample clock.
#define CLOCK_OUT 23
// DMA request sources used to trigger the RX and TX channels.
#define SPIS_DMAMUX_SOURCE_RX DMAMUX_SOURCE_LPSPI4_RX
#define SPIS_DMAMUX_SOURCE_TX DMAMUX_SOURCE_LPSPI4_TX

// Length of the sample window in nanoseconds (200 ms).
#define NSEC_SAM ((uint32_t)200000000u)
// Nanoseconds per sample at x MHz (integer division). The argument is
// parenthesized so that expressions such as NSECS(a + b) expand correctly.
#define NSECS(x) ((uint32_t)((uint32_t)1000u/(x)))
// Highest supported sample rate in MHz.
#define MAX_RATE ((uint32_t)8u)
// Bytes needed to hold one window of 1-bit samples at x MHz (8 samples per byte).
#define BYTES_AT_MHZ(x) ((uint32_t)((NSEC_SAM/NSECS(x))/8u))
// Buffer size required at the highest supported rate.
#define MAX_BUFFER_SIZE (BYTES_AT_MHZ(MAX_RATE))

// DMA channels for the receive and transmit directions. They are constructed
// with `false` here and set up with begin(true) in initSPISlaveDMA().
DMAChannel rx(false);
DMAChannel tx(false);
// Register block of the LPSPI instance used as the slave.
IMXRT_LPSPI_t *spis_regs = &IMXRT_LPSPI4_S;


// Number of buffer bytes in the active window; assigned by set_windowed_bytes().
uint32_t windowed_bytes;
// Single buffer shared by both DMA directions, placed in DMAMEM and aligned to
// 32 bytes; sized for the highest supported sample rate.
DMAMEM uint8_t sample_buffer[MAX_BUFFER_SIZE]__attribute__((aligned(32)));
// Declared here so setup() can change the CPU clock.
extern "C" uint32_t set_arm_clock(uint32_t frequency);

/*
 * Configure both DMA channels for the loopback and leave them disabled.
 *
 * RX moves bytes from the LPSPI receive data register into sample_buffer;
 * TX moves bytes from the same buffer into the transmit data register.
 * Both wrap after windowed_bytes bytes and are triggered by the LPSPI4
 * DMA request sources.
 *
 * NOTE(review): the byte count is handed straight to the circular helpers.
 * The discussion accompanying this sketch reports that the DMA major loop
 * can only count up to 32767 transfers, which matches the observed failure
 * for the 100000/200000 byte windows. Confirm against the reference manual
 * and consider a larger minor loop (NBYTES) or chained TCDs.
 */
void initSPISlaveDMA() {
        rx.begin(true);
        rx.source((uint8_t &) spis_regs->RDR); // read one byte at a time from the receive register
        rx.destinationCircular(sample_buffer, windowed_bytes);
        rx.triggerAtHardwareEvent(SPIS_DMAMUX_SOURCE_RX);
        rx.disable(); // stays off until start_clock()
        
        tx.begin(true);
        tx.sourceCircular(sample_buffer, windowed_bytes);
        tx.destination((uint8_t &) spis_regs->TDR); // write one byte at a time to the transmit register
        tx.triggerAtHardwareEvent(SPIS_DMAMUX_SOURCE_TX);
        tx.disable(); // stays off until start_clock()
}

void initSPISlave() {
        spis_regs->CR &= ~LPSPI_CR_MEN;
        spis_regs->CR = LPSPI_CR_RST; //Master Logic reset! (Control Register => Software Reset)
        spis_regs->CR &=  ~LPSPI_CR_RST; //Master Logic reset! (Control Register => Software Reset)
        spis_regs->TCR = LPSPI_TCR_FRAMESZ(7); //8 bit Mode
        spis_regs->DER = LPSPI_DER_RDDE | LPI2C_MDER_TDDE; //RX/TX DMA Request Enable
        spis_regs->FCR = 4; // buffer 4 bytes
        spis_regs->CR |= LPSPI_CR_MEN; //Enable SPI Module!
}

// True while the DMA channels have been enabled by start_clock().
bool started=false;

/*
 * Halt the loopback: release chip select, park the clock output, and turn
 * the DMA channels off if they were running.
 */
void stop_clock() {
        digitalWriteFast(SPIS_CSO, HIGH); // chip select inactive
        analogWrite(CLOCK_OUT, 7); // full-scale PWM value, so the pin stops toggling
        if(!started) {
                return; // channels were never enabled, nothing more to do
        }
        rx.disable();
        tx.disable();
        started=false;
}

/*
 * Start the loopback: rewind both DMA channels to the start of the window,
 * enable them, then assert chip select and start the sample clock.
 *
 * The channels are enabled before the clock starts so that no clock edge
 * arrives while DMA is still off.
 */
void start_clock() {
        // Re-program the circular addresses/counts for the current window size.
        rx.destinationCircular(sample_buffer, windowed_bytes);
        tx.sourceCircular(sample_buffer, windowed_bytes);
        rx.enable();
        tx.enable();
        started=true;
        digitalWriteFast(SPIS_CSO, LOW); // should reset input and I hope output...
        analogWrite(CLOCK_OUT, 4); // 3 or 4, close enough (about half of the 3-bit range set in setup())
}

/*
 * Stop the loopback and program the clock pin for sample_rate MHz.
 * The clock stays parked until start_clock() is called.
 */
void set_clock_freq() {
        stop_clock();
        const uint32_t clock_hz = 1000000*sample_rate; // MHz -> Hz
        analogWriteFrequency(CLOCK_OUT, clock_hz);
}

void set_windowed_bytes(void) {
        if(sample_rate>8) sample_rate=8;
        if(sample_rate<1) sample_rate=1;
        windowed_bytes=BYTES_AT_MHZ(sample_rate);
        Serial.printf("\nMAX_BUFFER_SIZE: %i, windowed_bytes: %i\n\n", MAX_BUFFER_SIZE, windowed_bytes);
}

/*
 * Print the active window as hex bytes, 64 per line, followed by the
 * window size.
 *
 * NOTE(review): the buffer is written by DMA; if it lives in cached memory
 * the CPU may read stale cache contents here. Confirm whether a cache
 * invalidate is needed before dumping.
 */
void dump_buf() {
        uint32_t column=0;
        for(uint32_t i=0; i<windowed_bytes; i++) {
                Serial.printf("%02x ", (unsigned)sample_buffer[i]);
                if(++column==64) {
                        // wrap the line after 64 bytes
                        column=0;
                        Serial.printf("\n");
                }
        }
        Serial.printf("\n%lu\n\n",(unsigned long)windowed_bytes);
        Serial.flush();
}
void poison_samples(uint32_t count) {
        for(int i=0; i<MAX_BUFFER_SIZE; i++) {
                sample_buffer[i]=i<count?0x55:0x81;
        }       
}

/*
 * One-time initialisation: set the CPU clock, prepare the clock and chip
 * select pins, bring up the SPI port and its pads, size and poison the
 * buffer, configure LPSPI and DMA, then start the loopback.
 */
void setup()
{
        // manipulate clock.
        // NOTE(review): 64000000 is 64 MHz, while the comment below speaks of
        // "640" and overclocking. Confirm whether 640000000 was intended.
        set_arm_clock(64000000); // 640, slight overclocking required for perfect clocking.
        analogWriteResolution(3); // 3-bit PWM values (0..7) for the clock output
        pinMode(SPIS_CSO, OUTPUT); // pin 9, connects to pin 10
        pinMode(CLOCK_OUT, OUTPUT); // pin 23, outputs to external logic and pin 13 (CLK)
        // NOTE(review): sample_rate is used here before set_windowed_bytes()
        // clamps it further down.
        set_clock_freq();
        while (!Serial) ; // wait
        SPIS.begin();
        pinMode(SPIS_CSI, INPUT);
        SPIS.setCS(SPIS_CSI);
        // Apply the same pad settings to all four LPSPI4 pins.
        IOMUXC_SW_PAD_CTL_PAD_GPIO_B0_01 = IOMUXC_PAD_DSE(3) | IOMUXC_PAD_SPEED(3) | IOMUXC_PAD_PKE; /* LPSPI4 IN  (MISO) 12 */
        IOMUXC_SW_PAD_CTL_PAD_GPIO_B0_02 = IOMUXC_PAD_DSE(3) | IOMUXC_PAD_SPEED(3) | IOMUXC_PAD_PKE; /* LPSPI4 OUT (MOSI) 11 */
        IOMUXC_SW_PAD_CTL_PAD_GPIO_B0_03 = IOMUXC_PAD_DSE(3) | IOMUXC_PAD_SPEED(3) | IOMUXC_PAD_PKE; /* LPSPI4 CLK (SCLK)  13 */
        IOMUXC_SW_PAD_CTL_PAD_GPIO_B0_00 = IOMUXC_PAD_DSE(3) | IOMUXC_PAD_SPEED(3) | IOMUXC_PAD_PKE; /* LPSPI4 CSI (NCS)  10 */
        set_windowed_bytes();
        poison_samples(windowed_bytes); // test if we go too far
        // LPSPI is reset and configured first, then the DMA channels that use it.
        initSPISlave();
        initSPISlaveDMA();
        // for now...
        start_clock();
        
}

/*
 * Intentionally empty: after setup() starts the loopback, this sketch does
 * no further work from the main loop.
 */
void loop()
{
        // Nothing to see here...
}
 
Most (all?) of the DMAChannel stuff seems only to make use of the major loop, which can only do 32767 transfers (section 6.5.5.32 in the Reference Manual); depending on the transfer size of 8, 16, 32 or 64 bits, you could get up to 262,136 bytes in one go, in theory. Using SPI you might be more limited - maybe to 32-bit values? Not sure ... and your transfer size would have to be a multiple of 4 bytes, of course.

If you look into using the minor loop, you can get more data transferred, but it seems you can only transfer 16 values per minor loop, because that's the FIFO depth on the LPSPI peripheral. Still, that does mean you could get 32767*16*4 = 2,097,088 bytes transferred in one go.

If you further look into chaining DMA requests by using the DLASTSGA register (section 6.5.5.30), e.g. via the replaceSettingsOnCompletion() method, you can essentially get arbitrarily-long transfers.

All this seems to require long and hard perusal of the code and Reference Manual, though I must confess I haven't spent a lot of time looking for library examples.
 
The question is a bit ambiguous … but I’m pretty sure it’s about SPI transfers, not memory-to-memory.

I just noticed the ‘scope settings … don’t use 1x on your probes, 10x is the correct setting for most purposes.
 
The question is a bit ambiguous … but I’m pretty sure it’s about SPI transfers, not memory-to-memory.

I just noticed the ‘scope settings … don’t use 1x on your probes, 10x is the correct setting for most purposes.
Yeah, memory -> spi (loop back to) spi -> memory. can't use larger words either, unfortunately, since that would impose too much latency, where the output will actually be sometimes modified and looped back.
Think of it as thus:
I have a bit stream, that has some kind of meaning that goes out to other hardware, which immediately may modify what's looped back, and eventually will want to see and react to those bits 200mS later in the future... think like a gigantic shift register FIFO of sorts, or a digital delay-line.


Scope... I could switch to 10X, but everything here is totally fine with 1x... yeah yeah impedance, etc, yadda-yadda... doesn't affect the output anyway, could just as well used digital mode, meh. ;-) As long as I can read the signals, and trigger at the correct point, doesn't matter. my probes are the usual crap 1x/10x , some are worse than others and from my older analog scope (20MHz) but it doesn't matter...
Usually I only use 10x mode when I am dealing with something that has a super weak current, like checking a crystal oscillator is working.
i don't mind seeing the extra reflections/current disturbances either, or the cross-talk. It's all just prototype and "figure it out" stuff anyway.
 
Maybe this can help?


Also check the API and comments in DMAChannel.h.

Already went through the DMA in the core... what's odd is that:
1: it accepts counts that can be too high
2: Seems to be able to do 65535 transactions maximum on T4.1 (T4.0? dunno, might check that, should be the same?)
3: the excessive counts seem to touch other locations in the DMA controller, somehow...

I dunno how to do chaining 4 DMA (at 50,000 each, will just get me to 200,000) for one of the directions... So I'd need to eat 8 of the 16 in that case... :-/
Lastly, seems that DMA is fast enough at 8MHz to not require any buffers on TX, which isn't a huge surprise to me. I haven't tried it directly to/from PSRAM yet, but I suspect the hiccups could be too much, even at 11 times the data rate.
 
Most (all?) of the DMAChannel stuff seems only to make use of the major loop, which can only do 32767 transfers (section 6.5.5.32 in the Reference Manual);

Which date is the RM?
i.MX RT1060X Processor Reference Manual, Rev. 1, 05/2022 stops at 6.5.5.3...
 
I've done large transfers with two channels. The first one copies ($X-$Y) bytes and links to the second channel after a major loop. The second channel copies $Y bytes and links to the first channel after every minor loop. The loop count of the second channel determines how many multiples of $X bytes get copied in total. For memory to memory (i.e. untriggered) transfers this requires the bug mentioned here to be fixed, but it should work fine with hardware triggered transfers.
 
I've done large transfers with two channels. The first one copies ($X-$Y) bytes and links to the second channel after a major loop. The second channel copies $Y bytes and links to the first channel after every minor loop. The loop count of the second channel determines how many multiples of $X bytes get copied in total. For memory to memory (i.e. untriggered) transfers this requires the bug mentioned here to be fixed, but it should work fine with hardware triggered transfers.
Excellent. So you trigger a DMA to program a DMA from a linked list... I like it. (or are you just chaining DMA...)
Got an example? Sizes I'd need are 25000, 50000, 100000 and 200000, ideally just the last two...
 
Last edited:
In any case, once working, and because cache coherency doesn't matter until I manually halt the loop (to inspect it), where I would end up using DMA (again) to copy to a PSRAM block later on.
Eventually I will be needing to do 2 of these loop back in parallel, for a total of two (either 100000, or 200000 byte arenas) which just manage to fit in DMAMEM. thankfully I will have several milliseconds to do copy to swap from PSRAM, but it also must basically be able to continue from the last point after the manual stop and swap plus some advancement in bytes, but that should be manageable I'd think, with some additional tricks, although not immediately critically required.
 
Doing more datasheet reading, and wouldn't using scatter/gather with a looping list do what I need from a single channel?
 
... a circular data queue in which the size of the queue is a power of 2... ugh

You can usually achieve pretty much the same thing with this size and memory alignment requirement by using ordinary mode with the registers to adjust the destination upon the last transfer, or by replacing the whole TCD when complete. Maybe the special circular buffer mode has some unique uses, but I've personally never really understood why NXP bothered to put it into the hardware. My hunch is they probably had some important large customer demanding a specific circular buffer feature and maybe some stubborn folks unwilling to hear that the hardware offers plenty of ways to achieve such a thing. Or maybe the circular buffer feature came earlier (in their long history of iMX application processors) and the more powerful features added later make it unneeded, but NXP couldn't remove it because plenty of software was already written? Just guesswork really, but from experience using DMA the specific mode for circular buffer really feels unnecessary since the hardware offers better ways to achieve this result.
 
Last edited:
Doing more datasheet reading, and wouldn't using scatter/gather with a looping list do what I need from a single channel?
It would, but if the total range is large (or non-consecutive) the amount of DMASetting variables required tends to blow up pretty quickly. And you have to remember that when a new DMASetting is loaded in, the old state simply gets overwritten; it doesn't get written back to the original memory that it was loaded from.
 
You can usually achieve pretty much the same thing with this size and memory alignment requirement by using ordinary mode with the registers to adjust the destination upon the last transfer, or by replacing the whole TCD when complete. Maybe the special circular buffer mode has some unique uses, but I've personally never really understood why NXP bothered to put it into the hardware. My hunch is they probably has some important large customer demanding a specific circular buffer feature and maybe some stubborn folks unwilling to hear that the hardware offers plenty of ways to achieve such a thing. Or maybe the circular buffer feature came earlier (in their long history of iMX application processors) and the more powerful features added later make it unneeded, but NXP couldn't remove it because plenty of software was already written? Just guesswork really, but from experience using DMA the specific mode for circular buffer really feels unnecessary since the hardware offers better ways to acheive this result.
For me the primary use of the circular buffer is for hardware scrolling a display, for example: a 320x200x8bpp framebuffer fits inside 65536 bytes. So you allocate 64KB on a 64KB boundary and lock down the upper 16 DMA address bits using SMOD, program to transfer 320*200 bytes, and whenever you want to scroll horizontally or vertically you only have to update the lower half of SADDR - when the DMA reaches the end of the 64KB window it will automatically wrap back to the top and continue.
 
Doing more datasheet reading, and wouldn't using scatter/gather with a looping list do what I need from a single channel?
Yup, as implied in
If you further look into chaining DMA requests by using the DLASTSGA register (section 6.5.5.30), e.g. via the replaceSettingsOnCompletion() method, you can essentially get arbitrarily-long transfers.
I said you’d need to do plenty of datasheet reading :)

Given your need for 8-bit transfers, each TCD could have a major count of 25,000, so you’d need 8 sets of settings for a 200,000 bytes fire-and-forget transfer. Very doable.
 
You can safely set NBYTES to whatever value you like and still have 8 bit transfers (since that depends on ATTR_SRC and ATTR_DST). The simplest solution to accomplish a transfer of 200000 bytes would be NBYTES=8 and CITER/BITER=25000 but you're not going to get that using only the API functions, because they try to be clever and set the access size based on the transfer size (support for 8 byte transfers is missing anyway and would default to single byte). Just set the registers manually.
 
You can safely set NBYTES to whatever value you like
Not quite true, I discovered, though your suggestion of NBYTES=8 will be OK. I think the minor loop is relatively dumb, and fires out NBYTES whenever requested, all in one go. Because the SPI FIFO is only 16 words, setting NBYTES to a greater value than this will overflow the FIFO.
 
Not quite true, I discovered, though your suggestion of NBYTES=8 will be OK. I think the minor loop is relatively dumb, and fires out NBYTES whenever requested, all in one go. Because the SPI FIFO is only 16 words, setting NBYTES to a greater value than this will overflow the FIFO.
Then you set the SPI FIFO watermark to 8 bytes to ensure they will fit.
 
Sure, but you cannot set NBYTES greater than 16 (for SPI), so saying "You can safely set NBYTES to whatever value you like" is misleading...
 
It would, but if the total range is large (or non-consecutive) the amount of DMASetting variables required tends to blow up pretty quickly. And you have to remember that when a new DMASetting is loaded in, the old state simply gets overwritten; it doesn't get written back to the original memory that it was loaded from.
It is totally contiguous. Long list? That's totally fine, especially if I can point the tail back to the head and loop it.
Of the DMAMEM's 512K, there would be a maximum of two 200,000-byte buffers, so:
524288-400000=124288 bytes of DMAMEM left over for other things, which is more than enough.
Data would be shoved to/from PSRAM at various times anyway (both blocks), and perhaps, just perhaps if PSRAM's interface can cope (which it should, but I need to test that!) then DMAMEM wouldn't be touched ever. I'm just not going to try using it directly at this point, just in case the caches from PSRAM can't deal with 4 DMA streams in parallel, even if the two sets would be hitting the same cache lines pretty much the majority of the time. As I mentioned before, PSRAM is clocked at 11 times the data rate and in quad mode...

8MHz per byte v.s. 88MHz/nybble... even with overhead, that should keep up, but as per usual, I keep a "Plan B" in-mind just in case.
 
Yup, as implied in

I said you’d need to do plenty of datasheet reading :)

Given your need for 8-bit transfers, each TCD could have a major count of 25,000, so you’d need 8 sets of settings for a 200,000 bytes fire-and-forget transfer. Very doable.
Yeah, but can the last (TAIL) TCD point to the first (HEAD) TCD automatically? The terse example from NXP (AN14300) just has two tiny chunks, and from what I understand, fires once, and stops. It's not even clear what the struct actually looks like for this, unless I was do grab the whole thing for expresso, which I guess I could do...

Point is that I'd like it to loop over and over, with no CPU interaction whatsoever, except to pause it, where if I need to, I would sync the dma cache, and move the data to PSRAM.

Once PSRAM is filled with the various patterns, they can be replayed at any time later, and stored to SD/USB or sent over ethernet to be recalled as needed. Currently 16MB limit of PSRAM is *just* enough to get what I need done for the 8MHz looped sample rate, although I *should* be able to get away with 4MHz, I might not be able to due to nyquist limits, because I am trapping high-to-low edges, and those come from a pretty odd not-synced clock, which changes starting with a 16MHz clock divided by 13 (Yes, strange), and gets converted to a 1uS pulse within that 200mS loop, and it's not aligned in any real particular place, because there's no way to get the clock, so I have to sample it... Because the position of it can constantly change, I can't use a timer to count the time between edges, and keep them in order and within the 200mS...

Think of it like the usual digital trace mode on a 'scope, but it plays the sniffed edges (1uS pulse actually) back, and when it returns the second time, the position of that pulse can actually be moved, or totally eliminated altogether.

For another analogy, for the really old people looking in, imagine an 8track tape, or a radio studio cart, that has 200mS of tape in the loop, and that you could play, and get back a slightly delayed possible copy, or modified thing back, which gets recorded right back to tape, which, loops, forever... until you press the "change track" button on it.

And lastly, for the super old or retro nerds, remember CRT memory? How you had to loopback refresh it? Yeah, same thing here.
 
The problem I'm having here is how do I write an example that demonstrates repeatedly copying a 200000 byte buffer in a loop without any CPU interaction? Loop the data through SPI and just break at random intervals to compare sent vs received data?
 
Yeah, but can the last (TAIL) TCD point to the first (HEAD) TCD automatically?
Don't see why not - since you're loading the entire TCD from a memory image each time, that image can have arbitrary contents including looping back to the start.

That does sound like you'll be hogging the DMA hardware permanently, though, which may give rise to all sorts of other problems. @jmarsh is your real expert here...
 
Back
Top