// Code:
// Fast parallel NeoPixel driver, generalized to 32 channels, timing set for a 256MHz Teensy 3.6.
// Loren Carpenter, January 2020.
// Runs at ~1 microsecond per bit, or ~1.3 million pixels per second. You will need a NOP delay with fewer channels.
// The bit assignment is somewhat arbitrary and is easily reconfigured.
// Copy bit 0 of S into bit POS of D, leaving every other bit of D unchanged.
// POS must be a literal integer: it is pasted into the instruction text.
// 32-bit ARM builds emit a single BFI instruction; any other target gets an
// equivalent C expression so the TRANSPOSE_* macros can also be built and
// exercised off-target.
#if defined(__arm__)
#define NEO_BFI1(D,S,POS) \
  __asm__ __volatile__ ( "bfi %[d],%[s],#" #POS ",#1 \n\t" : [d] "+r" (D) : [s] "r" (S) : )
#else
#define NEO_BFI1(D,S,POS) \
  ((D) = ((unsigned)(D) & ~(1u << (POS))) | (((unsigned)(S) & 1u) << (POS)))
#endif

// TRANSPOSE_x(R, S0..S7, M): gather bit M of each of the eight source values
// S0..S7 into the single port word R, one source per output pin.
// R must be an lvalue; it is evaluated several times.  The explanatory
// comments sit above each #define because a line continuation only works
// when the backslash is the very last character on the line.

// Port B uses bits 0-3,16-19
#define TRANSPOSE_B(R,S0,S1,S2,S3,S4,S5,S6,S7,M) \
{ \
  R = (S0)>>(M); \
  NEO_BFI1(R, (S1)>>(M), 1); \
  NEO_BFI1(R, (S2)>>(M), 2); \
  NEO_BFI1(R, (S3)>>(M), 3); \
  NEO_BFI1(R, (S4)>>(M), 16); \
  NEO_BFI1(R, (S5)>>(M), 17); \
  NEO_BFI1(R, (S6)>>(M), 18); \
  NEO_BFI1(R, (S7)>>(M), 19); \
  R &= 0xF000F; /* don't set unused pins */ \
}

// Ports C and D use bits 0-7.
// No mask is applied here: bits 8 and up of R are whatever (S0)>>(M) leaves
// there, which is zero when the sources are byte values (0-255).
#define TRANSPOSE_8(R,S0,S1,S2,S3,S4,S5,S6,S7,M) \
{ \
  R = (S0)>>(M); \
  NEO_BFI1(R, (S1)>>(M), 1); \
  NEO_BFI1(R, (S2)>>(M), 2); \
  NEO_BFI1(R, (S3)>>(M), 3); \
  NEO_BFI1(R, (S4)>>(M), 4); \
  NEO_BFI1(R, (S5)>>(M), 5); \
  NEO_BFI1(R, (S6)>>(M), 6); \
  NEO_BFI1(R, (S7)>>(M), 7); \
}
#define TRANSPOSE_C(R,S0,S1,S2,S3,S4,S5,S6,S7,M) TRANSPOSE_8(R,S0,S1,S2,S3,S4,S5,S6,S7,M)
#define TRANSPOSE_D(R,S0,S1,S2,S3,S4,S5,S6,S7,M) TRANSPOSE_8(R,S0,S1,S2,S3,S4,S5,S6,S7,M)

// Port E uses bits 0-5,10,11
#define TRANSPOSE_E(R,S0,S1,S2,S3,S4,S5,S6,S7,M) \
{ \
  R = (S0)>>(M); \
  NEO_BFI1(R, (S1)>>(M), 1); \
  NEO_BFI1(R, (S2)>>(M), 2); \
  NEO_BFI1(R, (S3)>>(M), 3); \
  NEO_BFI1(R, (S4)>>(M), 4); \
  NEO_BFI1(R, (S5)>>(M), 5); \
  NEO_BFI1(R, (S6)>>(M), 10); \
  NEO_BFI1(R, (S7)>>(M), 11); \
  R &= 0xC3F; /* don't set unused pins */ \
}
// One-instruction delay: moves r0 into itself. It is written as inline
// assembly marked volatile so the compiler does not remove it.
#define NOP __asm__ __volatile__ (" mov r0,r0 \n\t" : : )
// Fixed-length delay runs built from NOP; the number in the name is the
// count of NOP instructions emitted. Each expands to a brace-enclosed
// compound statement, so use it as a statement of its own (e.g. `NOP40;`).
#define NOP10 {NOP;NOP;NOP;NOP;NOP;NOP;NOP;NOP;NOP;NOP;}
#define NOP30 {NOP10;NOP10;NOP10;}
#define NOP40 {NOP10;NOP10;NOP10;NOP10;}
// Number of LEDs on each channel, and the resulting bytes per channel
// (three colour bytes per LED).
#define NLED 1000
#define NRGB (3 * NLED)

// Colour data to send, in NeoPixel GRB order: one NRGB-byte buffer per
// output channel, s1 through s32.
// s1-s8 are read into the port B channel variables by sendAll().
unsigned char s1[NRGB], s2[NRGB], s3[NRGB], s4[NRGB];
unsigned char s5[NRGB], s6[NRGB], s7[NRGB], s8[NRGB];
// s9-s16 are read into the port C channel variables.
unsigned char s9[NRGB], s10[NRGB], s11[NRGB], s12[NRGB];
unsigned char s13[NRGB], s14[NRGB], s15[NRGB], s16[NRGB];
// s17-s24 are read into the port D channel variables.
unsigned char s17[NRGB], s18[NRGB], s19[NRGB], s20[NRGB];
unsigned char s21[NRGB], s22[NRGB], s23[NRGB], s24[NRGB];
// s25-s32 are read into the port E channel variables.
unsigned char s25[NRGB], s26[NRGB], s27[NRGB], s28[NRGB];
unsigned char s29[NRGB], s30[NRGB], s31[NRGB], s32[NRGB];
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Clock out the contents of all 32 channel buffers (s1..s32) in parallel.
//
// For each of the NRGB byte positions, one byte is read from every channel
// and its eight bit positions are emitted in turn, bit 0 first.  Each bit
// period consists of three writes to the four port output registers:
//   1. every used pin high,
//   2. the transposed data word (a pin stays high only where its bit is 1),
//   3. every pin low.
// The NOP40 runs between the writes act as the delays that separate them.
//
// Channel-to-pin mapping:
//   s1-s4   -> port B bits 0-3      s5-s8   -> port B bits 16-19
//   s9-s16  -> port C bits 0-7      s17-s24 -> port D bits 0-7
//   s25-s30 -> port E bits 0-5      s31,s32 -> port E bits 10,11
//
// This function only writes the GPIOx_PDOR registers; the caller must
// already have configured all of the pins above as outputs.
// NOTE(review): bits are shifted out least-significant first (shift count
// runs 0..7) -- confirm the buffers are filled in the bit order the LEDs expect.
void sendAll() {
  int b0, b1, b2, b3, b16, b17, b18, b19;
  int c0, c1, c2, c3, c4, c5, c6, c7;
  int d0, d1, d2, d3, d4, d5, d6, d7;
  int e0, e1, e2, e3, e4, e5, e10, e11;
  int tb, tc, td, te;
  int i, j;

  // Make sure all used pins are configured for output.
  // Start with every line low and hold it there for 30 us before the first
  // bit (presumably the strip's reset/latch gap -- TODO confirm duration).
  GPIOB_PDOR = 0;
  GPIOC_PDOR = 0;
  GPIOD_PDOR = 0;
  GPIOE_PDOR = 0;
  delayMicroseconds(30);

  for (i = 0; i < NRGB; i++) {
    // Fetch byte i of every channel into locals named after the pin each
    // one drives.  Every one of the 32 locals must be loaded exactly once.
    b0  = s1[i];
    b1  = s2[i];
    b2  = s3[i];
    b3  = s4[i];
    b16 = s5[i];
    b17 = s6[i];
    b18 = s7[i];
    b19 = s8[i];
    c0  = s9[i];
    c1  = s10[i];
    c2  = s11[i];
    c3  = s12[i];
    c4  = s13[i];
    c5  = s14[i];
    c6  = s15[i];
    c7  = s16[i];
    d0  = s17[i];
    d1  = s18[i];
    d2  = s19[i];
    d3  = s20[i];
    d4  = s21[i];
    d5  = s22[i];
    d6  = s23[i];
    d7  = s24[i];
    e0  = s25[i];
    e1  = s26[i];
    e2  = s27[i];
    e3  = s28[i];
    e4  = s29[i];
    e5  = s30[i];
    e10 = s31[i];
    e11 = s32[i];

    for (j = 0; j < 8; j++) {
      // Build the four port words for bit j up front, so the three register
      // writes below are separated only by the NOP delays.
      TRANSPOSE_B (tb,b0,b1,b2,b3,b16,b17,b18,b19,j);
      TRANSPOSE_C (tc,c0,c1,c2,c3,c4,c5,c6,c7,j);
      TRANSPOSE_D (td,d0,d1,d2,d3,d4,d5,d6,d7,j);
      TRANSPOSE_E (te,e0,e1,e2,e3,e4,e5,e10,e11,j);

      // 1. Raise every used pin.  The masks list exactly the pins each port
      //    drives and match the masks applied by the TRANSPOSE_* macros.
      GPIOB_PDOR = 0xF000F;   // bits 0-3, 16-19
      GPIOC_PDOR = 0xFF;      // bits 0-7
      GPIOD_PDOR = 0xFF;      // bits 0-7
      GPIOE_PDOR = 0xC3F;     // bits 0-5, 10, 11
      NOP40;

      // 2. Output the data: pins whose bit is 0 drop now, the rest stay high.
      GPIOB_PDOR = tb;
      GPIOC_PDOR = tc;
      GPIOD_PDOR = td;
      GPIOE_PDOR = te;
      NOP40; NOP40;

      // 3. Drop every pin to finish the bit.
      GPIOB_PDOR = 0;
      GPIOC_PDOR = 0;
      GPIOD_PDOR = 0;
      GPIOE_PDOR = 0;
      // add delay here if you use fewer than 32 channels (~3-4 NOPs per channel removed)
    }
  }