We have been building art cars (mutant vehicles) for Burning Man for many years. Our current one is based on a solar electrified Worksman side-by-side tricycle. For night driving it is illuminated by 22 pairs of NeoPixel (WS2812) strips for a total of ~6400 RGB LEDs. The strips are in pairs because half of them are always on the other side.
Anyway, many of the dozens of light patterns are animated, and there was a strong desire for the animation to be as smooth as possible, which required a high frame rate. The lights are controlled by a Teensy 3.6 running at 256 MHz that drives all 22 signal lines in parallel through some 74HCT245s at the maximum clock rate for the NeoPixels. We can easily achieve a frame rate well over 200 Hz.
The code presented in this post is an illustration of how that works. Here we show a 32-channel example which, by the way, also runs at the maximum NeoPixel clock rate of ~1 MHz. Total output rate is ~1.3 million RGB pixels per second.
The serialization of the color bits is done in the TRANSPOSE macros using the BFI instruction for the ARM and a word is sent directly to the ports on the chip. The NeoPixel documentation has separate timing rules for sending a zero and a one. We just turn on the line, wait 200 ns, send out the data bit which either turns it off (a zero) or leaves it high (a one), wait ~400 ns and send out a zero which turns off the one or leaves it low, then wait ~400 ns for the next bit. The serialization fits in nicely during the wait for the next bit.
Serialization of 32 channels is accomplished in the ~400ns of interbit time. Our 22 channel version has NOPs to make the timing work out. Adding more channels beyond 32 would cost about 3-1/2 cycles each.
The NeoPixel strips do NOT have to be all the same length. Once a strip is full, any additional bits are shifted off the end, even if they come from overrunning your data arrays. So set the length to the longest strip and you will be fine.

Twilight image of the previous golf cart version.

Tricycle version in the shop.

Control box under the seats.

Control board with Teensy 3.6 and HCT245s, plus battery chargers and relays.
[video]http://www.cinematrix.com/BMan/2015/SplashNight.mov[/video]
Click to play.
The code presented in this post is an illustration of how that works. Here we show a 32-channel example which, by the way, also runs at the maximum NeoPixel clock rate of ~1 MHz. Total output rate is ~1.3 million RGB pixels per second.
The serialization of the color bits is done in the TRANSPOSE macros using the BFI instruction for the ARM and a word is sent directly to the ports on the chip. The NeoPixel documentation has separate timing rules for sending a zero and a one. We just turn on the line, wait 200 ns, send out the data bit which either turns it off (a zero) or leaves it high (a one), wait ~400 ns and send out a zero which turns off the one or leaves it low, then wait ~400 ns for the next bit. The serialization fits in nicely during the wait for the next bit.
Serialization of 32 channels is accomplished in the ~400ns of interbit time. Our 22 channel version has NOPs to make the timing work out. Adding more channels beyond 32 would cost about 3-1/2 cycles each.
The NeoPixel strips do NOT have to be all the same length. Once a strip is full, any additional bits are shifted off the end, even if they come from overrunning your data arrays. So set the length to the longest strip and you will be fine.
Code:
[FONT=Courier New]// Fast parallel NeoPixel driver, generalized to 32 channels, timing set for a 256MHz Teensy 3.6.
// Loren Carpenter, January 2020.
// Runs at ~1 microsecond per bit, or ~1.3 million pixels per second. You will need a NOP delay with fewer channels.
// The bit assignment is somewhat arbitrary and is easily reconfigured.
// Bit-transpose helpers.
//
// Each TRANSPOSE_x(R,S0..S7,M) takes eight per-channel colour values S0..S7, picks bit M
// of each one, and packs those eight bits into R at the bit positions of the GPIO port
// pins that drive the corresponding channels. R must be an int lvalue; S0..S7 are
// expected to hold byte values (0..255).
//
// NOTE: never put a comment after a line-continuation backslash. The backslash must be
// the very last character on the line, otherwise the #define ends right there and the
// rest of the macro body is compiled as stray file-scope code.

// TRANSPOSE_BIT(R,S,M,POS): copy bit M of S into bit POS of R, leaving all other bits of
// R untouched. POS must be a literal decimal bit number because it is pasted into the
// instruction text on ARM.
#if defined(__arm__)
// On ARM a single BFI (bit field insert) moves bit 0 of the shifted source into place.
#define TRANSPOSE_BIT(R,S,M,POS) \
{ int tbit_ = (S)>>(M); \
__asm__ __volatile__ ( "bfi %[d],%[s],#" #POS ",#1 \n\t" : [d] "+r" (R) : [s] "r" (tbit_) : ); }
#else
// Portable equivalent so the packing logic can be built and unit-tested off-target.
#define TRANSPOSE_BIT(R,S,M,POS) \
{ (R) = ((R) & ~(1 << (POS))) | ((((S)>>(M)) & 1) << (POS)); }
#endif

// Port B uses bits 0-3,16-19
#define TRANSPOSE_B(R,S0,S1,S2,S3,S4,S5,S6,S7,M) \
{ (R) = (S0)>>(M); \
TRANSPOSE_BIT(R,S1,M,1) \
TRANSPOSE_BIT(R,S2,M,2) \
TRANSPOSE_BIT(R,S3,M,3) \
TRANSPOSE_BIT(R,S4,M,16) \
TRANSPOSE_BIT(R,S5,M,17) \
TRANSPOSE_BIT(R,S6,M,18) \
TRANSPOSE_BIT(R,S7,M,19) \
(R) &= 0xF000F; } // don't set unused pins

// Ports C and D use bits 0-7.
// No final mask here: with byte-sized inputs the leftover high bits of S0>>M all land in
// bits 1-7, which the inserts below overwrite.
#define TRANSPOSE_8(R,S0,S1,S2,S3,S4,S5,S6,S7,M) \
{ (R) = (S0)>>(M); \
TRANSPOSE_BIT(R,S1,M,1) \
TRANSPOSE_BIT(R,S2,M,2) \
TRANSPOSE_BIT(R,S3,M,3) \
TRANSPOSE_BIT(R,S4,M,4) \
TRANSPOSE_BIT(R,S5,M,5) \
TRANSPOSE_BIT(R,S6,M,6) \
TRANSPOSE_BIT(R,S7,M,7) }
#define TRANSPOSE_C(R,S0,S1,S2,S3,S4,S5,S6,S7,M) TRANSPOSE_8(R,S0,S1,S2,S3,S4,S5,S6,S7,M)
#define TRANSPOSE_D(R,S0,S1,S2,S3,S4,S5,S6,S7,M) TRANSPOSE_8(R,S0,S1,S2,S3,S4,S5,S6,S7,M)

// Port E uses bits 0-5,10,11
#define TRANSPOSE_E(R,S0,S1,S2,S3,S4,S5,S6,S7,M) \
{ (R) = (S0)>>(M); \
TRANSPOSE_BIT(R,S1,M,1) \
TRANSPOSE_BIT(R,S2,M,2) \
TRANSPOSE_BIT(R,S3,M,3) \
TRANSPOSE_BIT(R,S4,M,4) \
TRANSPOSE_BIT(R,S5,M,5) \
TRANSPOSE_BIT(R,S6,M,10) \
TRANSPOSE_BIT(R,S7,M,11) \
(R) &= 0xC3F; } // don't set unused pins
// Busy-wait building blocks used to space out the GPIO writes in sendAll().
// NOP emits a single ARM "mov r0,r0" (a register moved onto itself, so no state changes);
// the asm is volatile so the compiler keeps every copy.
#define NOP __asm__ __volatile__ (" mov r0,r0 \n\t" : : )
// Ten, thirty and forty back-to-back NOPs. Each expands to one brace-enclosed compound
// statement, so it can be followed by a ';' like an ordinary statement.
#define NOP10 {NOP;NOP;NOP;NOP;NOP;NOP;NOP;NOP;NOP;NOP;}
#define NOP30 {NOP10;NOP10;NOP10;}
#define NOP40 {NOP10;NOP10;NOP10;NOP10;}
// Number of LEDs clocked out per channel, and the matching number of colour bytes
// (three per LED). sendAll() sends NRGB bytes on every channel.
#define NLED 1000
#define NRGB (NLED*3)
// Some colors to send, in NeoPixel GRB order
// One NRGB-byte buffer per output channel, 32 channels in total. sendAll() reads them in
// groups of eight: s1..s8 feed port B, s9..s16 port C, s17..s24 port D, s25..s32 port E.
unsigned char s1[NRGB], s2[NRGB], s3[NRGB], s4[NRGB], s5[NRGB], s6[NRGB], s7[NRGB], s8[NRGB];
unsigned char s9[NRGB], s10[NRGB], s11[NRGB], s12[NRGB], s13[NRGB], s14[NRGB], s15[NRGB], s16[NRGB];
unsigned char s17[NRGB], s18[NRGB], s19[NRGB], s20[NRGB], s21[NRGB], s22[NRGB], s23[NRGB], s24[NRGB];
unsigned char s25[NRGB], s26[NRGB], s27[NRGB], s28[NRGB], s29[NRGB], s30[NRGB], s31[NRGB], s32[NRGB];
//////////////////////////////////////////////////////////////////////////////////////////////////////
void sendAll() {
int b0, b1, b2, b3, b16, b17, b18, b19;
int c0, c1, c2, c3, c4, c5, c6, c7;
int d0, d1, d2, d3, d4, d5, d6, d7;
int e0, e1, e2, e3, e4, e5, e10, e11;
int tb, tc, td, te;
int i, j;
// Make sure all used pins are configured for output.
GPIOB_PDOR = 0;
GPIOC_PDOR = 0;
GPIOD_PDOR = 0;
GPIOE_PDOR = 0;
delayMicroseconds(30);
for (i = 0; i < NRGB; i++) {
b0 = s1[i];
b1 = s2[i];
b2 = s3[i];
b3 = s4[i];
b16 = s5[i];
b17 = s6[i];
b17 = s7[i];
b18 = s8[i];
c0 = s9[i];
c1 = s10[i];
c2 = s11[i];
c3 = s12[i];
c4 = s13[i];
c5 = s14[i];
c6 = s15[i];
c7 = s16[i];
d0 = s17[i];
d1 = s18[i];
d2 = s19[i];
d3 = s20[i];
d4 = s21[i];
d5 = s22[i];
d6 = s23[i];
d7 = s24[i];
e0 = s25[i];
e1 = s26[i];
e2 = s27[i];
e3 = s28[i];
e4 = s29[i];
e5 = s30[i];
e10 = s31[i];
e11 = s32[i];
for (j = 0; j < 8; j++) {
TRANSPOSE_B (tb,b0,b1,b2,b3,b16,b17,b18,b19,j);
TRANSPOSE_C (tc,c0,c1,c2,c3,c4,c5,c6,c7,j);
TRANSPOSE_D (td,d0,d1,d2,d3,d4,d5,d6,d7,j);
TRANSPOSE_E (te,e0,e1,e2,e3,e4,e5,e10,e11,j);
GPIOB_PDOR = 0xF000F;
GPIOC_PDOR = 0xFF;
GPIOD_PDOR = 0xFF;
GPIOE_PDOR = 0xC0F;
NOP40;
GPIOB_PDOR = tb;
GPIOC_PDOR = tc;
GPIOD_PDOR = td;
GPIOE_PDOR = te;
NOP40; NOP40;
GPIOB_PDOR = 0;
GPIOC_PDOR = 0;
GPIOD_PDOR = 0;
GPIOE_PDOR = 0;
// add delay here if you use fewer than 32 channels (~3-4 NOPs per channel removed)
}
}[/FONT]

Twilight image of the previous golf cart version.

Tricycle version in the shop.

Control box under the seats.

Control board with Teensy 3.6 and HCT245s, plus battery chargers and relays.
[video]http://www.cinematrix.com/BMan/2015/SplashNight.mov[/video]
Click to play.