I implemented my own ILI9341 lib using Kurt's lib as a reference. It's further optimized, and the DMA transfer supports partial tiled updates, so only updated regions are transferred over SPI. It also doesn't require holding the entire frame buffer in memory, so it could run at higher resolutions without memory issues.
// render mesh
for(uint16_t seg_idx=0; seg_idx<s_mesh.num_segments(); ++seg_idx)
{
test_shader sh;
sh.m_mesh=&s_mesh;
sh.m_seg=&s_mesh.segment(seg_idx);
sh.m_o2c=o2c; // object->camera matrix
sh.m_o2p=o2p; // object->projection matrix
s_gfx_dev.dispatch_shader(sh);
}
s_gfx_dev.commit(); // kick off tile rendering