I just measured the speed of this snippet with a logic analyzer (LA) on pin 11 and got about 250 ns on a T4 @ 600 MHz.

Code:

// Drive pin 11 high immediately before the reads; it is driven low again
// right after, so the high time on pin 11 brackets the whole read sequence.
digitalWriteFast(11, HIGH);

// Build one 16-bit word in a single expression: every pin reading is shifted
// left by its bit index and the results are OR-ed together.
// The parentheses are purely for readability ('<<' already binds tighter than '|').
// NOTE(review): pin 7 is read twice (bit 4 and bit 12) - confirm that is intended.
volatile uint16_t value =
      (digitalReadFast(2)  << 0)
    | (digitalReadFast(3)  << 1)
    | (digitalReadFast(5)  << 2)
    | (digitalReadFast(1)  << 3)
    | (digitalReadFast(7)  << 4)
    | (digitalReadFast(12) << 5)
    | (digitalReadFast(15) << 6)
    | (digitalReadFast(14) << 7)
    | (digitalReadFast(4)  << 8)
    | (digitalReadFast(6)  << 9)
    | (digitalReadFast(9)  << 10)
    | (digitalReadFast(8)  << 11)
    | (digitalReadFast(7)  << 12)
    | (digitalReadFast(10) << 13)
    | (digitalReadFast(16) << 14)
    | (digitalReadFast(17) << 15);

digitalWriteFast(11, LOW);