Reading multiple GPIO pins on the Teensy 4.0 "atomically"

Status
Not open for further replies.
Changed time calc to ARM_DWT_CYCCNT - and did it without the add math:
Code:
3 :: 32
2 :: C0
1 :: C0
A :: C0
B :: 32

t3 result = 20002.035000 32
t2 result = 66674.343333 C0
t1 result = 55006.448333 C0
tA result = 55006.441667 C0
tB result = 23336.541667 32

Collected the prior versions above and the latest - not all return the same value - but pins here are floating.

Code:
// 32-bit volatile words at fixed addresses, one per GPIO port used below.
#define IMXRT_GPIO6_DIRECT  (*(volatile uint32_t *)0x42000000)
#define IMXRT_GPIO9_DIRECT  (*(volatile uint32_t *)0x4200C000)
#define IMXRT_GPIO7_DIRECT  (*(volatile uint32_t *)0x42004000)

// Each IO_BLOCK_x masks one run of bits out of a port word and shifts it
// down to its slot in the packed 16-bit result:
//   A: GPIO7 bits  0-2  -> result bits  0-2
//   B: GPIO9 bits  4-6  -> result bits  3-5
//   C: GPIO6 bits 16-19 -> result bits  6-9
//   D: GPIO6 bits 22-27 -> result bits 10-15
// Every expansion performs its own volatile read of its port.
//
// Earlier shift-only form, kept for reference (needs two more shifts per block):
//#define IO_BLOCK_A ((IMXRT_GPIO7_DIRECT << 29) >> 29)
//#define IO_BLOCK_B (((IMXRT_GPIO9_DIRECT >> 4) << 29) >> 26)
//#define IO_BLOCK_C (((IMXRT_GPIO6_DIRECT >> 16) << 28) >> 22)
//#define IO_BLOCK_D (((IMXRT_GPIO6_DIRECT >> 22) << 26) >> 16)

#define IO_BLOCK_A (IMXRT_GPIO7_DIRECT & 0x00000007)
#define IO_BLOCK_B ((IMXRT_GPIO9_DIRECT & 0x00000070) >> 1)
#define IO_BLOCK_C ((IMXRT_GPIO6_DIRECT & 0x000F0000) >> 10)
#define IO_BLOCK_D ((IMXRT_GPIO6_DIRECT & 0x0FC00000) >> 12)

void setup() {
	// Put all 16 pins into input mode, in the same order as before.
	static const uint8_t kInputPins[] = {
		2, 3, 4, 10, 11, 12, 17, 16,
		22, 23, 20, 21, 19, 18, 14, 15
	};
	for (const uint8_t pin : kInputPins) {
		pinMode(pin, INPUT);
	}

	Serial.begin(115200);
	// Spin until Serial reports ready, or until millis() reaches 4000.
	while (!Serial && millis() < 4000) {
	}
	// Banner: source file name plus build date and time.
	Serial.println("\n" __FILE__ " " __DATE__ " " __TIME__);
}

void loop() {
	// Take one reading from each variant and print it in hex, one per line.
	const uint32_t r3 = test3();
	Serial.printf( "3 :: %X\n", r3 );

	const uint32_t r2 = test2();
	Serial.printf( "2 :: %X\n", r2 );

	const uint32_t r1 = test1();
	Serial.printf( "1 :: %X\n", r1 );

	const uint32_t rA = testA();
	Serial.printf( "A :: %X\n", rA );

	const uint32_t rB = testB();
	Serial.printf( "B :: %X\n", rB );

	Serial.printf( "\n" );

	// Pause, then run the timing pass.
	delay(2000);
	loopNot();
}

inline uint32_t testA() {
	// OR the four masked/shifted pin groups together, keep only the low
	// 16 bits, then widen back to the 32-bit return type.
	return static_cast<uint16_t>(IO_BLOCK_A | IO_BLOCK_B | IO_BLOCK_C | IO_BLOCK_D);
}

// Packs GPIO6 bits 2-3, 12-13, 16-19, 22-27 and 30-31 into one 16-bit value
// from a single register read, using two ARM BFI (bit-field insert)
// instructions to move the two outlying 2-bit fields next to the others.
// Result bits, LSB first: src 12-13, src 2-3, src 16-19, src 30-31, src 22-27.
inline uint32_t testB() {
	// After this shift, source bit k sits at bit k-2.
	register uint32_t data  = IMXRT_GPIO6_DIRECT >> 2;
	// Copy bits 0-1 (source 2-3) into bits 12-13, directly above source 12-13 (now at 10-11).
	asm volatile("bfi %0, %1, 12, 2" : "+r"(data) : "r"(data));
	// Copy source 30-31 (data >> 28 puts them at bits 0-1) into bits 18-19,
	// the gap between source 16-19 (now 14-17) and source 22-27 (now 20-25).
	asm volatile("bfi %0, %1, 18, 2" : "+r"(data) : "r"(data >> 28));
	// The 16 wanted bits now occupy bits 10-25; shift them down and mask.
	return (data >> 10) & 0xffff;
}

inline uint32_t test1() {
	// Merge the four pre-shifted pin groups into a single word.
	const uint32_t packed = IO_BLOCK_A | IO_BLOCK_B | IO_BLOCK_C | IO_BLOCK_D;
	return packed;
}

// Builds a 16-bit value from GPIO7 bits 0-2, GPIO9 bits 4-6 and GPIO6 bits
// 16-19 and 22-27, placing each field with an ARM BFI (bit-field insert).
// NOTE(review): IMXRT_GPIO6_DIRECT appears in two statements, so GPIO6 is
// read twice; the two GPIO6 fields are not taken from the same register read.
inline uint32_t test2() {
	// Result bits 0-2: GPIO7 bits 0-2.
	uint32_t data = IMXRT_GPIO7_DIRECT & 0b00000000000000000000000000000111;
	// Result bits 3-5: GPIO9 bits 4-6.
	asm volatile("bfi %0, %1, 3, 3" : "+r"(data) : "r"(IMXRT_GPIO9_DIRECT >> 4));
	// Result bits 6-9: GPIO6 bits 16-19.
	asm volatile("bfi %0, %1, 6, 4" : "+r"(data) : "r"(IMXRT_GPIO6_DIRECT >> 16));
	// Result bits 10-15: GPIO6 bits 22-27.
	asm volatile("bfi %0, %1, 10, 6" : "+r"(data) : "r"(IMXRT_GPIO6_DIRECT >> 22));
	return data;
}

// Same definition as the one near the top of this listing (identical redefinition).
#define IMXRT_GPIO6_DIRECT  (*(volatile uint32_t *)0x42000000)

// Packs GPIO6 bits 2-3, 12-13, 16-19, 22-27 and 30-31 into 16 consecutive
// bits from one register read, using two ARM BFI (bit-field insert) instructions.
// Result bits, LSB first: src 12-13, src 2-3, src 16-19, src 30-31, src 22-27.
inline uint32_t test3()
{
	// Single read of the port; the other two values are derived from it.
	register uint32_t data  = IMXRT_GPIO6_DIRECT;
	// Source bits 30-31 moved down to bits 0-1.
	register uint32_t data2  = data >> 30;
	// Source bits 2-3 moved down to bits 0-1.
	register uint32_t data3  = data >> 2;
	// Fill the gap at bits 20-21 (between 16-19 and 22-27) with source 30-31.
	asm volatile("bfi %0, %1, 20, 2" : "+r"(data) : "r"(data2));
	// Fill bits 14-15 (just above 12-13) with source 2-3.
	asm volatile("bfi %0, %1, 14, 2" : "+r"(data) : "r"(data3));
	// The 16 wanted bits now occupy bits 12-27; shift them down and mask.
	return (data >> 12) & 0xffff;
}

// Times 1,000,000 back-to-back calls of each pin-read variant using the
// ARM_DWT_CYCCNT cycle counter. For each variant it prints the elapsed cycles
// scaled by 1000000.0 / F_CPU_ACTUAL (microseconds for the whole run if
// F_CPU_ACTUAL is in Hz) followed by one sample reading in hex.
//
// The elapsed cycle count is latched into a local right after each loop.
// The order in which function arguments are evaluated is unspecified, so
// reading the counter inside the printf call could also time the extra
// sample call made in the same argument list.
void loopNot()
{
	long unsigned stime;
	long unsigned cycles;

	stime = ARM_DWT_CYCCNT;
	for (int i = 0; i < 1000000; ++i ) {
		test3();
	}
	cycles = ARM_DWT_CYCCNT - stime;
	Serial.printf("t3 result = %f %X\n", 1000000.0 * cycles / F_CPU_ACTUAL, test3());

	stime = ARM_DWT_CYCCNT;
	for (int i = 0; i < 1000000; ++i ) {
		test2();
	}
	cycles = ARM_DWT_CYCCNT - stime;
	Serial.printf("t2 result = %f %X\n", 1000000.0 * cycles / F_CPU_ACTUAL, test2());

	stime = ARM_DWT_CYCCNT;
	for (int i = 0; i < 1000000; ++i ) {
		test1();
	}
	cycles = ARM_DWT_CYCCNT - stime;
	Serial.printf("t1 result = %f %X\n", 1000000.0 * cycles / F_CPU_ACTUAL, test1());

	stime = ARM_DWT_CYCCNT;
	for (int i = 0; i < 1000000; ++i ) {
		testA();
	}
	cycles = ARM_DWT_CYCCNT - stime;
	Serial.printf("tA result = %f %X\n", 1000000.0 * cycles / F_CPU_ACTUAL, testA());

	stime = ARM_DWT_CYCCNT;
	for (int i = 0; i < 1000000; ++i ) {
		testB();
	}
	cycles = ARM_DWT_CYCCNT - stime;
	Serial.printf("tB result = %f %X\n", 1000000.0 * cycles / F_CPU_ACTUAL, testB());

	// Hold off before returning to the caller.
	delay(10000);
}
 
As written (for all 16 bits from GPIO6), I measure < 25 ns with a for loop around it, at 600 MHz.

Your code seems to read only 4 pins, and some of them are invalid. Also, GPIO6 only gives us 10 of the pins we need.

I have been modeling a solution with the following pins.
Code:
[   GPIO7   |  GPIO9  |     GPIO6      |         GPIO6         ]
[10, 12, 11 | 2, 3, 4 | 19, 18, 14, 15 | 17, 16, 22, 23, 20, 21]

Here is your code modified to read all 16 of those pins. This uses the SHIFT,BFI method. (avg 61ns)
Code:
// SHIFT + BFI variant: assembles a 16-bit value from three port registers,
// placing each field with an ARM BFI (bit-field insert) instruction.
// GPIO6 is read once and both of its fields are taken from that one value.
__attribute__((always_inline)) static inline uint32_t test3() {
  // result bits 0-2: GPIO7 bits 0-2
  register uint32_t data = GPIO7_DR & 0b00000000000000000000000000000111;
  // result bits 3-5: GPIO9 bits 4-6
  register uint32_t data_gpio9 = GPIO9_DR >> 4;
  asm volatile("bfi %0, %1, 3, 3" : "+r"(data) : "r"(data_gpio9));
  // result bits 6-15: GPIO6 bits 16-19 (part1) and 22-27 (part2)
  register uint32_t data_gpio6  = GPIO6_DR;
  register uint32_t data_gpio6_part1  = data_gpio6 >> 16;
  register uint32_t data_gpio6_part2  = data_gpio6 >> 22;
  asm volatile("bfi %0, %1, 6, 4" : "+r"(data) : "r"(data_gpio6_part1));
  asm volatile("bfi %0, %1, 10, 6" : "+r"(data) : "r"(data_gpio6_part2));
  return data;
}

Here is my new MASK,SHIFT,OR method code using a shared GPIO6 register. (avg 49ns)
Code:
__attribute__((always_inline)) static inline uint16_t test5() {
  // One read per port, in the order GPIO7, GPIO9, GPIO6; the single GPIO6
  // value supplies both of its pin groups.
  const uint32_t port7 = GPIO7_DR;
  const uint32_t port9 = GPIO9_DR;
  const uint32_t port6 = GPIO6_DR;

  return static_cast<uint16_t>(
      (port7 & 0x00000007)              // GPIO7 bits  0-2  -> bits  0-2
    | ((port9 & 0x00000070) >> 1)       // GPIO9 bits  4-6  -> bits  3-5
    | ((port6 & 0x000F0000) >> 10)      // GPIO6 bits 16-19 -> bits  6-9
    | ((port6 & 0x0FC00000) >> 12));    // GPIO6 bits 22-27 -> bits 10-15
}
(UPDATE made change from #28)

I don't exactly follow what you're doing. Could you clear up what your code is doing in #21 and how it works?
 
Last edited:
I think I understand your solution now...

You are using all pins in GPIO9 to get 16 bits

Code:
// Bit index assigned to each pin; blank lines separate runs of consecutive bits.
// Run at bits 12-13.
#define CORE_PIN24_BIT		12
#define CORE_PIN25_BIT		13

// Run at bits 16-19.
#define CORE_PIN19_BIT		16
#define CORE_PIN18_BIT		17
#define CORE_PIN14_BIT		18
#define CORE_PIN15_BIT		19

// Run at bits 22-27.
#define CORE_PIN17_BIT		22
#define CORE_PIN16_BIT		23
#define CORE_PIN22_BIT		24
#define CORE_PIN23_BIT		25
#define CORE_PIN20_BIT		26
#define CORE_PIN21_BIT		27

Very clever!
 
@trevor403 - test5() should have this as first assignment { not |= }: data = GPIO7_DR & 0b00000000000000000000000000000111;
 
My code is based on response #3 and the following (all from GPIO6):

// Bit index assigned to each of the 16 pins; blank lines separate the five
// runs of consecutive bits (2-3, 12-13, 16-19, 22-27, 30-31).
#define CORE_PIN1_BIT 2
#define CORE_PIN0_BIT 3

#define CORE_PIN24_BIT 12
#define CORE_PIN25_BIT 13

#define CORE_PIN19_BIT 16
#define CORE_PIN18_BIT 17
#define CORE_PIN14_BIT 18
#define CORE_PIN15_BIT 19

#define CORE_PIN17_BIT 22
#define CORE_PIN16_BIT 23
#define CORE_PIN22_BIT 24
#define CORE_PIN23_BIT 25
#define CORE_PIN20_BIT 26
#define CORE_PIN21_BIT 27

#define CORE_PIN26_BIT 30
#define CORE_PIN27_BIT 31

One would need to be careful in wiring it up to get the right order.
 
As a good reminder that it seldom makes sense to try to outsmart the compiler, here is the fastest version yet (15 ns). Also note how clear and simple it is.
Code:
inline uint32_t test3()
{
  // One snapshot of the port; every field below is cut from this single read.
  const uint32_t port = IMXRT_GPIO6_DIRECT;

  // Slide each run of wanted bits down so the five runs sit end to end.
  return ((port & 0x0000000C) >> 2)     // bits  2-3  -> bits  0-1
       | ((port & 0x00003000) >> 10)    // bits 12-13 -> bits  2-3
       | ((port & 0x000F0000) >> 12)    // bits 16-19 -> bits  4-7
       | ((port & 0x0FC00000) >> 14)    // bits 22-27 -> bits  8-13
       | ((port & 0xC0000000) >> 16);   // bits 30-31 -> bits 14-15
}
 
I can confirm the speed as tested with "t3" below - but not the data integrity as I have floating pins for GPIO_6 { #define IMXRT_GPIO6_DIRECT (*(volatile uint32_t *)0x42000000) }

Indeed the compiler is good with "C" and also maybe knows something about doing it to feed the twin CPU pipes.

And the "C" code is easier to verify and edit than whatever that 'asm' stuff was doing :)

Code:
t3 result = 15001.528333 138
t2 result = 66674.351667 4C0
t1 result = 55006.456667 4C0
t4 result = 23336.545000 432
t5 result = 41671.716667 1000

from:
Code:
	// Cycle-counter value captured just before the timed loop.
	long unsigned stime;

	stime = ARM_DWT_CYCCNT;
	// 1,000,000 calls; the return value is discarded.
	for (register int i = 0; i < 1000000; ++i ) {
		test3();
	}
	// Prints elapsed cycles scaled by 1000000.0 / F_CPU_ACTUAL, then one sample reading in hex.
	// NOTE(review): the counter is read inside the argument list next to an extra
	// test3() call; argument evaluation order is unspecified, so that call may be timed too.
	Serial.printf("t3 result = %f %X\n", 1000000.0 * (ARM_DWT_CYCCNT - stime) / F_CPU_ACTUAL, test3());
 
defragster: what do you get if you make some use of the return values (as I did earlier with total +=)? I'm suspicious that the compiler is optimizing away required code and that 15ns isn't real.
 
Code:
t3 result = 30003.013333 8448BE00

with:
Code:
	// Cycle-counter value captured just before the timed loop.
	long unsigned stime;
	// Running total of every reading, so each return value is actually used.
	register uint32_t summ=0;

	stime = ARM_DWT_CYCCNT;
	for (register int i = 0; i < 1000000; ++i ) {
		summ+=test3();
	}
	// Prints elapsed cycles scaled by 1000000.0 / F_CPU_ACTUAL, then the accumulated total in hex.
	Serial.printf("t3 result = %f %X\n", 1000000.0 * (ARM_DWT_CYCCNT - stime) / F_CPU_ACTUAL, summ);
 
Re post #34 - test3() was not optimized out, AFAIK. The added 'summ' was optimized out until I printed it. One earlier variation { test4()? now showing 23K? } went from 20000 to 25000 when the print was added, so the doubling above from 15K to 30K is an odd result by comparison. It must break the pipeline doubling, or something equally extreme.

Here it is for those two with and without summ - test4() now hits 30K ( 30 ns ):
Code:
t3 result = 15001.528333 C138
summ>t3 result = 31544.241667 8448BE00
t2 result = 66674.358333 4C0
t1 result = 55006.453333 4C0
t4 result = 23336.525000 732
summ>t4 result = 30003.953333 F2136E80
t5 result = 41671.765000 1000

where test4() is:
Code:
// Packs GPIO6 bits 2-3, 12-13, 16-19, 22-27 and 30-31 into one 16-bit value
// from a single register read, using two ARM BFI (bit-field insert) instructions.
// Result bits, LSB first: src 12-13, src 2-3, src 16-19, src 30-31, src 22-27.
inline uint32_t test4() {
	// After this shift, source bit k sits at bit k-2.
	register uint32_t data  = IMXRT_GPIO6_DIRECT >> 2;
	// Copy bits 0-1 (source 2-3) into bits 12-13, directly above source 12-13 (now at 10-11).
	asm volatile("bfi %0, %1, 12, 2" : "+r"(data) : "r"(data));
	// Copy source 30-31 (data >> 28 puts them at bits 0-1) into bits 18-19,
	// the gap between source 16-19 (now 14-17) and source 22-27 (now 20-25).
	asm volatile("bfi %0, %1, 18, 2" : "+r"(data) : "r"(data >> 28));
	// The 16 wanted bits now occupy bits 10-25; shift them down and mask.
	return (data >> 10) & 0xffff;
}

Sometimes you've got to fight the compiler, and sometimes it works with you ;)

Indeed, I did some simple math for the micros() calculation from CYCCNT on the T_4; Paul swapped in some asm for the same calculation, plus more code with a conditional limit, and it runs 2 cycles faster.
 
For the record, here is what tests out fastest for me when I make use of the return value.
Code:
// 32-bit volatile word at a fixed address, read as the GPIO6 input value.
#define IMXRT_GPIO6_DIRECT  (*(volatile uint32_t *)0x42000000)

// Rearranges 16 GPIO6 pin inputs into 16 consecutive bits by moving two
// 2-bit fields from the lower half-word into the gaps in the upper half-word,
// then returning the upper half-word.
// Result bits, LSB first: src 16-19, src 2-3, src 22-27, src 12-13, src 30-31.

inline uint16_t test5()
{
  // Single read of the port; the other two values are derived from it.
  register uint32_t data  = IMXRT_GPIO6_DIRECT;
  // Source bits 2-3 moved down to bits 0-1.
  register uint32_t data2  = data >> 2;
  // Source bits 12-13 moved down to bits 0-1.
  register uint32_t data3  = data >> 12;
  // Fill the gap at bits 20-21 with source 2-3.
  asm volatile("bfi %0, %1, 20, 2" : "+r"(data) : "r"(data2));
  // Fill the gap at bits 28-29 with source 12-13.
  asm volatile("bfi %0, %1, 28, 2" : "+r"(data) : "r"(data3));
  // Bits 16-31 are now fully populated; return them as the 16-bit result.
  return (data >> 16);
}
 
Running p#37's test5() as test6() in the skeleton above, I see the following; with the result in use, this is the best number so far (test3() is the version from p#31):
Code:
t6 result = 20002.025000 E043
summ>t6 result = 23336.526667 5DF656C0
t3 result = 15002.495000 C138
summ>t3 result = 30003.955000 E23F14C0
 
Status
Not open for further replies.
Back
Top