Forum Rule: Always post complete source code & details to reproduce any issue!
Page 2 of 2 FirstFirst 1 2
Results 26 to 38 of 38

Thread: Reading multiple GPIO pins on the Teensy 4.0 "atomically"

  1. #26
    Senior Member+ defragster's Avatar
    Join Date
    Feb 2015
    Posts
    10,950
    Changed time calc to ARM_DWT_CYCCNT - and did it without the add math:
    Code:
    3 :: 32
    2 :: C0
    1 :: C0
    A :: C0
    B :: 32
    
    t3 result = 20002.035000 32
    t2 result = 66674.343333 C0
    t1 result = 55006.448333 C0
    tA result = 55006.441667 C0
    tB result = 23336.541667 32
    Collected the prior versions above and the latest - not all return the same value - but pins here are floating.

    Code:
    // Raw 32-bit views of three GPIO port registers. Each macro dereferences a
    // volatile pointer to a fixed address, so every use performs its own load.
    #define IMXRT_GPIO6_DIRECT  (*(volatile uint32_t *)0x42000000)
    #define IMXRT_GPIO9_DIRECT  (*(volatile uint32_t *)0x4200C000)
    #define IMXRT_GPIO7_DIRECT  (*(volatile uint32_t *)0x42004000)
    
    // this can be done with a mask and 2 less shifts
    //#define IO_BLOCK_A ((IMXRT_GPIO7_DIRECT << 29) >> 29)
    //#define IO_BLOCK_B (((IMXRT_GPIO9_DIRECT >> 4) << 29) >> 26)
    //#define IO_BLOCK_C (((IMXRT_GPIO6_DIRECT >> 16) << 28) >> 22)
    //#define IO_BLOCK_D (((IMXRT_GPIO6_DIRECT >> 22) << 26) >> 16)
    
    // Each IO_BLOCK_x masks one group of port bits and shifts it into its slot
    // of the packed 16-bit value:
    //   A: GPIO7 bits 0-2   -> result bits 0-2
    //   B: GPIO9 bits 4-6   -> result bits 3-5
    //   C: GPIO6 bits 16-19 -> result bits 6-9
    //   D: GPIO6 bits 22-27 -> result bits 10-15
    // C and D each expand to a separate read of IMXRT_GPIO6_DIRECT.
    #define IO_BLOCK_A (IMXRT_GPIO7_DIRECT & 0b00000000000000000000000000000111)
    #define IO_BLOCK_B ((IMXRT_GPIO9_DIRECT & 0b00000000000000000000000001110000) >> 1)
    #define IO_BLOCK_C ((IMXRT_GPIO6_DIRECT & 0b00000000000011110000000000000000) >> 10)
    #define IO_BLOCK_D ((IMXRT_GPIO6_DIRECT & 0b00001111110000000000000000000000) >> 12)
    
    // One-time initialisation: configure the 16 sampled pins as inputs, start
    // the serial port and print a banner with the file name and build date/time.
    void setup() {
    	// Same pins, in the same order, as a one-call-per-pin list would use.
    	static const uint8_t kInputPins[] = {
    		2, 3, 4, 10, 11, 12, 17, 16, 22, 23, 20, 21, 19, 18, 14, 15
    	};
    	for (uint8_t pin : kInputPins) {
    		pinMode(pin, INPUT);
    	}
    
    	Serial.begin(115200);
    	// Spin until Serial evaluates true or millis() reaches 4000.
    	while (!Serial && millis() < 4000) {
    	}
    	Serial.println("\n" __FILE__ " " __DATE__ " " __TIME__);
    }
    
    // Main loop: take one sample from each reader and print it in hex, wait
    // two seconds, then run the timing pass in loopNot().
    void loop() {
    	const uint32_t sample3 = test3();
    	Serial.printf( "3 :: %X\n", sample3 );
    
    	const uint32_t sample2 = test2();
    	Serial.printf( "2 :: %X\n", sample2 );
    
    	const uint32_t sample1 = test1();
    	Serial.printf( "1 :: %X\n", sample1 );
    
    	const uint32_t sampleA = testA();
    	Serial.printf( "A :: %X\n", sampleA );
    
    	const uint32_t sampleB = testB();
    	Serial.printf( "B :: %X\n", sampleB );
    
    	Serial.printf( "\n" );
    	delay(2000);
    	loopNot();
    }
    
    // Mask/shift/OR reader: combines the four IO_BLOCK_* groups into a 16-bit
    // value and returns it widened to 32 bits. IO_BLOCK_C and IO_BLOCK_D each
    // perform their own read of GPIO6.
    inline uint32_t testA() {
    	uint16_t packed = IO_BLOCK_A;
    	packed |= IO_BLOCK_B;
    	packed |= IO_BLOCK_C;
    	packed |= IO_BLOCK_D;
    	return packed;
    }
    
    // Single-read reader: loads GPIO6 once (pre-shifted right by 2) and uses two
    // BFI (bit-field insert) instructions to fill the two 2-bit gaps, so that 16
    // input bits become contiguous. Returns them in the low 16 bits.
    inline uint32_t testB() {
    	// After the shift, port bit N sits at data bit N-2.
    	register uint32_t data  = IMXRT_GPIO6_DIRECT >> 2;
    	// Copy data[1:0] (port bits 2-3) into data[13:12] (the slot of port bits 14-15).
    	asm volatile("bfi %0, %1, 12, 2" : "+r"(data) : "r"(data));
    	// Copy the low two bits of data >> 28 (port bits 30-31) into data[19:18]
    	// (the slot of port bits 20-21).
    	asm volatile("bfi %0, %1, 18, 2" : "+r"(data) : "r"(data >> 28));
    	// data[25:10] now holds the 16 gathered bits.
    	return (data >> 10) & 0xffff;
    }
    
    // Mask/shift/OR reader: ORs the four IO_BLOCK_* groups together and returns
    // the combined value without narrowing it.
    inline uint32_t test1() {
    	const uint32_t packed = IO_BLOCK_A | IO_BLOCK_B | IO_BLOCK_C | IO_BLOCK_D;
    	return packed;
    }
    
    // Shift/BFI reader: starts from GPIO7 bits 0-2 and inserts three more fields
    // with BFI. GPIO6 is read twice, once for each of its two fields.
    inline uint32_t test2() {
    	uint32_t packed = IMXRT_GPIO7_DIRECT & 0x7;
    
    	// GPIO9 bits 4-6 -> result bits 3-5
    	const uint32_t gpio9Field = IMXRT_GPIO9_DIRECT >> 4;
    	asm volatile("bfi %[dst], %[src], 3, 3" : [dst] "+r"(packed) : [src] "r"(gpio9Field));
    
    	// GPIO6 bits 16-19 -> result bits 6-9
    	const uint32_t gpio6FieldLo = IMXRT_GPIO6_DIRECT >> 16;
    	asm volatile("bfi %[dst], %[src], 6, 4" : [dst] "+r"(packed) : [src] "r"(gpio6FieldLo));
    
    	// GPIO6 bits 22-27 -> result bits 10-15
    	const uint32_t gpio6FieldHi = IMXRT_GPIO6_DIRECT >> 22;
    	asm volatile("bfi %[dst], %[src], 10, 6" : [dst] "+r"(packed) : [src] "r"(gpio6FieldHi));
    
    	return packed;
    }
    
    #define IMXRT_GPIO6_DIRECT  (*(volatile uint32_t *)0x42000000)
    
    // Single-read reader: loads GPIO6 once, uses two BFI instructions to drop
    // port bits 30-31 and 2-3 into the gaps at bits 20-21 and 14-15, then
    // returns bits 12-27 as a 16-bit value.
    inline uint32_t test3()
    {
    	const uint32_t port = IMXRT_GPIO6_DIRECT;
    	uint32_t packed = port;
    	// port bits 30-31 -> packed bits 20-21
    	asm volatile("bfi %[dst], %[src], 20, 2" : [dst] "+r"(packed) : [src] "r"(port >> 30));
    	// port bits 2-3 -> packed bits 14-15
    	asm volatile("bfi %[dst], %[src], 14, 2" : [dst] "+r"(packed) : [src] "r"(port >> 2));
    	return (packed >> 12) & 0xffff;
    }
    
    // Timing pass: for each reader, time 1,000,000 back-to-back calls with the
    // DWT cycle counter and print the elapsed time plus one fresh sample.
    // The printed figure is the total in microseconds for the million calls,
    // so 20002.0 corresponds to roughly 20 ns per call.
    void loopNot()
    {
    	// Runs one benchmark.
    	//   fmt      - printf format for the result line (one %f, then one %X)
    	//   readPins - callable returning the packed pin value
    	auto bench = [](const char *fmt, auto readPins) {
    		uint32_t acc = 0;
    		const uint32_t start = ARM_DWT_CYCCNT;
    		for (int i = 0; i < 1000000; ++i) {
    			// Consume every result so the compiler cannot discard the
    			// shifting/masking work and time only the register load.
    			acc += readPins();
    		}
    		// Latch the cycle count before anything else runs. Inside the
    		// printf argument list its order relative to the extra readPins()
    		// call would be unspecified.
    		const uint32_t cycles = ARM_DWT_CYCCNT - start;
    
    		// Storing to a volatile keeps the accumulated value observable.
    		volatile uint32_t sink = acc;
    		(void)sink;
    
    		Serial.printf(fmt, 1000000.0 * cycles / F_CPU_ACTUAL, readPins());
    	};
    
    	bench("t3 result = %f %X\n", [] { return test3(); });
    	bench("t2 result = %f %X\n", [] { return test2(); });
    	bench("t1 result = %f %X\n", [] { return test1(); });
    	bench("tA result = %f %X\n", [] { return testA(); });
    	bench("tB result = %f %X\n", [] { return testB(); });
    
    	delay(10000);
    }

  2. #27
    Junior Member
    Join Date
    Jan 2020
    Posts
    9
    Quote Originally Posted by jonr View Post
    As written (for all 16 bits from GPIO6), I measure < 25 ns with a for loop around it, at 600 MHz.
    Your code seems to read only 4 pins, and some of them are invalid. Also, GPIO6 only gives us 10 of the pins we need.

    I have been modeling a solution with the following pins.
    Code:
    [   GPIO7   |  GPIO9  |     GPIO6      |         GPIO6         ]
    [10, 12, 11 | 2, 3, 4 | 19, 18, 14, 15 | 17, 16, 22, 23, 20, 21]
    Here is your code modified to read all 16 of those pins. This uses the SHIFT,BFI method. (avg 61ns)
    Code:
    // SHIFT,BFI reader for the 16-pin layout: GPIO7 supplies result bits 0-2,
    // GPIO9 bits 3-5, and a single GPIO6 read supplies bits 6-15.
    __attribute__((always_inline)) static inline uint32_t test3() {
      // GPIO7 bits 0-2 -> result bits 0-2
      uint32_t packed = GPIO7_DR & 0x7;
    
      // GPIO9 bits 4-6 -> result bits 3-5
      const uint32_t gpio9Field = GPIO9_DR >> 4;
      asm volatile("bfi %[dst], %[src], 3, 3" : [dst] "+r"(packed) : [src] "r"(gpio9Field));
    
      // One GPIO6 read feeds both remaining fields.
      const uint32_t gpio6 = GPIO6_DR;
      // GPIO6 bits 16-19 -> result bits 6-9
      asm volatile("bfi %[dst], %[src], 6, 4" : [dst] "+r"(packed) : [src] "r"(gpio6 >> 16));
      // GPIO6 bits 22-27 -> result bits 10-15
      asm volatile("bfi %[dst], %[src], 10, 6" : [dst] "+r"(packed) : [src] "r"(gpio6 >> 22));
    
      return packed;
    }
    Here is my new MASK,SHIFT,OR method code using a shared GPIO6 register. (avg 49ns)
    Code:
    // MASK,SHIFT,OR reader for the 16-pin layout. GPIO6 is read once and both of
    // its fields are taken from that single value.
    __attribute__((always_inline)) static inline uint16_t test5() {
      // GPIO7 bits 0-2 -> result bits 0-2
      const uint32_t blockA = GPIO7_DR & 0x00000007u;
      // GPIO9 bits 4-6 -> result bits 3-5
      const uint32_t blockB = (GPIO9_DR & 0x00000070u) >> 1;
    
      const uint32_t gpio6 = GPIO6_DR;
      // GPIO6 bits 16-19 -> result bits 6-9
      const uint32_t blockC = (gpio6 & 0x000F0000u) >> 10;
      // GPIO6 bits 22-27 -> result bits 10-15
      const uint32_t blockD = (gpio6 & 0x0FC00000u) >> 12;
    
      return static_cast<uint16_t>(blockA | blockB | blockC | blockD);
    }
    (UPDATE made change from #28)

    I don't exactly follow what you're doing. Could you clear up what your code is doing in #21 and how it works?
    Last edited by trevor403; 01-04-2020 at 07:00 AM.

  3. #28
    Junior Member
    Join Date
    Jan 2020
    Posts
    9
    I think I understand your solution now...

    You are using only pins on GPIO6 to get all 16 bits.

    Code:
    // Bit index of each listed pin within its GPIO port register.
    // The listed bits form three runs (12-13, 16-19, 22-27) with gaps at
    // bits 14-15 and 20-21.
    #define CORE_PIN24_BIT		12
    #define CORE_PIN25_BIT		13
    
    #define CORE_PIN19_BIT		16
    #define CORE_PIN18_BIT		17
    #define CORE_PIN14_BIT		18
    #define CORE_PIN15_BIT		19
    
    #define CORE_PIN17_BIT		22
    #define CORE_PIN16_BIT		23
    #define CORE_PIN22_BIT		24
    #define CORE_PIN23_BIT		25
    #define CORE_PIN20_BIT		26
    #define CORE_PIN21_BIT		27
    Very clever!

  4. #29
    Senior Member+ defragster's Avatar
    Join Date
    Feb 2015
    Posts
    10,950
    @trevor403 - test5() should have this as first assignment { not |= }: data = GPIO7_DR & 0b00000000000000000000000000000111;

  5. #30
    Senior Member
    Join Date
    May 2015
    Location
    USA
    Posts
    330
    My code is based on response #3 and the following (all from GPIO6):

    // Bit index of each of the 16 pins within the 32-bit port word.
    // The bits fall into five runs: 2-3, 12-13, 16-19, 22-27 and 30-31.
    #define CORE_PIN1_BIT 2
    #define CORE_PIN0_BIT 3

    #define CORE_PIN24_BIT 12
    #define CORE_PIN25_BIT 13

    #define CORE_PIN19_BIT 16
    #define CORE_PIN18_BIT 17
    #define CORE_PIN14_BIT 18
    #define CORE_PIN15_BIT 19

    #define CORE_PIN17_BIT 22
    #define CORE_PIN16_BIT 23
    #define CORE_PIN22_BIT 24
    #define CORE_PIN23_BIT 25
    #define CORE_PIN20_BIT 26
    #define CORE_PIN21_BIT 27

    #define CORE_PIN26_BIT 30
    #define CORE_PIN27_BIT 31

    One would need to be careful in wiring it up to get the right order.

  6. #31
    Senior Member
    Join Date
    May 2015
    Location
    USA
    Posts
    330
    As a good reminder that it seldom makes sense to try to outsmart the compiler, here is the fastest version yet (15 ns). Also note how clear and simple it is.
    Code:
    // Plain C++ reader: loads GPIO6 once and closes the gaps between its five
    // bit runs with shifts and masks, giving 16 contiguous result bits.
    inline uint32_t test3()
    {
      const uint32_t port = IMXRT_GPIO6_DIRECT;
    
      return ((port >>  2) & 0x0003)   // port bits  2-3  -> result bits  0-1
           | ((port >> 10) & 0x000C)   // port bits 12-13 -> result bits  2-3
           | ((port >> 12) & 0x00F0)   // port bits 16-19 -> result bits  4-7
           | ((port >> 14) & 0x3F00)   // port bits 22-27 -> result bits  8-13
           | ((port >> 16) & 0xC000);  // port bits 30-31 -> result bits 14-15
    }

  7. #32
    Senior Member+ defragster's Avatar
    Join Date
    Feb 2015
    Posts
    10,950
    I can confirm the speed as tested with "t3" below - but not the data integrity as I have floating pins for GPIO_6 { #define IMXRT_GPIO6_DIRECT (*(volatile uint32_t *)0x42000000) }

    Indeed the compiler is good with "C" and also maybe knows something about doing it to feed the twin CPU pipes.

    And the "C" code is easier to verify and edit than whatever that 'asm' stuff was doing

    Code:
    t3 result = 15001.528333 138
    t2 result = 66674.351667 4C0
    t1 result = 55006.456667 4C0
    t4 result = 23336.545000 432
    t5 result = 41671.716667 1000
    from:
    Code:
    	long unsigned stime;
    
    	stime = ARM_DWT_CYCCNT;
    	for (register int i = 0; i < 1000000; ++i ) {
    		test3();
    	}
    	Serial.printf("t3 result = %f %X\n", 1000000.0 * (ARM_DWT_CYCCNT - stime) / F_CPU_ACTUAL, test3());

  8. #33
    Senior Member
    Join Date
    May 2015
    Location
    USA
    Posts
    330
    defragster: what do you get if you make some use of the return values (as I did earlier with total +=)? I'm suspicious that the compiler is optimizing away required code and that 15ns isn't real.

  9. #34
    Senior Member+ defragster's Avatar
    Join Date
    Feb 2015
    Posts
    10,950
    Code:
    t3 result = 30003.013333 8448BE00
    with:
    Code:
    	long unsigned stime;
    	register uint32_t summ=0;
    
    	stime = ARM_DWT_CYCCNT;
    	for (register int i = 0; i < 1000000; ++i ) {
    		summ+=test3();
    	}
    	Serial.printf("t3 result = %f %X\n", 1000000.0 * (ARM_DWT_CYCCNT - stime) / F_CPU_ACTUAL, summ);

  10. #35
    Junior Member
    Join Date
    Jan 2020
    Posts
    9
    Sometimes you've got to fight the compiler, and sometimes it works with you

  11. #36
    Senior Member+ defragster's Avatar
    Join Date
    Feb 2015
    Posts
    10,950
    Re post #34: test3() was not optimized out, AFAIK. The 'summ' addition was optimized out until I printed it. One earlier variation (test4()? now showing 23K?) went from 20000 to 25000 when the print was added, so the doubling from 15K to 30K above is an odd result by comparison. It must break the pipeline doubling or something extreme.

    Here it is for those two with and without summ - test4() now hits 30K ( 30 ns ):
    Code:
    t3 result = 15001.528333 C138
    summ>t3 result = 31544.241667 8448BE00
    t2 result = 66674.358333 4C0
    t1 result = 55006.453333 4C0
    t4 result = 23336.525000 732
    summ>t4 result = 30003.953333 F2136E80
    t5 result = 41671.765000 1000
    where test4() is:
    Code:
    // Single-read reader: loads GPIO6 once (pre-shifted right by 2) and uses two
    // BFI (bit-field insert) instructions to fill the two 2-bit gaps, so that 16
    // input bits become contiguous. Returns them in the low 16 bits.
    inline uint32_t test4() {
    	// After the shift, port bit N sits at data bit N-2.
    	register uint32_t data  = IMXRT_GPIO6_DIRECT >> 2;
    	// Copy data[1:0] (port bits 2-3) into data[13:12].
    	asm volatile("bfi %0, %1, 12, 2" : "+r"(data) : "r"(data));
    	// Copy the low two bits of data >> 28 (port bits 30-31) into data[19:18].
    	asm volatile("bfi %0, %1, 18, 2" : "+r"(data) : "r"(data >> 28));
    	// data[25:10] now holds the 16 gathered bits.
    	return (data >> 10) & 0xffff;
    }
    Quote Originally Posted by trevor403 View Post
    Sometimes you've got to fight the compiler, and sometimes it works with you
    Indeed I did some simple math for the micros() calc from CYCCNT on T_4 and Paul swapped in some asm for the same calc and more code with a conditional limit and it runs 2 cycles faster.

  12. #37
    Senior Member
    Join Date
    May 2015
    Location
    USA
    Posts
    330
    For the record, here is what tests out fastest for me when I make use of the return value.
    Code:
    #define IMXRT_GPIO6_DIRECT  (*(volatile uint32_t *)0x42000000)
    
    // Gather 16 GPIO6 pin inputs into 16 consecutive bits.
    // The upper half-word already holds twelve of them. The two 2-bit fields in
    // the lower half-word are inserted into its gaps at bits 20-21 and 28-29.
    
    inline uint16_t test5()
    {
      const uint32_t port = IMXRT_GPIO6_DIRECT;
      uint32_t packed = port;
      // port bits 2-3 -> packed bits 20-21
      asm volatile("bfi %[dst], %[src], 20, 2" : [dst] "+r"(packed) : [src] "r"(port >> 2));
      // port bits 12-13 -> packed bits 28-29
      asm volatile("bfi %[dst], %[src], 28, 2" : [dst] "+r"(packed) : [src] "r"(port >> 12));
      return static_cast<uint16_t>(packed >> 16);
    }

  13. #38
    Senior Member+ defragster's Avatar
    Join Date
    Feb 2015
    Posts
    10,950
    Running p#37's test5() as test6() in the skeleton above, I see the following. With the result used, it is the best number so far. test3() here is the version from p#31:
    Code:
    t6 result = 20002.025000 E043
    summ>t6 result = 23336.526667 5DF656C0
    t3 result = 15002.495000 C138
    summ>t3 result = 30003.955000 E23F14C0

Tags for this Thread

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •