Here is what the delayMicroseconds disassembly looks like with -O3 and LTO enabled on a T3:

Code:

000018b8 <L_783_delayMicroseconds>:
18b8: 3b01 subs r3, #1
18ba: d1fd bne.n 18b8 <L_783_delayMicroseconds>
18bc: f892 3200 ldrb.w r3, [r2, #512] ; 0x200
18c0: f892 1280 ldrb.w r1, [r2, #640] ; 0x280
18c4: b2db uxtb r3, r3
18c6: 2900 cmp r1, #0
18c8: d1f2 bne.n 18b0 <main+0x28>
18ca: b13b cbz r3, 18dc <L_783_delayMicroseconds+0x24>
18cc: 6803 ldr r3, [r0, #0]
18ce: f023 0302 bic.w r3, r3, #2
18d2: 6003 str r3, [r0, #0]
18d4: e7ef b.n 18b6 <main+0x2e>
18d6: f882 5100 strb.w r5, [r2, #256] ; 0x100
18da: e7ec b.n 18b6 <main+0x2e>
18dc: 6803 ldr r3, [r0, #0]
18de: f043 0303 orr.w r3, r3, #3
18e2: 6003 str r3, [r0, #0]
18e4: e7e7 b.n 18b6 <main+0x2e>
18e6: f8df 8078 ldr.w r8, [pc, #120] ; 1960 <L_869_delayMicroseconds+0x58>
18ea: f8df c078 ldr.w ip, [pc, #120] ; 1964 <L_869_delayMicroseconds+0x5c>
18ee: f8df e078 ldr.w lr, [pc, #120] ; 1968 <L_869_delayMicroseconds+0x60>
18f2: 4f18 ldr r7, [pc, #96] ; (1954 <L_869_delayMicroseconds+0x4c>)
18f4: 4e18 ldr r6, [pc, #96] ; (1958 <L_869_delayMicroseconds+0x50>)
18f6: 4d19 ldr r5, [pc, #100] ; (195c <L_869_delayMicroseconds+0x54>)
18f8: 4c15 ldr r4, [pc, #84] ; (1950 <L_869_delayMicroseconds+0x48>)
18fa: e010 b.n 191e <L_869_delayMicroseconds+0x16>
18fc: b1eb cbz r3, 193a <L_869_delayMicroseconds+0x32>
18fe: 6803 ldr r3, [r0, #0]
1900: f023 0302 bic.w r3, r3, #2
1904: 6003 str r3, [r0, #0]
1906: 4623 mov r3, r4
00001908 <L_869_delayMicroseconds>:
1908: 3b01 subs r3, #1
190a: d1fd bne.n 1908 <L_869_delayMicroseconds>
190c: f898 3000 ldrb.w r3, [r8]
1910: f89c 3000 ldrb.w r3, [ip]
1914: f89e 3000 ldrb.w r3, [lr]
1918: 783b ldrb r3, [r7, #0]
191a: 7833 ldrb r3, [r6, #0]
191c: 782b ldrb r3, [r5, #0]
191e: f892 3200 ldrb.w r3, [r2, #512] ; 0x200
1922: f892 1280 ldrb.w r1, [r2, #640] ; 0x280
1926: b2db uxtb r3, r3
1928: 2900 cmp r1, #0
192a: d0e7 beq.n 18fc <L_783_delayMicroseconds+0x44>
192c: b113 cbz r3, 1934 <L_869_delayMicroseconds+0x2c>
192e: f882 9100 strb.w r9, [r2, #256] ; 0x100
1932: e7e8 b.n 1906 <L_783_delayMicroseconds+0x4e>
1934: f882 9080 strb.w r9, [r2, #128] ; 0x80
1938: e7e5 b.n 1906 <L_783_delayMicroseconds+0x4e>
193a: 6803 ldr r3, [r0, #0]
193c: f043 0303 orr.w r3, r3, #3
1940: 6003 str r3, [r0, #0]
1942: e7e0 b.n 1906 <L_783_delayMicroseconds+0x4e>
1944: 4004b014 andmi fp, r4, r4, lsl r0
1948: 43fe1014 mvnsmi r1, #20
194c: 1fff8e08 svcne 0x00ff8e08
1950: 00f42400 rscseq r2, r4, r0, lsl #8
1954: 1fff8e0c svcne 0x00ff8e0c
1958: 1fff8e00 svcne 0x00ff8e00
195c: 1fff8dff svcne 0x00ff8dff
1960: 1fff8e09 svcne 0x00ff8e09
1964: 1fff8e0a svcne 0x00ff8e0a
1968: 1fff8e0b svcne 0x00ff8e0b

I found this out with my Zilch library, which heavily uses inline assembly.