PaulStoffregen
Well-known member
I don't recall the timings of the Cortex-M0 that the Teensy-LC uses, but on some M0s a multiply takes 32 cycles, while on others it takes 1 cycle.
The Cortex-M0+ on Teensy-LC has the single-cycle multiply hardware.
I don't recall the timings of the Cortex-M0 that the Teensy-LC uses, but on some M0s a multiply takes 32 cycles, while on others it takes 1 cycle.
Hello World! ... LC w/F_CPU == 48000000
inWhileMax==266 outWhile Count==33182 Total Time==9999427
100
4294967295
12345
-97847383
_print_ microseconds = 69
{ BEFORE WAS :: _print_ microseconds = 233 }
Hello World! ... T_3.1 w/F_CPU == 96000000
inWhileMax==822 outWhile Count==33331 Total Time==9999856
100
4294967295
12345
-97847383
_print_ microseconds = 30
I wonder how many have been improved on LC with code and toolchain change? I don't see the testing source posted there? I don't see you tested Read/Write FAST?
delay()-type stuff will probably be just as slow, but may be more accurate, as micros() won't be as sluggish.
#include <EEPROM.h>
// One-time start-up: spin until `Serial` evaluates true, pause for one
// second, then print the banner that identifies this verification sketch.
void setup() {
  while (!Serial) {
    // busy-wait; nothing to do until the serial port reports ready
  }
  delay(1000);
  Serial.println("Begin divmod10_asm optimization verify");
}
/*
 * divmod10_asm(div, mod, tmp1, tmp2, const3333)
 *
 * Inline-assembly divide-by-ten. The caller in loop() loads `div` with the
 * dividend, invokes the macro, and then compares `div` against value / 10 and
 * `mod` against value % 10, so those are the intended results.
 *
 * Operand mapping (taken from the constraint lists below):
 *   %0 = div        read/write ("+l"): input value on entry, overwritten
 *   %1 = mod        early-clobber output ("=&l")
 *   %2 = tmp1       early-clobber output, used as scratch
 *   %3 = tmp2       early-clobber output, used as scratch
 *   %4 = const3333  input; the caller in this file passes 0x3333
 * All arguments must be lvalues/values of register-sized integer type; the
 * "l" constraint presumably restricts them to the Thumb low registers
 * (r0-r7) -- confirm against the GCC ARM constraint documentation.
 *
 * Outline of the instruction sequence:
 *   - the upper (lsr #16) and lower (uxth) 16-bit halves of the input are each
 *     multiplied by %4, and 1 is added to the low product;
 *   - the two products are recombined with shifts and add/adc carry
 *     propagation, leaving the quotient candidate in %0;
 *   - the last three instructions form %1 from (%1 >> 4) * 10 >> 28.
 *
 * NOTE(review): the clobber list is empty although the arithmetic
 * instructions presumably update the condition flags on the Thumb target --
 * confirm whether a "cc" clobber should be declared.
 * NOTE(review): the error dump that follows this sketch in the thread
 * (in = FFF90844) appears to come from this routine -- verify exhaustively
 * before relying on it.
 */
#define divmod10_asm(div, mod, tmp1, tmp2, const3333) \
asm ( \
" lsr %2, %0, #16" "\n\t" \
" mul %2, %4" "\n\t" \
" uxth %1, %0" "\n\t" \
" mul %1, %4" "\n\t" \
" add %1, #1" "\n\t" \
" lsr %0, %2, #16" "\n\t" \
" lsl %2, %2, #16" "\n\t" \
" add %1, %2" "\n\t" \
" mov %3, #0" "\n\t" \
" adc %0, %3" "\n\t" \
" lsl %0, %0, #15" "\n\t" \
" lsr %2, %1, #17" "\n\t" \
" orr %0, %2" "\n\t" \
" lsl %1, %1, #15" "\n\t" \
" lsr %2, %1, #16" "\n\t" \
" lsl %3, %0, #16" "\n\t" \
" orr %2, %3" "\n\t" \
" lsr %3, %0, #16" "\n\t" \
" add %1, %0" "\n\t" \
" adc %0, %1" "\n\t" \
" sub %0, %1" "\n\t" \
" add %1, %2" "\n\t" \
" adc %0, %3" "\n\t" \
" lsr %1, %1, #4" "\n\t" \
" mov %3, #10" "\n\t" \
" mul %1, %3" "\n\t" \
" lsr %1, %1, #28" "\n\t" \
: "+l" (div), \
"=&l" (mod), \
"=&l" (tmp1), \
"=&l" (tmp2) \
: "l" (const3333) \
: \
)
/*
 * Countdown verification of divmod10_asm against a software reference.
 *
 * Starting from (EEPROM byte 3 << 24) | 0x00FFFFFF, every value down to and
 * including 0 is fed through divmod10_asm; any quotient/remainder that
 * differs from the reference is reported on Serial.
 *
 * The reference pair is computed with a real divide only once and is then
 * stepped down by one per iteration, which keeps the per-value cost low.
 *
 * Progress output: one '.' per 0x10000 values, a new numbered line once the
 * dot counter exceeds 65, and every 0x1000000 values the top byte of the
 * counter is stored in EEPROM address 3 (the restart point read above) and
 * printed in hex. When the run completes, EEPROM address 3 is reset to 0xFF
 * and the function spins forever.
 */
void loop() {
  uint32_t d, m, t1, t2;
  const uint32_t c = 0x3333;  // multiplier constant handed to divmod10_asm
  uint32_t dotcount = 100000, linecount = 0;  // large dotcount forces a line header first

  // Widen the stored byte before shifting so a value >= 0x80 is never
  // shifted into the sign bit of a promoted int.
  uint32_t i = (static_cast<uint32_t>(EEPROM.read(3)) << 24) | 0x00FFFFFF;
  Serial.print("begin at: ");
  Serial.println(i, HEX);

  // Reference results for the starting value; updated incrementally below.
  uint32_t correct_d = i / 10;
  uint32_t correct_m = i % 10;

  while (1) {
    d = i;
    divmod10_asm(d, m, t1, t2, c);
    if (d != correct_d || m != correct_m) {
      Serial.println();
      Serial.println("Error:");
      Serial.print("in = ");
      Serial.println(i, HEX);
      Serial.print("in = ");
      Serial.println(i);
      Serial.print("div = ");
      Serial.print(d);
      Serial.print(", correct = ");
      Serial.println(correct_d);
      Serial.print("mod = ");
      Serial.print(m);
      Serial.print(", correct = ");
      Serial.println(correct_m);
    }

    // Leave only after the value 0 itself has been checked; previously the
    // loop exited as soon as the counter reached 0, so 0 was never tested.
    if (i == 0) break;

    // Step the reference down by one without dividing.
    if (correct_m > 0) {
      correct_m = correct_m - 1;
    } else {
      correct_m = 9;
      correct_d = correct_d - 1;
    }
    i--;

    if ((i & 0xFFFF) == 0) {
      if ((i & 0xFFFFFF) == 0) {
        // Record the restart point and show it in the progress stream.
        EEPROM.write(3, i >> 24);
        Serial.print(i >> 24, HEX);
      }
      if (++dotcount > 65) {
        dotcount = 0;
        linecount++;
        Serial.println();
        Serial.print(linecount);
        Serial.print(": ");
      }
      Serial.print(".");
    }
  }

  Serial.println();
  Serial.println("Done");
  EEPROM.write(3, 0xFF); // reset to FF, to run again
  while (1) ; // end
}
Error:
in = FFF90844
in = 4294510660
div = 429483836, correct = 429451066
mod = 5, correct = 0
/*
 * Divide an unsigned 32-bit value by ten without a divide instruction.
 *
 * n    value to divide
 * div  receives the quotient (written first)
 * mod  receives n - 10 * quotient
 *
 * The input is split into 16-bit halves and each half is scaled by
 * 13107 (0x3333; 5 * 13107 == 65535). `hi` and `lo` then act as the integer
 * and 16-bit fractional parts of one wide value. Folding each part's upper
 * 16 bits back in extends the 0x3333 multiplier towards 0x33333333 / 2^32,
 * i.e. roughly one fifth; the final halving turns that into a tenth.
 * The +1 on the low half biases the estimate upward so truncation lands on
 * the exact quotient -- the verification runs quoted in this thread report
 * zero errors for it.
 */
inline void divmod10_v2(uint32_t n, uint32_t *div, uint32_t *mod) {
  const uint32_t kScale = 13107;

  // Scale both halves; the low half carries the +1 bias.
  uint32_t lo = ((n & 0xFFFF) + 1) * kScale;
  uint32_t hi = (n >> 16) * kScale;

  // Fold the upper bits down: lo first (using the unfolded hi), then hi
  // picks up the carry out of the freshly folded lo.
  lo += (lo >> 16) + (hi & 0xFFFF);
  hi += (hi >> 16) + (lo >> 16);

  // hi now approximates n / 5; halve it for n / 10.
  const uint32_t quotient = hi >> 1;

  *div = quotient;
  *mod = n - 10 * quotient;
}
Begin divmod10_v2 optimization verify
begin at: FFFFFFFF
begin micros: 1733982
1: ..................................................................
2: ..................................................................
3: ..................................................................
4: .........................................................FF.........
5: ..................................................................
6: ..................................................................
7: ..................................................................
8: .................................................FE.................
989: ..................................................................
990: .....1.............................................................
991: ..................................................................
992: ..................................................................
993: ...............................................................0.
Done
END micros: 1176985217
would signed variables make a speed difference?
I assumed you put the ASM in to emulate what the LC does but would run on either MCU?
Begin divmod10_v2 optimization verify
begin at: FFFFFFFF
begin micros: 1252063
1: ..................................................................
2: ..................................................................
3: ..................................................................
4: .........................................................FF.........
{ ... }
990: .....1.............................................................
991: ..................................................................
992: ..................................................................
993: ...............................................................0.
Done
END micros: 107977615
Begin divmod10_v2 optimization verify
Hello World! ... T_3.1 w/F_CPU == 120000000
begin at: FFFFFFFF
begin millis: 1862
1: ..................................................................
{ ... }
993: ...............................................................0.
Done
ERRORS detected: 0
END millis diff: 940313
END mins: 15
Begin divmod10_v2 optimization verify
Hello World! ... LC w/F_CPU == 48000000
{ ... }
ERRORS detected: 0
END millis diff: 3970690
END mins: 66
ERRORS detected: 0
END millis diff: 4599301
END mins: 76
The following code beats both Paul's asm code and my own asm version, not by a huge amount, but by around 10%.
Signed values would produce the wrong results. If you shift a negative signed value right, the result is implementation-defined. On just about every computer you will run into today (except for some ancient mainframes), the implementation replicates the sign bit. Unsigned values shifted right always fill the top bits with 0s.
So, wondering if an unsigned-only speed test would be fudging,
And corner cases (over/underflow).
LC w/F_CPU == 48000000
_print_ sprintf microseconds = 795
100,4294967295,12345, -97847383
100
4294967295
12345
-97847383
_print_ microseconds = 64
Is this optimized software divide only used by print()?