I am currently playing around with the Teensy 4.0 and want to understand its architectural behaviour. To inspect "dual issue" I created a program (complete listing below) that contains the following loop:
In assembly code this is:
The cycle measurement now gives the following result: With dual issue enabled (standard setting) one loop iteration needs 5 cycles.
With dual issue disabled
the loop also needs only 6 cycles. The question is: how can the Teensy execute 9 instructions in 6 cycles without dual issuing?
To measure cycles I use the internal cycle counter and the signal on the LED pin.
The complete program is:
Code:
// Loop under measurement: nn iterations, each doing two pin-13 port register
// writes followed by three integer updates.
for( k=0 ; k<nn ; k++){
CORE_PIN13_PORTSET = CORE_PIN13_BITMASK; // led-1: write the pin-13 bit mask to its PORTSET register
CORE_PIN13_PORTCLEAR = CORE_PIN13_BITMASK; // led-2: write the same mask to its PORTCLEAR register
xx *= 105529 ; // running product
vv += m & 0x1234 ; // accumulate m masked with 0x1234 (uses m before it is advanced)
m +=17 ; // advance m by 17 per iteration
}
In assembly code this is:
Code:
// r9=105529 ; r3=m ; r5=vv ; r6=xx
// 1d8: f241 2234 movw r2, #4660 ; 0x1234 ; r2=0x1234
// 1dc: f8c8 7084 str.w r7, [r8, #132] ; 0x84 CORE_PIN13_PORTSET = CORE_PIN13_BITMASK; // led-1
// 1e0: fb09 f606 mul.w r6, r9, r6 ; xx *= 105529 ;
// 1e4: 401a ands r2, r3 ; r2= m & 0x1234 ; r2=1234h r3=m
// 1e6: 3311 adds r3, #17 ; m +=17 ;
// 1e8: f8c8 7088 str.w r7, [r8, #136] ; 0x88 CORE_PIN13_PORTCLEAR = CORE_PIN13_BITMASK; // led-2
// for( k=0 ; k<256 ; k++){
// 1ec: 4299 cmp r1, r3 ; loop end test r1 <> m
// 1ee: 4415 add r5, r2 ; r5 += m & 0x1234 ; r5=vv
// 1f0: d1f2 bne.n 1d8 <loop+0x1c> ; loop go on
The cycle measurement now gives the following result: With dual issue enabled (standard setting) one loop iteration needs 5 cycles.
With dual issue disabled
Code:
// Sets the ACTLR fields labelled DISDI and DISISSCH1 below (all five bits of
// each) and prints the register value before and after the change.
void disableDualIssue(){
  // Go through a volatile, unsigned view of the register so that every read
  // and write below is really performed on the hardware, in program order.
  volatile uint32_t *const actlr = reinterpret_cast<volatile uint32_t *>(ACTLR) ;

  Serial.println("disableDualIssue()") ;
  Serial.printf("before: ACTLR=%08XH\n",static_cast<unsigned>(*actlr)) ;
  *actlr = *actlr | 0x0001F0000u ; // DISDI
  *actlr = *actlr | 0x003E00000u ; // DISISSCH1
  // Make sure the control-register write has completed and the pipeline is
  // refilled before any following instruction relies on the new setting.
  __asm__ volatile("dsb" ::: "memory") ;
  __asm__ volatile("isb" ::: "memory") ;
  Serial.printf("after: ACTLR=%08XH\n",static_cast<unsigned>(*actlr)) ;
}
To measure cycles I use the internal cycle counter and the signal on the LED pin.
The complete program is:
Code:
// Pin number of the LED; configured as an output in setup().
int led = 13;
// Unit factors for printing: divide a frequency in Hz by MHz, or a time in
// seconds by ns, to get the value in that unit.
#define MHz 1e6
#define ns 1e-9
// Prototype with C linkage; setClock100MHz() calls it with the requested
// core frequency in Hz.
extern "C" uint32_t set_arm_clock(uint32_t frequency);
// Prints the current core clock, requests 100 MHz through set_arm_clock(),
// then prints the resulting clock, a quarter of it, and the cycle time in ns.
void setClock100MHz(){
  const int requestedHz = 100000000 ;

  // State before the change, and the frequency about to be requested.
  const double beforeMHz = F_CPU_ACTUAL/MHz ;
  Serial.printf("F_CPU_ACTUAL =%8.4f MHz\n", beforeMHz) ;
  const double requestedMHz = requestedHz/MHz ;
  Serial.printf("set_arm_clock (%8.4f MHz)\n", requestedMHz) ;

  set_arm_clock(requestedHz) ;

  // State after the change; F_CPU_ACTUAL is re-read for every line printed.
  const double afterMHz = F_CPU_ACTUAL/MHz ;
  Serial.printf("F_CPU_ACTUAL =%8.4f MHz\n", afterMHz) ;
  const double quarterMHz = F_CPU_ACTUAL/4/MHz ;
  Serial.printf("F_CPU_ACTUAL/4=%8.4f MHz\n", quarterMHz) ;
  const double periodNs = 1.0/F_CPU_ACTUAL/ns ;
  Serial.printf("1/F_CPU_ACTUAL=%8.4f ns\n", periodNs) ;
}
// Pointer to the ACTLR register at 0xE000E008. Declared volatile and unsigned
// so that every access is performed on the hardware and bit operations are
// done on an unsigned 32-bit value.
#define ACTLR (reinterpret_cast<volatile uint32_t*>(0xE000E008))

// Sets the ACTLR fields labelled DISDI and DISISSCH1 below (all five bits of
// each) and prints the register value before and after the change.
void disableDualIssue(){
  Serial.println("disableDualIssue()") ;
  Serial.printf("before: ACTLR=%08XH\n",static_cast<unsigned>(*ACTLR)) ;
  // Explicit read-modify-write: one volatile read and one volatile write each.
  *ACTLR = *ACTLR | 0x0001F0000u ; // DISDI
  *ACTLR = *ACTLR | 0x003E00000u ; // DISISSCH1
  // Make sure the control-register write has completed and the pipeline is
  // refilled before any following instruction relies on the new setting.
  __asm__ volatile("dsb" ::: "memory") ;
  __asm__ volatile("isb" ::: "memory") ;
  Serial.printf("after: ACTLR=%08XH\n",static_cast<unsigned>(*ACTLR)) ;
}
// One-time initialisation: LED pin as output, serial port up, cycle counter
// enabled, dual issue disabled, then a start banner after a short pause.
void setup() {
  pinMode(led, OUTPUT);

  Serial.begin(115200);
  while (!Serial) {
    // block until the serial port reports ready
  }

  // Enable CPU Cycle Count
  ARM_DEMCR = ARM_DEMCR | ARM_DEMCR_TRCENA;
  ARM_DWT_CTRL = ARM_DWT_CTRL | ARM_DWT_CTRL_CYCCNTENA;

  // setClock100MHz() ;
  disableDualIssue();

  delay(2000);
  Serial.println("teensy40startSequenceOfLoop1...");
  delay(200);
}
// Cycle-counter snapshots taken right before and right after the timed loop,
// and the difference between them.
int cyclesStart ;
int cyclesStop ;
int cycles ;

// Repeats the measurement forever: times nn iterations of the loop under
// test with the cycle counter, prints the working values and the cycle
// counts, then waits one second. Never returns, so xx, m and vv keep their
// values from one measurement to the next.
void loop(){
  // Unsigned on purpose: xx, m and vv are allowed to run past 32 bits, and
  // unsigned arithmetic wraps modulo 2^32 instead of being signed-overflow
  // undefined behaviour.
  uint32_t xx=42 ;
  uint32_t m=0 ;
  uint32_t vv=0 ;
  const int nn=256 ;
  int k ;
  while(1){ // post reports ~5 cycles per iteration with dual issue enabled
    cyclesStart = ARM_DWT_CYCCNT ;
    for( k=0 ; k<nn ; k++){
      CORE_PIN13_PORTSET = CORE_PIN13_BITMASK; // led-1
      CORE_PIN13_PORTCLEAR = CORE_PIN13_BITMASK; // led-2
      xx *= 105529 ;
      vv += m & 0x1234 ;
      m +=17 ;
    }
    cyclesStop = ARM_DWT_CYCCNT ;
    // Values are printed as signed int to keep the output format unchanged.
    Serial.printf("k=%8i xx=%8i m=%8i vv=%8i \n",k,static_cast<int>(xx),static_cast<int>(m),static_cast<int>(vv)) ;
    // Subtract as unsigned so a counter wrap between the two snapshots still
    // yields the correct elapsed count.
    cycles=static_cast<int>(static_cast<uint32_t>(cyclesStop)-static_cast<uint32_t>(cyclesStart)) ;
    // Note: cycles/nn is an integer division, so the per-loop figure is truncated.
    Serial.printf("cyclesNeeded=%d cycles per loop=%d \n",cycles,cycles/nn) ;
    delay(1000) ;
    //Serial.println("abort...") ;while(1){} ;
  }
}