Code:
#include <arm_math.h>
#include <arm_const_structs.h>
// instruction times for cortex M4
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0439b/CHDDIGAC.html
// Number of iterations executed by each timed loop in setup().
#define nn 10000
// x, y : running operands, incremented on every loop iteration.
// sum  : accumulator for the products x*y.
// T    : elapsed time of one timed loop (computed as (stop-start)*1e-6).
float x,y,sum,T ;
// Values of micros() sampled immediately before and after each timed loop.
int32_t start,stop ;
// Micro-benchmark: times two textually identical multiply-accumulate loops
// and prints, for each, the accumulated sum and an estimated cycle count
// per iteration. The loop is duplicated on purpose so the two timings can
// be compared; keep both copies identical.
void setup() {
Serial.begin(115200);
// Fixed 2 s pause before the first output.
// NOTE(review): presumably gives the host time to open the serial port - confirm.
delay(2000) ;
Serial.print("Hello World id=1234\n");
// ---- first timed loop ----
sum=0 ; x=0 ; y=0 ;
start=micros() ;
for(int k=0 ; k<nn ; k++){
// NOTE(review): 1.2 and 1.3 are double literals in standard C++; the
// disassembly at the end of this file shows vadd.f32, so the build
// presumably treats them as single precision - confirm compiler flags.
x += 1.2 ;
y += 1.3 ;
// Multiply-accumulate; the operation whose cost is being measured.
sum += x * y ;
}
stop=micros() ;
// Elapsed microseconds scaled by 1e-6, i.e. T is in seconds.
T=(stop-start)*1e-6 ;
// T/nn is the time per iteration; multiplying by 180e6 converts it to
// cycles, assuming a 180 MHz core clock - TODO confirm against F_CPU.
Serial.printf("sum= %10.2f cycles:%10.5f\n",sum,T/nn*180e6) ;
// ---- second timed loop: same statements as the first, for comparison ----
sum=0 ; x=0 ; y=0 ;
start=micros() ;
for(int k=0 ; k<nn ; k++){
x += 1.2 ;
y += 1.3 ;
sum += x * y ;
}
stop=micros() ;
T=(stop-start)*1e-6 ;
// Same cycles-per-iteration estimate as above (same 180 MHz assumption).
Serial.printf("sum= %10.2f cycles:%10.5f\n",sum,T/nn*180e6) ;
}
// Intentionally empty: every measurement runs once inside setup().
void loop() {}
// 4b4: ee37 7a25 vadd.f32 s14, s14, s11 // x+=1.2
// 4b8: ee77 7a86 vadd.f32 s15, s15, s12 // y+=1.3
// 4bc: 3b01 subs r3, #1 // k count
// 4be: eee7 6a27 vfma.f32 s13, s14, s15 // sum += x*y
// 4c2: d1f7 bne.n 4b4 // conditional branch
I am trying to understand the instruction timings on the Teensy 3.6. To measure the timing of a MAC (multiply-accumulate) I use the program shown above. It increments x and y and then accumulates x*y in sum. The corresponding ARM assembly is given below the source code. Now I have some questions. First: the first loop gives a timing of 6 cycles per iteration, while the second loop gives 7 cycles. Is this an alignment problem? If so, which instructions depend on alignment? Second: taking the first case, with 6 cycles per iteration, I would say that the two vadd instructions take 1 cycle each, the subs also needs 1 cycle, and the conditional branch needs at least 1 cycle (more likely 2). That leaves 6-1-1-1-1=2 cycles for the vfma instruction, while the manual states that a vfma needs 3 cycles. Are floating-point instructions executed in parallel with other instructions?