Code:

#include <math.h>
#include <string.h>
//Compile with ""optimized" from the Arduino-Menu
FASTRUN
void float_MatMult(float* A, float* B, int m, int p, int n, float* C) {
// A = input matrix (m x p)
// B = input matrix (p x n)
// m = number of rows in A
// p = number of columns in A = number of rows in B
// n = number of columns in B
// C = output matrix = A*B (m x n)
int i, j, k;
for ( i = 0; i < m; i++ )
for ( j = 0; j < n; j++ ){
C[i*n+j] = 0;
for( k = 0; k < p; k++ )
C[i*n+j] += A[i*p+k]*B[k*n+j];
}
}
FASTRUN __attribute__((optimize("O2")))
void float_MatMultO2(float* A, float* B, int m, int p, int n, float* C) {
// A = input matrix (m x p)
// B = input matrix (p x n)
// m = number of rows in A
// p = number of columns in A = number of rows in B
// n = number of columns in B
// C = output matrix = A*B (m x n)
int i, j, k;
for ( i = 0; i < m; i++ )
for ( j = 0; j < n; j++ ){
C[i*n+j] = 0;
for( k = 0; k < p; k++ )
C[i*n+j] += A[i*p+k]*B[k*n+j];
}
}
FASTRUN __attribute__((optimize("O3")))
void float_MatMultO3(float* A, float* B, int m, int p, int n, float* C) {
// A = input matrix (m x p)
// B = input matrix (p x n)
// m = number of rows in A
// p = number of columns in A = number of rows in B
// n = number of columns in B
// C = output matrix = A*B (m x n)
int i, j, k;
for ( i = 0; i < m; i++ )
for ( j = 0; j < n; j++ ){
C[i*n+j] = 0;
for( k = 0; k < p; k++ )
C[i*n+j] += A[i*p+k]*B[k*n+j];
}
}
float multiply(float* A, float* B, int i, int p, int k, int n, int j) {
return A[i*p+k]*B[k*n+j];
}
void float_MatMult_noexperienceduser(float* A, float* B, int m, int p, int n, float* C) {
// A = input matrix (m x p)
// B = input matrix (p x n)
// m = number of rows in A
// p = number of columns in A = number of rows in B
// n = number of columns in B
// C = output matrix = A*B (m x n)
int i, j, k;
for ( i = 0; i < m; i++ )
for ( j = 0; j < n; j++ ){
C[i*n+j] = 0;
for( k = 0; k < p; k++ ) {
C[i*n+j] += multiply(A, B, i, p, k, n, j);
}
}
}
__attribute__((optimize("O2")))
float multiplyO2(float* A, float* B, int i, int p, int k, int n, int j) {
return A[i*p+k]*B[k*n+j];
}
__attribute__((optimize("O2")))
void float_MatMult_noexperienceduserO2(float* A, float* B, int m, int p, int n, float* C) {
// A = input matrix (m x p)
// B = input matrix (p x n)
// m = number of rows in A
// p = number of columns in A = number of rows in B
// n = number of columns in B
// C = output matrix = A*B (m x n)
int i, j, k;
for ( i = 0; i < m; i++ )
for ( j = 0; j < n; j++ ){
C[i*n+j] = 0;
for( k = 0; k < p; k++ ) {
C[i*n+j] += multiplyO2(A, B, i, p, k, n, j);
}
}
}
void setup() {
while (!Serial) ;
// variables for timing
int i=0;
int dt;
// variables for calculation
#define N 16
float A[N][N];
float B[N][N];
float C[N][N];
memset(A,3.1415,sizeof(A));
memset(B,8.1415,sizeof(B));
memset(C,0.0,sizeof(C));
int tbegin = micros();
for (i=1;;i++) {
// do calculation
float_MatMult((float*) A, (float*)B, N,N,N, (float*)C);
// check if t_delay has passed
dt = micros() - tbegin;
if (dt > 1000000) break;
}
Serial.println("Optimized");
Serial.printf("(%dx%d) matrices: ", N, N);
Serial.printf("%d matrices in %d usec: ", i, dt);
Serial.printf("%d matrices/second\n", (int)((float)i*1000000/dt));
Serial.printf("Float (%d bytes) ", sizeof(float));
float total = N*N*N*i*1e6 / (float)dt;
Serial.printf(" multiplications per second:\t(%u)\n", (unsigned int)total);
delay(500);
tbegin = micros();
for (i=1;;i++) {
// do calculation
float_MatMult((float*) A, (float*)B, N,N,N, (float*)C);
// check if t_delay has passed
dt = micros() - tbegin;
if (dt > 1000000) break;
}
Serial.println("Optimized O2");
Serial.printf("(%dx%d) matrices: ", N, N);
Serial.printf("%d matrices in %d usec: ", i, dt);
Serial.printf("%d matrices/second\n", (int)((float)i*1000000/dt));
Serial.printf("Float (%d bytes) ", sizeof(float));
total = N*N*N*i*1e6 / (float)dt;
Serial.printf(" multiplications per second:\t(%u)\n", (unsigned int)total);
delay(500);
tbegin = micros();
for (i=1;;i++) {
// do calculation
float_MatMult((float*) A, (float*)B, N,N,N, (float*)C);
// check if t_delay has passed
dt = micros() - tbegin;
if (dt > 1000000) break;
}
Serial.println("Optimized O3");
Serial.printf("(%dx%d) matrices: ", N, N);
Serial.printf("%d matrices in %d usec: ", i, dt);
Serial.printf("%d matrices/second\n", (int)((float)i*1000000/dt));
Serial.printf("Float (%d bytes) ", sizeof(float));
total = N*N*N*i*1e6 / (float)dt;
Serial.printf(" multiplications per second:\t(%u)\n", (unsigned int)total);
Serial.println();
delay(500);
tbegin = micros();
for (i=1;;i++) {
// do calculation
float_MatMult_noexperienceduser((float*) A, (float*)B, N,N,N, (float*)C);
// check if t_delay has passed
dt = micros() - tbegin;
if (dt > 1000000) break;
}
Serial.println("Optimized unexperienced");
Serial.printf("(%dx%d) matrices: ", N, N);
Serial.printf("%d matrices in %d usec: ", i, dt);
Serial.printf("%d matrices/second\n", (int)((float)i*1000000/dt));
Serial.printf("Float (%d bytes) ", sizeof(float));
total = N*N*N*i*1e6 / (float)dt;
Serial.printf(" multiplications per second:\t(%u)\n", (unsigned int)total);
delay(500);
tbegin = micros();
for (i=1;;i++) {
// do calculation
float_MatMult_noexperienceduserO2((float*) A, (float*)B, N,N,N, (float*)C);
// check if t_delay has passed
dt = micros() - tbegin;
if (dt > 1000000) break;
}
Serial.println("Optimized unexperienced O2");
Serial.printf("(%dx%d) matrices: ", N, N);
Serial.printf("%d matrices in %d usec: ", i, dt);
Serial.printf("%d matrices/second\n", (int)((float)i*1000000/dt));
Serial.printf("Float (%d bytes) ", sizeof(float));
total = N*N*N*i*1e6 / (float)dt;
Serial.printf(" multiplications per second:\t(%u)\n", (unsigned int)total);
}
void loop() {
}

I added two variants which show what the compiler can archieve for not so experienced users.