great info, thanks
So if I use sqrtL and the like on a T3.5/3.6 does the program completely ignore the FPU when running?
(for clarity only - I don't need double maths)
great info, thanks
So if I use sqrtL and the like on a T3.5/3.6 does the program completely ignore the FPU when running?
(for clarity only - I don't need double maths)
#define CJ_ID "ARM_DSP.a"
/* test performance of arm_math.h/CMSIS-DSP functions
* see https://www.keil.com/pack/doc/CMSIS/DSP/html/group__SQRT.html
*/
#define LED LED_BUILTIN
#include <Streaming.h>
#include <arm_math.h>
// Teensy elapsed-time counters; setup() zeroes them before each timed section.
elapsedMillis tms; // ms timer
elapsedMicros tus; // μs timer
// x: benchmark operand stepped through the loops in setup(); y: result of sqrtf().
float32_t x,y;
// Result written by arm_sqrt_f32() through its output pointer.
float32_t xroot;
void setup(){
Serial.begin(115200);
while (!Serial && (millis() <= 4000)){
digitalWriteFast(LED,!digitalReadFast(LED));
delay(50);}
Serial << F("\n######## ") << CJ_ID << F(" ########\n");
Serial << " * Serial open, millis: " << millis() << '\n';
tms = 0;
//x = 4.67992; // sqrt = 2.163312275;
//tus = 0;
//arm_sqrt_f32(x, &xroot);
//Serial << "time: " << tus << "us" << '\n';
//Serial << "sqrt(" << x << ") = " << _FLOAT(xroot,10) << '\n';
float x1 = 0.1307;
float delta = 0.1307;
Serial << "--------\n";
Serial << "do 100 x square root calculations ..." << '\n';
x = x1;
tus = 0;
for (int i=0; i<100; i++){
arm_sqrt_f32(x, &xroot);
x = x + delta;
}
Serial << "arm_math time: " << tus << "us" << '\n';
Serial << "last sqrt(" << x << ") = " << _FLOAT(xroot,10) << '\n';
Serial << "--------\n";
x = x1;
tus = 0;
for (int i=0; i<100; i++){
y = sqrtf(x);
x = x + delta;
}
Serial << "sqrtf time: " << tus << "us" << '\n';
Serial << "last sqrt(" << x << ") = " << _FLOAT(y,10) << '\n';
Serial << "---- end setup [" << tms << "ms " << tus << "us] ----" << '\n';
}
// Intentionally empty: all of the work in this sketch happens once, in setup().
void loop() {}
Do I need to load the whole CMSIS library somehow, or can I just copy the four files listed by duff + libarm_cortexM4lf_math.a into
C:\Program Files (x86)\Arduino\hardware\teensy\avr\cores\teensy3 and then do the edits?
#include <arm_math.h>
#include <arm_const_structs.h>
Maybe arm_math doesn't use the FPU?
/**
 * @brief  Floating-point square root function.
 * @param[in]  in    input value
 * @param[out] pOut  receives the square root of the input, or 0.0f when the
 *                   input is rejected
 * @return ARM_MATH_SUCCESS when in >= 0, ARM_MATH_ARGUMENT_ERROR otherwise
 */
__STATIC_INLINE arm_status arm_sqrt_f32(
float32_t in,
float32_t * pOut)
{
  /* Zero is a valid argument (its root is 0); only negative input - or NaN,
     for which the comparison is false - is reported as an argument error. */
  if(in >= 0.0f)
  {
#if (__FPU_USED == 1) && defined ( __CC_ARM )
    *pOut = __sqrtf(in);
#elif (__FPU_USED == 1) && (defined ( __TMS_740 ) || defined ( __GNUC__ ))
    /* GCC gets the builtin as well instead of falling through to the
       library call below. */
    *pOut = __builtin_sqrtf(in);
#else
    *pOut = sqrtf(in);
#endif
    return (ARM_MATH_SUCCESS);
  }
  else
  {
    *pOut = 0.0f;
    return (ARM_MATH_ARGUMENT_ERROR);
  }
}
/**
 * @brief  Floating-point square root function.
 * @param[in]  in    input value
 * @param[out] pOut  receives the square root of the input, or 0.0f on error
 * @return ARM_MATH_SUCCESS when in >= 0, ARM_MATH_ARGUMENT_ERROR otherwise
 */
static __INLINE arm_status arm_sqrt_f32(
float32_t in,
float32_t * pOut)
{
  /* Guard clause: anything that is not >= 0 (negative values, NaN) is an error. */
  if(!(in >= 0.0f))
  {
    *pOut = 0.0f;
    return (ARM_MATH_ARGUMENT_ERROR);
  }

  /* Pick the square-root primitive for the active toolchain. */
#if (__FPU_USED == 1) && defined ( __CC_ARM )
  *pOut = __sqrtf(in);
#elif (__FPU_USED == 1) && ((defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)) || defined(__GNUC__))
  *pOut = __builtin_sqrtf(in);
#elif (__FPU_USED == 1) && defined ( __ICCARM__ ) && (__VER__ >= 6040000)
  __ASM("VSQRT.F32 %0,%1" : "=t"(*pOut) : "t"(in));
#else
  *pOut = sqrtf(in);
#endif

  return (ARM_MATH_SUCCESS);
}
STARTING TEST
ARM math time: microseconds = 2171 us
sqrtf time: microseconds = 1783 us
__builtin_sqrtf time: microseconds = 1784 us
sqrt time: microseconds = 60846 us
/* test performance of arm_math.h/CMSIS-DSP functions
* see https://www.keil.com/pack/doc/CMSIS/DSP/html/group__SQRT.html */
#define LED LED_BUILTIN
//#include <Streaming.h>
#include <arm_math.h>
// Teensy elapsed-time counters; setup() zeroes them before each timed section.
elapsedMillis tms; // ms timer
elapsedMicros tus; // μs timer
// x: benchmark operand stepped through the loops in setup(); y: result of the sqrt variants.
float32_t x,y;
// Result written by arm_sqrt_f32() through its output pointer.
float32_t xroot;
void setup(){
Serial.begin(115200);
while (!Serial && (millis() <= 4000)){
digitalWriteFast(LED,!digitalReadFast(LED));
delay(5000);}
Serial.println(" STARTING TEST");
//Serial.println("millis = "); Serial.print(millis);
tms = 0;
//x = 4.67992; // sqrt = 2.163312275;
//tus = 0;
//arm_sqrt_f32(x, &xroot);
//Serial << "time: " << tus << "us" << '\n';
//Serial << "sqrt(" << x << ") = " << _FLOAT(xroot,10) << '\n';
float x1 = 0.1307;
float delta = 0.1307;
x = x1;
tus = 0;
for (int i=0; i<10000; i++){
arm_sqrt_f32(x, &xroot);
x = x + delta;
}
Serial.print(" ARM math time: ");
Serial.print("microseconds = "); Serial.print(tus); Serial.println(" us");
// Serial << "arm_math time: " << tus << "us" << '\n';
// Serial << "sqrt(" << x << ") = " << _FLOAT(xroot,10) << '\n';
// Serial << "--------\n";
x = x1;
tms = 0;
tus = 0;
for (int i=0; i<10000; i++){
y = sqrtf(x);
x = x + delta;
}
Serial.print(" sqrtf time: ");
Serial.print("microseconds = "); Serial.print(tus); Serial.println(" us");
// Serial << "sqrtf time: " << tus << "us" << '\n';
// Serial << "sqrt(" << x << ") = " << _FLOAT(y,10) << '\n';
// Serial << "---- end setup [" << tms << "ms " << tus << "us] ----" << '\n';
x = x1;
tms = 0;
tus = 0;
for (int i=0; i<10000; i++){
y = __builtin_sqrtf(x);
x = x + delta;
}
Serial.print(" __builtin_sqrtf time: ");
Serial.print("microseconds = "); Serial.print(tus); Serial.println(" us");
x = x1;
tms = 0;
tus = 0;
for (int i=0; i<10000; i++){
y = sqrt(x);
x = x + delta;
}
Serial.print(" sqrt time: ");
Serial.print("microseconds = "); Serial.print(tus); Serial.println(" us");
}
// Intentionally empty: all of the work in this sketch happens once, in setup().
void loop() {}
// Magnitude of every bin: FFT_buffer is indexed as interleaved pairs
// (element 2*i and element 2*i + 1), one pair per output value.
for(i = 0; i < FFT_length; i++)
{
  const auto first  = FFT_buffer[2*i];
  const auto second = FFT_buffer[2*i + 1];
  FFT_magn[i] = sqrtf(first * first + second * second);
}
// CMSIS-DSP alternative to computing the magnitudes with a hand-written sqrtf() loop:
// reads FFT_buffer and writes FFT_length results into FFT_magn.
arm_cmplx_mag_f32(FFT_buffer, FFT_magn, FFT_length); // calculates sqrt(I*I + Q*Q) for each frequency bin of the FFT
I also tried comparing the difference between the following two versions:
Code:for(i = 0; i < FFT_length; i++) { FFT_magn[i] = sqrtf(FFT_buffer[(i*2)] * FFT_buffer[(i*2)] + FFT_buffer[(i*2) + 1] * FFT_buffer[(i*2) + 1]); }
AND
// Manually unrolled magnitude computation: four bins per iteration.
// The main loop only runs while a complete group of four remains, so it can
// no longer read or write past the end of the arrays when FFT_length is not a
// multiple of 4; the second loop handles the 0-3 leftover bins.
for(i = 0; i + 3 < FFT_length; i += 4)
{
  FFT_magn[i]     = sqrtf(FFT_buffer[(i*2)]     * FFT_buffer[(i*2)]     + FFT_buffer[(i*2) + 1] * FFT_buffer[(i*2) + 1]);
  FFT_magn[i + 1] = sqrtf(FFT_buffer[(i*2) + 2] * FFT_buffer[(i*2) + 2] + FFT_buffer[(i*2) + 3] * FFT_buffer[(i*2) + 3]);
  FFT_magn[i + 2] = sqrtf(FFT_buffer[(i*2) + 4] * FFT_buffer[(i*2) + 4] + FFT_buffer[(i*2) + 5] * FFT_buffer[(i*2) + 5]);
  FFT_magn[i + 3] = sqrtf(FFT_buffer[(i*2) + 6] * FFT_buffer[(i*2) + 6] + FFT_buffer[(i*2) + 7] * FFT_buffer[(i*2) + 7]);
}
for(; i < FFT_length; i++)
{
  FFT_magn[i] = sqrtf(FFT_buffer[(i*2)] * FFT_buffer[(i*2)] + FFT_buffer[(i*2) + 1] * FFT_buffer[(i*2) + 1]);
}
The magic compiler flag you need for fast "sqrtf" is "-fno-math-errno". "-ffast-math" also enables a bunch of other stuff that can be problematic. The ARM Cortex-M4F microprocessors have a square root instruction for single-precision floating point, but the GCC compiler typically does not use the instruction for sqrtf unless you use the -ffast-math option (or some of the more specific fast-math options). This is because the ISO C specification says that the global error variable (errno) may be set if the input value is out of bounds.
In my tests, you get exactly the same code with sqrtf and __builtin_sqrtf, regardless of optimization options. With "-Os", you get a math library call; with -O1 / -O2 / -O3, you get an inlined hardware vsqrt.f32 instruction along with a ton of error handling code (unless you used "-fno-math-errno"). If you don't need the error checking, you can use the '__builtin_sqrtf' function instead, and it will generate the direct instruction (note there are 2 leading underscores).