6 lat temu · db6a30b446
--- a/components/esp32/linker.lf
+++ b/components/esp32/linker.lf
@@ -7,6 +7,7 @@ entries:
 
				 archive: libgcc.a
			
 
				 entries:
			
 
				     lib2funcs (noflash_text)
			
 
				+    _divsf3 (noflash)
			
 
				 
			
 
				 [mapping:gcov]
			
 
				 archive: libgcov.a
			
--- a/components/esp32/test/test_fp.c
+++ b/components/esp32/test/test_fp.c
@@ -1,8 +1,14 @@
 
				 #include <math.h>
			
 
				 #include <stdio.h>
			
 
				+#include "soc/cpu.h"
			
 
				 #include "freertos/FreeRTOS.h"
			
 
				 #include "freertos/task.h"
			
 
				 #include "unity.h"
			
 
				+#include "test_utils.h"
			
 
				+
			
 
				+/* Note: these functions are included here for unit test purposes. They are not needed for writing
			
 
				+ * normal code. When writing standard C floating point code, libgcc should already provide implementations
			
 
				+ * that use the floating point registers correctly. */
			
 
				 
			
 
				 static float addsf(float a, float b)
			
 
				 {
			
@@ -48,7 +54,7 @@ static float divsf(float a, float b)
 
				         "const.s f2, 0 \n"
			
 
				         "neg.s f9, f8 \n"
			
 
				         "maddn.s f5,f4,f6 \n"
			
 
				-        "maddn.s f2, f0, f3 \n"
			
 
				+        "maddn.s f2, f9, f3 \n"
			
 
				         "mkdadj.s f7, f0 \n"
			
 
				         "maddn.s f6,f5,f6 \n"
			
 
				         "maddn.s f9,f4,f2 \n"
			
@@ -191,3 +197,70 @@ TEST_CASE("context switch saves FP registers", "[fp]")
 
				     }
			
 
				     TEST_ASSERT(state.fail == 0);
			
 
				 }
			
 
				+
			
 
				+/*
				+ * Benchmark helper: starting from MAXFLOAT, divides a float by a constant
				+ * `counts` times and reports the average CCOUNT delta per loop iteration.
				+ *
				+ * counts  number of divisions to perform; if <= 0 no work is done
				+ * cycles  out-parameter; receives (CCOUNT delta) / counts, or 0 when counts <= 0
				+ *
				+ * Returns the final quotient (MAXFLOAT unchanged when counts <= 0).
				+ *
				+ * Note: not static, to avoid optimisation of const result
				+ */
				+float IRAM_ATTR test_fp_benchmark_fp_divide(int counts, unsigned *cycles)
				+{
				+    float f = MAXFLOAT;
				+    uint32_t before, after;
				+
				+    /* The average below divides by counts: reject zero (division by zero)
				+     * and negative values (would be converted to a huge unsigned divisor). */
				+    if (counts <= 0) {
				+        *cycles = 0;
				+        return f;
				+    }
				+
				+    RSR(CCOUNT, before);
				+
				+    for (int i = 0; i < counts; i++) {
				+        f /= 1.000432f;
				+    }
				+
				+    RSR(CCOUNT, after);
				+    /* uint32_t subtraction is modular, so the delta is still right if the
				+     * sampled value wrapped once between the two reads. */
				+    *cycles = (after - before) / (uint32_t)counts;
				+
				+    return f;
				+}
			
 
				+
			
 
				+/* Runs the division benchmark, prints the result and the average cycle count,
				+ * and checks the cycle count against the ESP32_CYCLES_PER_DIV performance threshold. */
				+TEST_CASE("floating point division performance", "[fp]")
				+{
				+    const unsigned COUNTS = 1000;
				+    unsigned cycles = 0;
				+
				+    // initialize fpu
				+    // (presumably so first-use FPU setup is not counted in the measurement - TODO confirm)
				+    volatile __attribute__((unused)) float dummy = sqrtf(rand());
				+
				+    float f = test_fp_benchmark_fp_divide(COUNTS, &cycles);
				+
				+    // COUNTS and cycles are unsigned, so they are printed with %u rather than %d
				+    printf("%u divisions from %f = %f\n", COUNTS, MAXFLOAT, f);
				+    printf("Per division = %u cycles\n", cycles);
				+
				+    // NOTE(review): this format string is consumed by the macro, whose definition
				+    // is not visible here, so it is passed through unchanged.
				+    TEST_PERFORMANCE_LESS_THAN(ESP32_CYCLES_PER_DIV, "%d cycles", cycles);
				+}
			
 
				+
			
 
				+/*
				+ * Benchmark helper: starting from MAXFLOAT, repeatedly replaces a float with
				+ * its square root `counts` times and reports the average CCOUNT delta per
				+ * loop iteration.
				+ *
				+ * counts  number of sqrtf() calls to perform; if <= 0 no work is done
				+ * cycles  out-parameter; receives (CCOUNT delta) / counts, or 0 when counts <= 0
				+ *
				+ * Returns the final value (MAXFLOAT unchanged when counts <= 0).
				+ *
				+ * Note: not static, to avoid optimisation of const result
				+ */
				+float IRAM_ATTR test_fp_benchmark_fp_sqrt(int counts, unsigned *cycles)
				+{
				+    float f = MAXFLOAT;
				+    uint32_t before, after;
				+
				+    /* The average below divides by counts: reject zero (division by zero)
				+     * and negative values (would be converted to a huge unsigned divisor). */
				+    if (counts <= 0) {
				+        *cycles = 0;
				+        return f;
				+    }
				+
				+    RSR(CCOUNT, before);
				+
				+    for (int i = 0; i < counts; i++) {
				+        f = sqrtf(f);
				+    }
				+
				+    RSR(CCOUNT, after);
				+    /* uint32_t subtraction is modular, so the delta is still right if the
				+     * sampled value wrapped once between the two reads. */
				+    *cycles = (after - before) / (uint32_t)counts;
				+
				+    return f;
				+}
			
 
				+
			
 
				+/* Runs the square root benchmark, prints the result and the average cycle count,
				+ * and checks the cycle count against the ESP32_CYCLES_PER_SQRT performance threshold. */
				+TEST_CASE("floating point square root performance", "[fp]")
				+{
				+    const unsigned COUNTS = 200;
				+    unsigned cycles = 0;
				+
				+    // initialize fpu
				+    // (presumably so first-use FPU setup is not counted in the measurement - TODO confirm)
				+    volatile __attribute__((unused)) float dummy = sqrtf(rand());
				+
				+    float f = test_fp_benchmark_fp_sqrt(COUNTS, &cycles);
				+
				+    // COUNTS and cycles are unsigned, so they are printed with %u rather than %d
				+    printf("%u square roots from %f = %f\n", COUNTS, MAXFLOAT, f);
				+    printf("Per sqrt = %u cycles\n", cycles);
				+
				+    // NOTE(review): this format string is consumed by the macro, whose definition
				+    // is not visible here, so it is passed through unchanged.
				+    TEST_PERFORMANCE_LESS_THAN(ESP32_CYCLES_PER_SQRT, "%d cycles", cycles);
				+}
			
 
				+
			
--- a/components/esp_rom/esp32/ld/esp32.rom.libgcc.ld
+++ b/components/esp_rom/esp32/ld/esp32.rom.libgcc.ld
@@ -24,8 +24,6 @@ __ctzsi2 = 0x4000c7f0;
 
				 __divdc3 = 0x400645a4;
			
 
				 __divdf3 = 0x40002954;
			
 
				 __divdi3 = 0x4000ca84;
			
 
				-__divsc3 = 0x4006429c;
			
 
				-__divsf3 = 0x4000234c;
			
 
				 __divsi3 = 0x4000c7b8;
			
 
				 __eqdf2 = 0x400636a8;
			
 
				 __eqsf2 = 0x40063374;
			
@@ -62,7 +60,6 @@ __modsi3 = 0x4000c7c0;
 
				 __muldc3 = 0x40063c90;
			
 
				 __muldf3 = 0x4006358c;
			
 
				 __muldi3 = 0x4000c9fc;
			
 
				-__mulsc3 = 0x40063944;
			
 
				 __mulsf3 = 0x400632c8;
			
 
				 __mulsi3 = 0x4000c7b0;
			
 
				 __mulvdi3 = 0x40002d78;
			
@@ -80,7 +77,6 @@ __popcount_tab = 0x3ff96544;
 
				 __popcountdi2 = 0x40002ef8;
			
 
				 __popcountsi2 = 0x40002ed0;
			
 
				 __powidf2 = 0x400638e4;
			
 
				-__powisf2 = 0x4006389c;
			
 
				 __subdf3 = 0x400026e4;
			
 
				 __subsf3 = 0x400021d0;
			
 
				 __subvdi3 = 0x40002d20;
			
--- a/components/idf_test/include/idf_performance.h
+++ b/components/idf_test/include/idf_performance.h
@@ -27,4 +27,7 @@
 
				 #define IDF_PERFORMANCE_MAX_ESP32_TIME_SHA512_32KB                              4500
			
 
				 // AES-CBC hardware throughput (accounts for worst-case performance with PSRAM workaround)
			
 
				 #define IDF_PERFORMANCE_MIN_AES_CBC_THROUGHPUT_MBSEC                            8.5
			
 
				+// maximum CPU cycles per floating point divide and per sqrt (configured for worst-case with PSRAM workaround)
			
 
				+#define IDF_PERFORMANCE_MAX_ESP32_CYCLES_PER_DIV 70
			
 
				+#define IDF_PERFORMANCE_MAX_ESP32_CYCLES_PER_SQRT 140