test_fp.c 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. #include <math.h>
  2. #include <stdio.h>
  3. #include "soc/cpu.h"
  4. #include "freertos/FreeRTOS.h"
  5. #include "freertos/task.h"
  6. #include "unity.h"
  7. #include "test_utils.h"
/* Note: these functions are included here for unit test purposes only. They are not needed when
 * writing normal code. When writing standard C floating point code, libgcc should already provide
 * implementations that use the floating point registers correctly. */
  11. static float addsf(float a, float b)
  12. {
  13. float result;
  14. asm volatile (
  15. "wfr f0, %1\n"
  16. "wfr f1, %2\n"
  17. "add.s f2, f0, f1\n"
  18. "rfr %0, f2\n"
  19. :"=r"(result):"r"(a), "r"(b)
  20. );
  21. return result;
  22. }
  23. static float mulsf(float a, float b)
  24. {
  25. float result;
  26. asm volatile (
  27. "wfr f0, %1\n"
  28. "wfr f1, %2\n"
  29. "mul.s f2, f0, f1\n"
  30. "rfr %0, f2\n"
  31. :"=r"(result):"r"(a), "r"(b)
  32. );
  33. return result;
  34. }
  35. static float divsf(float a, float b)
  36. {
  37. float result;
  38. asm volatile (
  39. "wfr f0, %1\n"
  40. "wfr f1, %2\n"
  41. "div0.s f3, f1 \n"
  42. "nexp01.s f4, f1 \n"
  43. "const.s f5, 1 \n"
  44. "maddn.s f5, f4, f3 \n"
  45. "mov.s f6, f3 \n"
  46. "mov.s f7, f1 \n"
  47. "nexp01.s f8, f0 \n"
  48. "maddn.s f6, f5, f3 \n"
  49. "const.s f5, 1 \n"
  50. "const.s f2, 0 \n"
  51. "neg.s f9, f8 \n"
  52. "maddn.s f5,f4,f6 \n"
  53. "maddn.s f2, f9, f3 \n"
  54. "mkdadj.s f7, f0 \n"
  55. "maddn.s f6,f5,f6 \n"
  56. "maddn.s f9,f4,f2 \n"
  57. "const.s f5, 1 \n"
  58. "maddn.s f5,f4,f6 \n"
  59. "maddn.s f2,f9,f6 \n"
  60. "neg.s f9, f8 \n"
  61. "maddn.s f6,f5,f6 \n"
  62. "maddn.s f9,f4,f2 \n"
  63. "addexpm.s f2, f7 \n"
  64. "addexp.s f6, f7 \n"
  65. "divn.s f2,f9,f6\n"
  66. "rfr %0, f2\n"
  67. :"=r"(result):"r"(a), "r"(b)
  68. );
  69. return result;
  70. }
  71. static float sqrtsf(float a)
  72. {
  73. float result;
  74. asm volatile (
  75. "wfr f0, %1\n"
  76. "sqrt0.s f2, f0\n"
  77. "const.s f5, 0\n"
  78. "maddn.s f5, f2, f2\n"
  79. "nexp01.s f3, f0\n"
  80. "const.s f4, 3\n"
  81. "addexp.s f3, f4\n"
  82. "maddn.s f4, f5, f3\n"
  83. "nexp01.s f5, f0\n"
  84. "neg.s f6, f5\n"
  85. "maddn.s f2, f4, f2\n"
  86. "const.s f1, 0\n"
  87. "const.s f4, 0\n"
  88. "const.s f7, 0\n"
  89. "maddn.s f1, f6, f2\n"
  90. "maddn.s f4, f2, f3\n"
  91. "const.s f6, 3\n"
  92. "maddn.s f7, f6, f2\n"
  93. "maddn.s f5, f1, f1\n"
  94. "maddn.s f6, f4, f2\n"
  95. "neg.s f3, f7\n"
  96. "maddn.s f1, f5, f3\n"
  97. "maddn.s f7, f6, f7\n"
  98. "mksadj.s f2, f0\n"
  99. "nexp01.s f5, f0\n"
  100. "maddn.s f5, f1, f1\n"
  101. "neg.s f3, f7\n"
  102. "addexpm.s f1, f2\n"
  103. "addexp.s f3, f2\n"
  104. "divn.s f1, f5, f3\n"
  105. "rfr %0, f1\n"
  106. :"=r"(result):"r"(a)
  107. );
  108. return result;
  109. }
  110. TEST_CASE("test FP add", "[fp]")
  111. {
  112. float a = 100.0f;
  113. float b = 0.5f;
  114. float c = addsf(a, b);
  115. float eps = c - 100.5f;
  116. printf("a=%g b=%g c=%g eps=%g\r\n", a, b, c, eps);
  117. TEST_ASSERT_TRUE(fabs(eps) < 0.000001);
  118. }
  119. TEST_CASE("test FP mul", "[fp]")
  120. {
  121. float a = 100.0f;
  122. float b = 0.05f;
  123. float c = mulsf(a, b);
  124. float eps = c - 5.0f;
  125. printf("a=%g b=%g c=%g eps=%g\r\n", a, b, c, eps);
  126. TEST_ASSERT_TRUE(fabs(eps) < 0.000001);
  127. }
  128. TEST_CASE("test FP div", "[fp]")
  129. {
  130. float a = 100.0f;
  131. float b = 5.0f;
  132. float c = divsf(a, b);
  133. float eps = c - 20.0f;
  134. printf("a=%g b=%g c=%g eps=%g\r\n", a, b, c, eps);
  135. TEST_ASSERT_TRUE(fabs(eps) < 0.000001);
  136. }
  137. TEST_CASE("test FP sqrt", "[fp]")
  138. {
  139. float a = 100.0f;
  140. float c = sqrtsf(a);
  141. float eps = c - 10.0f;
  142. printf("a=%g c=%g eps=%g\r\n", a, c, eps);
  143. TEST_ASSERT_TRUE(fabs(eps) < 0.000001);
  144. }
  145. struct TestFPState {
  146. int fail;
  147. int done;
  148. };
  149. static const int testFpIter = 100000;
  150. static void tskTestFP(void *pvParameters)
  151. {
  152. struct TestFPState *state = (struct TestFPState *) pvParameters;
  153. for (int i = 0; i < testFpIter; ++i) {
  154. // calculate zero in a slightly obscure way
  155. float y = sqrtsf(addsf(1.0f, divsf(mulsf(sqrtsf(2), sqrtsf(2)), 2.0f)));
  156. y = mulsf(y, y);
  157. y = addsf(y, -2.0f);
  158. // check that result is not far from zero
  159. float eps = fabs(y);
  160. if (eps > 1e-6f) {
  161. state->fail++;
  162. printf("%s: i=%d y=%f eps=%f\r\n", __func__, i, y, eps);
  163. }
  164. }
  165. state->done++;
  166. vTaskDelete(NULL);
  167. }
  168. TEST_CASE("context switch saves FP registers", "[fp]")
  169. {
  170. struct TestFPState state;
  171. state.done = 0;
  172. state.fail = 0;
  173. xTaskCreatePinnedToCore(tskTestFP, "tsk1", 2048, &state, 3, NULL, 0);
  174. xTaskCreatePinnedToCore(tskTestFP, "tsk2", 2048, &state, 3, NULL, 0);
  175. xTaskCreatePinnedToCore(tskTestFP, "tsk3", 2048, &state, 3, NULL, portNUM_PROCESSORS - 1);
  176. xTaskCreatePinnedToCore(tskTestFP, "tsk4", 2048, &state, 3, NULL, 0);
  177. while (state.done != 4) {
  178. vTaskDelay(100 / portTICK_PERIOD_MS);
  179. }
  180. if (state.fail) {
  181. const int total = testFpIter * 4;
  182. printf("Failed: %d, total: %d\r\n", state.fail, total);
  183. }
  184. TEST_ASSERT(state.fail == 0);
  185. }
  186. /* Note: not static, to avoid optimisation of const result */
  187. float IRAM_ATTR test_fp_benchmark_fp_divide(int counts, unsigned *cycles)
  188. {
  189. float f = MAXFLOAT;
  190. uint32_t before, after;
  191. RSR(CCOUNT, before);
  192. for (int i = 0; i < counts; i++) {
  193. f /= 1.000432f;
  194. }
  195. RSR(CCOUNT, after);
  196. *cycles = (after - before) / counts;
  197. return f;
  198. }
  199. TEST_CASE("floating point division performance", "[fp]")
  200. {
  201. const unsigned COUNTS = 1000;
  202. unsigned cycles = 0;
  203. // initialize fpu
  204. volatile __attribute__((unused)) float dummy = sqrtf(rand());
  205. float f = test_fp_benchmark_fp_divide(COUNTS, &cycles);
  206. printf("%d divisions from %f = %f\n", COUNTS, MAXFLOAT, f);
  207. printf("Per division = %d cycles\n", cycles);
  208. TEST_PERFORMANCE_LESS_THAN(ESP32_CYCLES_PER_DIV, "%d cycles", cycles);
  209. }
  210. /* Note: not static, to avoid optimisation of const result */
  211. float IRAM_ATTR test_fp_benchmark_fp_sqrt(int counts, unsigned *cycles)
  212. {
  213. float f = MAXFLOAT;
  214. uint32_t before, after;
  215. RSR(CCOUNT, before);
  216. for (int i = 0; i < counts; i++) {
  217. f = sqrtf(f);
  218. }
  219. RSR(CCOUNT, after);
  220. *cycles = (after - before) / counts;
  221. return f;
  222. }
  223. TEST_CASE("floating point square root performance", "[fp]")
  224. {
  225. const unsigned COUNTS = 200;
  226. unsigned cycles = 0;
  227. // initialize fpu
  228. volatile float __attribute__((unused)) dummy = sqrtf(rand());
  229. float f = test_fp_benchmark_fp_sqrt(COUNTS, &cycles);
  230. printf("%d square roots from %f = %f\n", COUNTS, MAXFLOAT, f);
  231. printf("Per sqrt = %d cycles\n", cycles);
  232. TEST_PERFORMANCE_LESS_THAN(ESP32_CYCLES_PER_SQRT, "%d cycles", cycles);
  233. }