whets.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612
  1. /*
  2. * Compile Linux Intel
  3. * cc whets.c cpuidc.c -lm -lrt -O3 -o whetstoneIL
  4. *
  5. * Cross Compile on Linux Intel for ARM
  6. * ~/toolchain/raspbian-toolchain-gcc-4.7.2-linux32/bin/arm-linux-gnueabihf-gcc
  7. whets.c cpuidc.c -lm -lrt -O3 -march=armv6 -mfloat-abi=hard -mfpu=vfp -o
  8. whetstonePiA6
  9. *
  10. * Compile on Raspberry Pi
  11. * gcc whets.c cpuidc.c -lm -lrt -O3 -march=armv6 -mfloat-abi=hard -mfpu=vfp -o
  12. whetstonePiA6
  13. *
  14. *************************************************************************
  15. *
  16. * Document: Whets.c
  17. * File Group: Classic Benchmarks
  18. * Creation Date: 6 November 1996
  19. * Revision Date: 6 November 2010 Ubuntu Version for PCs
  20. *
  21. * Title: Whetstone Benchmark in C/C++
  22. * Keywords: WHETSTONE BENCHMARK PERFORMANCE MIPS
  23. * MWIPS MFLOPS
  24. *
  25. * Abstract: C or C++ version of Whetstone one of the
  26. * Classic Numeric Benchmarks with example
  27. * results on P3 to P6 based PCs.
  28. *
  29. * Contributor: roy@roylongbottom.org.uk
  30. *
  31. ************************************************************
  32. *
  33. * C/C++ Whetstone Benchmark Single or Double Precision
  34. *
  35. * Original concept Brian Wichmann NPL 1960's
  36. * Original author Harold Curnow CCTA 1972
  37. * Self timing versions Roy Longbottom CCTA 1978/87
  38. * Optimisation control Bangor University 1987/90
  39. * C/C++ Version Roy Longbottom 1996
  40. * Compatibility & timers Al Aburto 1996
  41. *
  42. ************************************************************
  43. *
  44. * Official version approved by:
  45. *
  46. * Harold Curnow 100421.1615@compuserve.com
  47. *
  48. * Happy 25th birthday Whetstone, 21 November 1997
  49. *
  50. ************************************************************
  51. *
  52. * The program normally runs for about 100 seconds
  53. * (adjustable in main - variable duration). This time
  54. * is necessary because of poor PC clock resolution.
  55. * The original concept included such things as a given
  56. * number of subroutine calls and divides which may be
  57. * changed by optimisation. For comparison purposes the
  58. * compiler and level of optimisation should be identified.
  59. *
  60. * This version is set to run for 10 seconds using high
  61. * resolution timer.
  62. *
  63. ************************************************************
  64. *
  65. * The original benchmark had a single variable I which
  66. * controlled the running time. Constants with values up
  67. * to 899 were multiplied by I to control the number
  68. * passes for each loop. It was found that large values
  69. * of I could overflow index registers so an extra outer
  70. * loop with a second variable J was added.
  71. *
  72. * Self timing versions were produced during the early
  73. * days. The 1978 changes supplied timings of individual
  74. * loops and these were used later to produce MFLOPS and
  75. * MOPS ratings.
  76. *
  77. * 1987 changes converted the benchmark to Fortran 77
  78. * standards and removed redundant IF statements and
  79. * loops to leave the 8 active loops N1 to N8. Procedure
  80. * P3 was changed to use global variables to avoid over-
  81. * optimisation with the first two statements changed from
  82. * X1=X and Y1=Y to X=Y and Y=Z. A self time calibrating
  83. * version for PCs was also produced, the facility being
  84. * incorporated in this version.
  85. *
  86. * This version has changes to avoid worse than expected
  87. * speed ratings, due to underflow, and facilities to show
  88. * that consistent numeric output is produced with varying
  89. * optimisation levels or versions in different languages.
  90. *
  91. * Some of the procedures produce ever decreasing numbers.
  92. * To avoid problems, variables T and T1 have been changed
  93. * from 0.499975 and 0.50025 to 0.49999975 and 0.50000025.
  94. *
  95. * Each section now has its own double loop. Inner loops
  96. * are run 100 times the loop constants. Calibration
  97. * determines the number of outer loop passes. The
  98. * numeric results produced in the main output are for
  99. * one pass on the outer loop. As underflow problems were
  100. * still likely on a processor 100 times faster than a 100
  101. * MHz Pentium, three sections have T=1.0-T inserted in the
  102. * outer loop to avoid the problem. The two loops avoid
  103. * index register overflows.
  104. *
  105. * The first section is run ten times longer than required
  106. * for accuracy in calculating MFLOPS. This time is divided
  107. * by ten for inclusion in the MWIPS calculations.
  108. *
  109. * Early version has facilities for typing in details of
  110. * the particular run, appended to file whets.txt along
  111. * with the results. This version attempts to obtain these
  112. * automatically.
  113. *
  114. * 2010 Section 4 modified slightly to avoid over optimisation
  115. * by GCC compiler
  116. *
  117. * Roy Longbottom roy@roylongbottom.org.uk
  118. *
  119. ************************************************************
  120. *
  121. * Whetstone benchmark results, further details of the
  122. * benchmarks and history are available from:
  123. *
  124. * http://www.roylongbottom.org.uk/whetstone%20results.htm
  125. * http://www.roylongbottom.org.uk/whetstone.htm
  126. *
  127. ************************************************************
  128. *
  129. * Source code is available in C/C++, Fortran, Basic and
  130. * Visual Basic in the same format as this version. Pre-
  131. * compiled versions for PCs are also available via C++.
  132. * These comprise optimised and non-optimised versions
  133. * for DOS, Windows and NT. See:
  134. *
  135. * http://www.roylongbottom.org.uk/whetstone%20results.htm
  136. *
  137. ************************************************************
  138. *
  139. * Example of initial calibration display (Pentium 100 MHz)
  140. *
  141. * Single Precision C/C++ Whetstone Benchmark
  142. *
  143. * Calibrate
  144. * 0.17 Seconds 1 Passes (x 100)
  145. * 0.77 Seconds 5 Passes (x 100)
  146. * 3.70 Seconds 25 Passes (x 100)
  147. *
  148. * Use 676 passes (x 100)
  149. *
  150. * 676 passes are used for an approximate duration of 100
  151. * seconds, providing an initial estimate of a speed rating
  152. * of 67.6 MWIPS.
  153. *
  154. * This is followed by the table of results as below.
  155. * Whetstone Single Precision Benchmark in C/C++
  156. *
  157. * Loop content Result MFLOPS MOPS Seconds
  158. *
  159. * N1 floating point -1.12475025653839100 19.971 0.274
  160. * N2 floating point -1.12274754047393800 11.822 3.240
  161. * N3 if then else 1.00000000000000000 11.659 2.530
  162. * N4 fixed point 12.00000000000000000 13.962 6.430
  163. * N5 sin,cos etc. 0.49904659390449520 2.097 11.310
  164. * N6 floating point 0.99999988079071040 3.360 45.750
  165. * N7 assignments 3.00000000000000000 2.415 21.810
  166. * N8 exp,sqrt etc. 0.75110864639282230 1.206 8.790
  167. *
  168. * MWIPS 28.462 100.134
  169. *
  170. * Note different numeric results to single precision. Slight variations
  171. * are normal with different compilers and sometimes optimisation levels.
  172. *
  173. **************************************************************************
  174. */
  175. #include <math.h> /* for sin, exp etc. */
  176. #include <stdio.h>
  177. #include <stdlib.h>
  178. #include <string.h>
  179. // zcc report error stdatomic.h:201:17: error: unknown type name 'int_least8_t'; did you mean '__int_least8_t'?
  180. //#include "stdatomic.h"
  181. #include "nuclei_sdk_soc.h"
  182. #include "config.h"
  183. #include "cpuidh.h"
  184. /*PRECISION PRECISION PRECISION PRECISION PRECISION PRECISION PRECISION*/
  /* Version tag printed in the benchmark banner by main(). */
  185. #define Version "Roy Longbottom Version"
  /*
   * Forward declarations: whetstones() runs the eight timed sections,
   * pa()/po()/p3() are the kernels it calls, and pout() records and
   * (outside calibration) prints one section's result.
   */
  186. void whetstones(long xtra, long x100, int calibrate);
  187. void pa(SPDP e[4], SPDP t, SPDP t2);
  188. void po(SPDP e1[4], long j, long k, long l);
  189. void p3(SPDP* x, SPDP* y, SPDP* z, SPDP t, SPDP t1, SPDP t2);
  190. void pout(char *title, float ops, int type, SPDP checknum, SPDP time,
  191. int calibrate, int section);
  /*
   * Per-section tables written by pout() and indexed by its `section`
   * argument. whetstones() passes 1..8, so slot 0 is not used by the
   * calls in this file.
   */
  192. static SPDP loop_time[9];
  193. static SPDP loop_mops[9];
  194. static SPDP loop_mflops[9];
  /* Sum of the section times added by pout(); main() zeroes it before each run. */
  195. static SPDP TimeUsed;
  /* Final ratings computed in main(): MWIPS and MWIPS scaled by SystemCoreClock. */
  196. static SPDP mwips, mwips_mhz;
  /* Section titles copied in by pout(); each row holds up to 21 characters plus NUL. */
  197. static char headings[9][22];
  /* Running sum of section check values; reset by whetstones(), tested by main(). */
  198. static SPDP Check;
  /* Check value per section, stored by pout() only when calibrate == 1. */
  199. static SPDP results[9];
  /* Cycle and retired-instruction counter snapshots taken around the timed run in main(). */
  200. static uint64_t start_cycle, end_cycle, used_cycle;
  201. static uint64_t start_instret, end_instret, used_instret;
  /*
   * Render the three least-significant decimal digits of val as a
   * zero-padded, NUL-terminated string ("000" .. "999"). Larger values are
   * reduced modulo 1000 first.
   *
   * The returned pointer refers to a function-local static buffer, so each
   * call overwrites the text produced by the previous one.
   */
  static char *dec2str(uint32_t val)
  {
      static char digits[4];
      uint32_t rest = val % 1000;

      digits[0] = (char)('0' + rest / 100);        /* hundreds */
      digits[1] = (char)('0' + (rest / 10) % 10);  /* tens */
      digits[2] = (char)('0' + rest % 10);         /* units */
      digits[3] = '\0';

      return digits;
  }
  216. int main(void)
  217. {
  218. int count = 10, calibrate = 1;
  219. long xtra = 1;
  220. #if defined(CPU_SERIES) && CPU_SERIES == 100
  221. long x100 = 10;
  222. #else
  223. long x100 = 100;
  224. //NOTE: when no fpu present, use less passes
  225. #ifndef __riscv_flen
  226. x100 = x100 >> 2;
  227. #endif
  228. #endif
  229. #if CFG_SIMULATION
  230. int duration = 1;
  231. #else
  232. int duration = 3;
  233. #endif
  234. printf("\n");
  235. #if defined(CPU_SERIES) && CPU_SERIES < 300
  236. printf("100 and 200 series CPU have no FPU, running Whetstone is meaningless for these CPU.\n");
  237. #endif
  238. printf("##########################################\n");
  239. printf("%s Precision C Whetstone Benchmark %s \n", Precision, Version);
  240. printf("Calibrate\n");
  241. do {
  242. TimeUsed = 0;
  243. whetstones(xtra, x100, calibrate);
  244. printf("%11.2f Seconds %10.0lf Passes (x %d)\n", TimeUsed,
  245. (SPDP)(xtra), x100);
  246. calibrate++;
  247. count--;
  248. #if CFG_SIMULATION
  249. if (TimeUsed > 0.02)
  250. #else
  251. #if defined(CPU_SERIES) && CPU_SERIES == 100
  252. if (TimeUsed > 0.1)
  253. #else
  254. if (TimeUsed > 0.2)
  255. #endif
  256. #endif
  257. {
  258. count = 0;
  259. } else {
  260. xtra = xtra * 5;
  261. }
  262. } while (count > 0);
  263. if (TimeUsed > 0) {
  264. xtra = (long)((SPDP)(duration * xtra) / TimeUsed);
  265. }
  266. if (xtra < 1) {
  267. xtra = 1;
  268. }
  269. calibrate = 0;
  270. printf("\nUse %u passes (x %d)\n", (uint32_t)xtra, x100);
  271. printf("\n %s Precision C/C++ Whetstone Benchmark", Precision);
  272. #ifdef PRECOMP
  273. printf("\n Compiler %s", precompiler);
  274. printf("\n Options %s\n", preoptions);
  275. #else
  276. printf("\n");
  277. #endif
  278. printf("\nLoop content Result MFLOPS "
  279. " MOPS Seconds\n\n");
  280. // reset instret and cycle
  281. __set_rv_cycle(0);
  282. __set_rv_instret(0);
  283. start_cycle = __get_rv_cycle();
  284. start_instret = __get_rv_instret();
  285. TimeUsed = 0;
  286. whetstones(xtra, x100, calibrate);
  287. end_cycle = __get_rv_cycle();
  288. end_instret = __get_rv_instret();
  289. used_cycle = end_cycle - start_cycle;
  290. used_instret = end_instret - start_instret;
  291. printf("\nMWIPS ");
  292. if (TimeUsed > 0) {
  293. mwips = (float)(xtra) * (float)(x100) / (10 * TimeUsed);
  294. } else {
  295. mwips = 0;
  296. }
  297. printf("%39.3f%19.3f\n\n", mwips, TimeUsed);
  298. printf("\nMWIPS/MHz ");
  299. mwips_mhz = mwips / SystemCoreClock * 1000000;
  300. printf("%39.3f%19.3f\n\n", mwips_mhz, TimeUsed);
  301. uint32_t whet_mwips = (uint32_t)(mwips_mhz * 1000);
  302. char *pstr = dec2str(whet_mwips);
  303. printf("\nCSV, Benchmark, MWIPS/MHz\n");
  304. printf("CSV, Whetstone, %u.%s\n", (unsigned int)(whet_mwips/1000), pstr);
  305. float f_ipc = (((float)used_instret / used_cycle));
  306. uint32_t i_ipc = (uint32_t)(f_ipc * 1000);
  307. pstr = dec2str(i_ipc);
  308. printf("IPC = Instret/Cycle = %u/%u = %u.%s\n", (unsigned int)used_instret, (unsigned int)used_cycle, (unsigned int)(i_ipc/1000), pstr);
  309. if (Check == 0) {
  310. printf("Wrong answer \n");
  311. return -1;
  312. }
  313. return 0;
  314. }
  /*
   * Run the eight Whetstone sections once each and report them through pout().
   *
   *   xtra      - number of outer-loop passes for every section
   *   x100      - multiplier applied to each section's loop constant (n1..n8)
   *   calibrate - passed unchanged to pout()
   *
   * Check is reset to 0.0 on entry. Every section is bracketed by
   * start_time()/end_time() and then hands `secs` to pout(); `secs` is
   * declared outside this file (presumably set by end_time() - confirm in
   * cpuidh.h).
   */
  315. void whetstones(long xtra, long x100, int calibrate)
  316. {
  317. long n1, n2, n3, n4, n5, n6, n7, n8, i, ix, n1mult;
  318. SPDP x, y, z;
  319. long j, k, l;
  320. SPDP e1[4];
  321. SPDP t = 0.49999975;
  /* t0 keeps the initial t so it can be restored after sections that flip it. */
  322. SPDP t0 = t;
  323. SPDP t1 = 0.50000025;
  324. SPDP t2 = 2.0;
  325. Check = 0.0;
  /* Inner-loop trip counts: the classic Whetstone loop constants scaled by x100. */
  326. n1 = 12 * x100;
  327. n2 = 14 * x100;
  328. n3 = 345 * x100;
  329. n4 = 210 * x100;
  330. n5 = 32 * x100;
  331. n6 = 899 * x100;
  332. n7 = 616 * x100;
  333. n8 = 93 * x100;
  /* Section 1 runs n1mult times longer; its time is divided by n1mult below. */
  334. n1mult = 10;
  335. /* Section 1, Array elements */
  336. e1[0] = 1.0;
  337. e1[1] = -1.0;
  338. e1[2] = -1.0;
  339. e1[3] = -1.0;
  340. start_time();
  341. {
  342. for (ix = 0; ix < xtra; ix++) {
  343. for (i = 0; i < n1 * n1mult; i++) {
  344. e1[0] = (e1[0] + e1[1] + e1[2] - e1[3]) * t;
  345. e1[1] = (e1[0] + e1[1] - e1[2] + e1[3]) * t;
  346. e1[2] = (e1[0] - e1[1] + e1[2] + e1[3]) * t;
  347. e1[3] = (-e1[0] + e1[1] + e1[2] + e1[3]) * t;
  348. }
  /* Flip t each outer pass (the file header says this avoids underflow). */
  349. t = 1.0 - t;
  350. }
  351. t = t0;
  352. }
  353. end_time();
  354. secs = secs / (SPDP)(n1mult);
  355. pout("N1 floating point\0", (float)(n1 * 16) * (float)(xtra), 1, e1[3],
  356. secs, calibrate, 1);
  357. /* Section 2, Array as parameter */
  /* e1 is not re-initialised here; it carries over from section 1. */
  358. start_time();
  359. {
  360. for (ix = 0; ix < xtra; ix++) {
  361. for (i = 0; i < n2; i++) {
  362. pa(e1, t, t2);
  363. }
  364. t = 1.0 - t;
  365. }
  366. t = t0;
  367. }
  368. end_time();
  369. pout("N2 floating point\0", (float)(n2 * 96) * (float)(xtra), 1, e1[3],
  370. secs, calibrate, 2);
  371. /* Section 3, Conditional jumps */
  372. j = 1;
  373. start_time();
  374. {
  375. for (ix = 0; ix < xtra; ix++) {
  376. for (i = 0; i < n3; i++) {
  377. if (j == 1) {
  378. j = 2;
  379. } else {
  380. j = 3;
  381. }
  382. if (j > 2) {
  383. j = 0;
  384. } else {
  385. j = 1;
  386. }
  387. if (j < 1) {
  388. j = 1;
  389. } else {
  390. j = 0;
  391. }
  392. }
  393. }
  394. }
  395. end_time();
  396. pout("N3 if then else \0", (float)(n3 * 3) * (float)(xtra), 2, (SPDP)(j),
  397. secs, calibrate, 3);
  398. /* Section 4, Integer arithmetic */
  399. j = 1;
  400. k = 2;
  401. l = 3;
  402. e1[0] = 0.0;
  403. e1[1] = 0.0;
  404. start_time();
  405. {
  406. for (ix = 0; ix < xtra; ix++) {
  407. for (i = 0; i < n4; i++) {
  /* Starting from j=1, k=2, l=3 these three updates reproduce 1, 2, 3,
     so l - 2 and k - 2 index e1[1] and e1[0]. */
  408. j = j * (k - j) * (l - k);
  409. k = l * k - (l - j) * k;
  410. l = (l - k) * (k + j);
  411. e1[l - 2] = e1[l - 2] + j + k + l;
  412. e1[k - 2] = e1[k - 2] + j * k * l;
  413. // was e1[l-2] = j + k + l; and e1[k-2] = j * k * l;
  414. }
  415. }
  416. }
  417. end_time();
  /* Average the accumulated sums back down to a per-iteration value. */
  418. x = (e1[0] + e1[1]) / (SPDP)n4 / (SPDP)xtra; // was x = e1[0]+e1[1];
  419. pout("N4 fixed point \0", (float)(n4 * 15) * (float)(xtra), 2, x, secs,
  420. calibrate, 4);
  421. /* Section 5, Trig functions */
  422. x = 0.5;
  423. y = 0.5;
  424. start_time();
  425. {
  426. for (ix = 0; ix < xtra; ix++) {
  /* Note: this inner loop starts at i = 1, i.e. n5 - 1 iterations per pass. */
  427. for (i = 1; i < n5; i++) {
  428. x = t * atan(t2 * sin(x) * cos(x) /
  429. (cos(x + y) + cos(x - y) - 1.0));
  430. y = t * atan(t2 * sin(y) * cos(y) /
  431. (cos(x + y) + cos(x - y) - 1.0));
  432. }
  433. t = 1.0 - t;
  434. }
  435. t = t0;
  436. }
  437. end_time();
  438. pout("N5 sin,cos etc. \0", (float)(n5 * 26) * (float)(xtra), 2, y, secs,
  439. calibrate, 5);
  440. /* Section 6, Procedure calls */
  441. x = 1.0;
  442. y = 1.0;
  443. z = 1.0;
  444. start_time();
  445. {
  446. for (ix = 0; ix < xtra; ix++) {
  447. for (i = 0; i < n6; i++) {
  448. p3(&x, &y, &z, t, t1, t2);
  449. }
  450. }
  451. }
  452. end_time();
  453. pout("N6 floating point\0", (float)(n6 * 6) * (float)(xtra), 1, z, secs,
  454. calibrate, 6);
  455. /* Section 7, Array references */
  456. j = 0;
  457. k = 1;
  458. l = 2;
  459. e1[0] = 1.0;
  460. e1[1] = 2.0;
  461. e1[2] = 3.0;
  462. start_time();
  463. {
  464. for (ix = 0; ix < xtra; ix++) {
  465. for (i = 0; i < n7; i++) {
  466. po(e1, j, k, l);
  467. }
  468. }
  469. }
  470. end_time();
  471. pout("N7 assignments \0", (float)(n7 * 3) * (float)(xtra), 2, e1[2], secs,
  472. calibrate, 7);
  473. /* Section 8, Standard functions */
  474. x = 0.75;
  475. start_time();
  476. {
  477. for (ix = 0; ix < xtra; ix++) {
  478. for (i = 0; i < n8; i++) {
  479. x = sqrt(exp(log(x) / t1));
  480. }
  481. }
  482. }
  483. end_time();
  484. pout("N8 exp,sqrt etc. \0", (float)(n8 * 4) * (float)(xtra), 2, x, secs,
  485. calibrate, 8);
  486. return;
  487. }
  488. void pa(SPDP e[4], SPDP t, SPDP t2)
  489. {
  490. long j;
  491. for (j = 0; j < 6; j++) {
  492. e[0] = (e[0] + e[1] + e[2] - e[3]) * t;
  493. e[1] = (e[0] + e[1] - e[2] + e[3]) * t;
  494. e[2] = (e[0] - e[1] + e[2] + e[3]) * t;
  495. e[3] = (-e[0] + e[1] + e[2] + e[3]) / t2;
  496. }
  497. return;
  498. }
  /*
   * Section 7 kernel: three chained array assignments on e1 using the
   * caller-supplied indices j, k and l. The last statement reads e1[j]
   * after it has already been overwritten, so (for distinct indices) e1[l]
   * ends up holding the value e1[k] had on entry. Indices are not
   * range-checked here.
   */
  499. void po(SPDP e1[4], long j, long k, long l)
  500. {
  501. e1[j] = e1[k];
  502. e1[k] = e1[l];
  503. e1[l] = e1[j];
  504. return;
  505. }
  /*
   * Section 6 kernel: update the three values behind x, y and z in place.
   * *x and *y are first overwritten with *y and *z; then *x is rescaled by
   * t, *y by t1, and *z is set to their sum divided by t2. Every statement
   * goes through the pointers and depends on the one before it.
   */
  506. void p3(SPDP* x, SPDP* y, SPDP* z, SPDP t, SPDP t1, SPDP t2)
  507. {
  /* Shift the inputs down: x takes y's value, y takes z's. */
  508. *x = *y;
  509. *y = *z;
  510. *x = t * (*x + *y);
  511. *y = t1 * (*x + *y);
  512. *z = (*x + *y) / t2;
  513. return;
  514. }
  515. void pout(char *title, float ops, int type, SPDP checknum, SPDP time,
  516. int calibrate, int section)
  517. {
  518. SPDP mops, mflops;
  519. Check = Check + checknum;
  520. loop_time[section] = time;
  521. strcpy(headings[section], title);
  522. TimeUsed = TimeUsed + time;
  523. if (calibrate == 1)
  524. {
  525. results[section] = checknum;
  526. }
  527. if (calibrate == 0) {
  528. printf("%s %20.17f ", headings[section], results[section]);
  529. if (type == 1) {
  530. if (time > 0) {
  531. mflops = ops / (1000000L * time);
  532. } else {
  533. mflops = 0;
  534. }
  535. loop_mops[section] = 99999;
  536. loop_mflops[section] = mflops;
  537. printf(" %9.3f %9.3f\n", loop_mflops[section],
  538. loop_time[section]);
  539. } else {
  540. if (time > 0) {
  541. mops = ops / (1000000L * time);
  542. } else {
  543. mops = 0;
  544. }
  545. loop_mops[section] = mops;
  546. loop_mflops[section] = 0;
  547. printf(" %9.3f%9.3f\n", loop_mops[section],
  548. loop_time[section]);
  549. }
  550. }
  551. return;
  552. }