Przeglądaj źródła

Merge branch 'main' into gitbook

Wenyong Huang 2 lat temu
rodzic
commit
6ba04db0e7
90 zmienionych plików z 4746 dodań i 516 usunięć
  1. 4 82
      .github/workflows/compilation_on_android_ubuntu.yml
  2. 1 1
      .github/workflows/compilation_on_nuttx.yml
  3. 67 137
      .github/workflows/compilation_on_sgx.yml
  4. 622 0
      .github/workflows/nightly_run.yml
  5. 1 1
      .github/workflows/spec_test_on_nuttx.yml
  6. 10 0
      ATTRIBUTIONS.md
  7. 39 0
      RELEASE_NOTES.md
  8. 1 1
      build-scripts/build_llvm.py
  9. 26 0
      build-scripts/config_common.cmake
  10. 1 1
      build-scripts/requirements.txt
  11. 4 0
      core/config.h
  12. 296 7
      core/iwasm/aot/aot_loader.c
  13. 9 0
      core/iwasm/aot/aot_reloc.h
  14. 535 0
      core/iwasm/aot/aot_runtime.c
  15. 98 0
      core/iwasm/aot/aot_runtime.h
  16. 6 0
      core/iwasm/aot/arch/aot_reloc_x86_32.c
  17. 18 5
      core/iwasm/aot/arch/aot_reloc_x86_64.c
  18. 6 1
      core/iwasm/common/wasm_memory.c
  19. 35 1
      core/iwasm/common/wasm_runtime_common.c
  20. 1 0
      core/iwasm/common/wasm_runtime_common.h
  21. 1 1
      core/iwasm/common/wasm_shared_memory.c
  22. 8 0
      core/iwasm/compilation/aot_compiler.h
  23. 284 32
      core/iwasm/compilation/aot_emit_aot_file.c
  24. 1 5
      core/iwasm/compilation/aot_emit_function.c
  25. 185 48
      core/iwasm/compilation/aot_emit_memory.c
  26. 1 1
      core/iwasm/compilation/aot_emit_memory.h
  27. 1 1
      core/iwasm/compilation/aot_emit_variable.c
  28. 105 18
      core/iwasm/compilation/aot_llvm.c
  29. 40 3
      core/iwasm/compilation/aot_llvm.h
  30. 60 7
      core/iwasm/compilation/aot_llvm_extra.cpp
  31. 52 22
      core/iwasm/compilation/simd/simd_load_store.c
  32. 3 0
      core/iwasm/include/aot_export.h
  33. 31 4
      core/iwasm/include/wasm_export.h
  34. 18 3
      core/iwasm/interpreter/wasm_interp_classic.c
  35. 12 3
      core/iwasm/interpreter/wasm_interp_fast.c
  36. 2 1
      core/iwasm/interpreter/wasm_loader.c
  37. 2 1
      core/iwasm/interpreter/wasm_mini_loader.c
  38. 0 1
      core/iwasm/libraries/lib-pthread/lib_pthread_wrapper.c
  39. 1 1
      core/iwasm/libraries/libc-wasi/sandboxed-system-primitives/src/ssp_config.h
  40. 12 0
      core/shared/platform/common/posix/posix_thread.c
  41. 1 0
      core/shared/platform/include/platform_api_extension.h
  42. 14 0
      core/shared/platform/linux/platform_internal.h
  43. 1 0
      core/shared/platform/windows/platform_internal.h
  44. 1 1
      core/shared/platform/windows/shared_platform.cmake
  45. 1 1
      core/version.h
  46. 3 1
      doc/build_wasm_app.md
  47. 2 0
      language-bindings/python/README.md
  48. 1 0
      language-bindings/python/setup.py
  49. 2 0
      language-bindings/python/wamr-api/README.md
  50. 0 18
      language-bindings/python/wamr-api/samples/compile.sh
  51. 134 5
      product-mini/platforms/posix/main.c
  52. 0 1
      product-mini/platforms/windows/CMakeLists.txt
  53. 2 2
      samples/ref-types/src/hello.c
  54. 0 9
      samples/wasm-c-api/CMakeLists.txt
  55. 1 1
      test-tools/wamr-ide/VSCode-Extension/package.json
  56. 24 43
      test-tools/wamr-ide/VSCode-Extension/src/debugConfigurationProvider.ts
  57. 0 1
      test-tools/wamr-ide/VSCode-Extension/src/extension.ts
  58. 62 0
      tests/benchmarks/README.md
  59. 2 0
      tests/benchmarks/coremark/README.md
  60. 7 0
      tests/benchmarks/coremark/build.sh
  61. 10 3
      tests/benchmarks/coremark/run.sh
  62. 50 0
      tests/benchmarks/coremark/test_pgo.sh
  63. 7 0
      tests/benchmarks/dhrystone/LICENSE
  64. 24 0
      tests/benchmarks/dhrystone/build.sh
  65. 306 0
      tests/benchmarks/dhrystone/include/dhry.h
  66. 19 0
      tests/benchmarks/dhrystone/run.sh
  67. 485 0
      tests/benchmarks/dhrystone/src/dhry_1.c
  68. 187 0
      tests/benchmarks/dhrystone/src/dhry_2.c
  69. 50 0
      tests/benchmarks/dhrystone/test_pgo.sh
  70. 2 0
      tests/benchmarks/jetstream/README.md
  71. 113 13
      tests/benchmarks/jetstream/build.sh
  72. 9 6
      tests/benchmarks/jetstream/jetstream.patch
  73. 15 3
      tests/benchmarks/jetstream/run_aot.sh
  74. 87 0
      tests/benchmarks/jetstream/test_pgo.sh
  75. 24 0
      tests/benchmarks/jetstream/tsf.patch
  76. 10 1
      tests/benchmarks/libsodium/build.sh
  77. 39 6
      tests/benchmarks/libsodium/run_aot.sh
  78. 116 0
      tests/benchmarks/libsodium/test_pgo.sh
  79. 8 0
      tests/benchmarks/polybench/build.sh
  80. 11 1
      tests/benchmarks/polybench/run_aot.sh
  81. 1 1
      tests/benchmarks/polybench/run_interp.sh
  82. 90 0
      tests/benchmarks/polybench/test_pgo.sh
  83. 2 0
      tests/benchmarks/sightglass/README.md
  84. 6 1
      tests/benchmarks/sightglass/build.sh
  85. 11 1
      tests/benchmarks/sightglass/run_aot.sh
  86. 2 2
      tests/benchmarks/sightglass/run_interp.sh
  87. 89 0
      tests/benchmarks/sightglass/test_pgo.sh
  88. 20 3
      tests/wamr-test-suites/spec-test-script/thread_proposal_ignore_cases.patch
  89. 19 2
      tests/wamr-test-suites/test_wamr.sh
  90. 77 1
      wamr-compiler/main.c

+ 4 - 82
.github/workflows/compilation_on_android_ubuntu.yml

@@ -76,7 +76,7 @@ jobs:
     with:
       os: "ubuntu-22.04"
       arch: "X86"
-
+  
   build_wamrc:
     needs:
       [build_llvm_libraries_on_ubuntu_2004, build_llvm_libraries_on_ubuntu_2204]
@@ -117,84 +117,6 @@ jobs:
           cmake --build . --config Release --parallel 4
         working-directory: wamr-compiler
 
-  build_iwasm_linux_gcc4_8:
-    runs-on: ubuntu-latest
-    container:
-      image: ubuntu:14.04
-    strategy:
-      matrix:
-        make_options_run_mode: [
-            # Running mode
-            $CLASSIC_INTERP_BUILD_OPTIONS,
-            $FAST_INTERP_BUILD_OPTIONS,
-            $FAST_JIT_BUILD_OPTIONS,
-          ]
-        make_options_feature: [
-            # Features
-            "-DWAMR_BUILD_CUSTOM_NAME_SECTION=1",
-            "-DWAMR_BUILD_DEBUG_AOT=1",
-            "-DWAMR_BUILD_DEBUG_INTERP=1",
-            "-DWAMR_BUILD_DUMP_CALL_STACK=1",
-            "-DWAMR_BUILD_LIB_PTHREAD=1",
-            "-DWAMR_BUILD_LIB_WASI_THREADS=1",
-            "-DWAMR_BUILD_LOAD_CUSTOM_SECTION=1",
-            "-DWAMR_BUILD_MINI_LOADER=1",
-            "-DWAMR_BUILD_MEMORY_PROFILING=1",
-            "-DWAMR_BUILD_MULTI_MODULE=1",
-            "-DWAMR_BUILD_PERF_PROFILING=1",
-            "-DWAMR_BUILD_REF_TYPES=1",
-            "-DWAMR_BUILD_SIMD=1",
-            "-DWAMR_BUILD_TAIL_CALL=1",
-            "-DWAMR_DISABLE_HW_BOUND_CHECK=1",
-          ]
-        exclude:
-          # uncompatiable feature and platform
-          # uncompatiable mode and feature
-          # MULTI_MODULE only on INTERP mode
-          - make_options_run_mode: $FAST_JIT_BUILD_OPTIONS
-            make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1"
-          # SIMD only on JIT/AOT mode
-          - make_options_run_mode: $CLASSIC_INTERP_BUILD_OPTIONS
-            make_options_feature: "-DWAMR_BUILD_SIMD=1"
-          - make_options_run_mode: $FAST_INTERP_BUILD_OPTIONS
-            make_options_feature: "-DWAMR_BUILD_SIMD=1"
-          # DEBUG_INTERP only on CLASSIC INTERP mode
-          - make_options_run_mode: $FAST_INTERP_BUILD_OPTIONS
-            make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1"
-          - make_options_run_mode: $FAST_JIT_BUILD_OPTIONS
-            make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1"
-          # DEBUG_AOT only on JIT/AOT mode
-          - make_options_run_mode: $CLASSIC_INTERP_BUILD_OPTIONS
-            make_options_feature: "-DWAMR_BUILD_DEBUG_AOT=1"
-          - make_options_run_mode: $FAST_INTERP_BUILD_OPTIONS
-            make_options_feature: "-DWAMR_BUILD_DEBUG_AOT=1"
-          # TODO: DEBUG_AOT on JIT
-          - make_options_run_mode: $FAST_JIT_BUILD_OPTIONS
-            make_options_feature: "-DWAMR_BUILD_DEBUG_AOT=1"
-          # MINI_LOADER only on INTERP mode
-          - make_options_run_mode: $FAST_JIT_BUILD_OPTIONS
-            make_options_feature: "-DWAMR_BUILD_MINI_LOADER=1"
-    steps:
-      - name: checkout
-        uses: actions/checkout@v3
-
-      - name: Install dependencies
-        run: apt update && apt install -y make g++-4.8 gcc-4.8 wget git
-
-      - name: Install cmake
-        run: |
-          wget https://github.com/Kitware/CMake/releases/download/v3.26.1/cmake-3.26.1-linux-x86_64.tar.gz -O cmake.tar.gz
-          tar xzf cmake.tar.gz
-          cp cmake-3.26.1-linux-x86_64/bin/cmake /usr/local/bin
-          cp -r cmake-3.26.1-linux-x86_64/share/cmake-3.26/ /usr/local/share/
-
-      - name: Build iwasm
-        run: |
-          mkdir build && cd build
-          cmake .. ${{ matrix.make_options_run_mode }} ${{ matrix.make_options_feature }} -DCMAKE_C_COMPILER=gcc-4.8 -DCMAKE_CXX_COMPILER=g++-4.8
-          cmake --build . --config Release --parallel 4
-        working-directory: product-mini/platforms/linux
-
   build_iwasm:
     needs:
       [build_llvm_libraries_on_ubuntu_2004, build_llvm_libraries_on_ubuntu_2204]
@@ -342,7 +264,6 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        sanitizer: ["", "ubsan"]
         make_options: [
             # Running mode
             $AOT_BUILD_OPTIONS,
@@ -400,14 +321,14 @@ jobs:
         if: (!endsWith(matrix.make_options, '_INTERP_BUILD_OPTIONS'))
         run: |
           mkdir build && cd build
-          cmake -DSANITIZER="${{matrix.sanitizer}}" ..
+          cmake ..
           cmake --build . --config Release --parallel 4
         working-directory: wamr-compiler
 
       - name: Build Sample [wasm-c-api]
         run: |
           VERBOSE=1
-          cmake -S . -B build ${{ matrix.make_options }} -DSANITIZER="${{matrix.sanitizer}}"
+          cmake -S . -B build ${{ matrix.make_options }}
           cmake --build build --config Release --parallel 4
           ctest --test-dir build --output-on-failure
         working-directory: samples/wasm-c-api
@@ -515,6 +436,7 @@ jobs:
       ]
     runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
         os: [ubuntu-20.04, ubuntu-22.04]
         running_mode:

+ 1 - 1
.github/workflows/compilation_on_nuttx.yml

@@ -94,7 +94,7 @@ jobs:
       - name: Install RISC-V Compilers
         if: contains(matrix.nuttx_board_config, 'risc-v')
         run: |
-          curl -L https://static.dev.sifive.com/dev-tools/freedom-tools/v2020.12/riscv64-unknown-elf-toolchain-10.2.0-2020.12.8-x86_64-linux-ubuntu14.tar.gz > riscv.tar.gz
+          curl -L -k https://static.dev.sifive.com/dev-tools/freedom-tools/v2020.12/riscv64-unknown-elf-toolchain-10.2.0-2020.12.8-x86_64-linux-ubuntu14.tar.gz > riscv.tar.gz
           tar xvf riscv.tar.gz
           echo "$PWD/riscv64-unknown-elf-toolchain-10.2.0-2020.12.8-x86_64-linux-ubuntu14/bin" >> $GITHUB_PATH
 

+ 67 - 137
.github/workflows/compilation_on_sgx.yml

@@ -51,6 +51,7 @@ env:
   AOT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0"
   CLASSIC_INTERP_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0"
   FAST_INTERP_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=1 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0"
+  FAST_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_FAST_JIT=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=1"
   LLVM_LAZY_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=1 -DWAMR_BUILD_LAZY_JIT=1"
   LLVM_EAGER_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=1 -DWAMR_BUILD_LAZY_JIT=0"
 
@@ -70,6 +71,7 @@ jobs:
             $AOT_BUILD_OPTIONS,
             $CLASSIC_INTERP_BUILD_OPTIONS,
             $FAST_INTERP_BUILD_OPTIONS,
+            $FAST_JIT_BUILD_OPTIONS,
             # Running modes unsupported
             #$LLVM_LAZY_JIT_BUILD_OPTIONS,
             #$LLVM_EAGER_JIT_BUILD_OPTIONS,
@@ -127,69 +129,24 @@ jobs:
           mkdir build && cd build
           cmake .. ${{ matrix.make_options_run_mode }} ${{ matrix.make_options_feature }}
           cmake --build . --config Release --parallel 4
+          cd ../enclave-sample
+          make
         working-directory: product-mini/platforms/${{ matrix.platform }}
 
-  build_wamrc:
-    needs: [build_llvm_libraries]
+  run_samples_file:
+    needs: [build_iwasm, build_llvm_libraries]
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        include:
-          - os: ubuntu-20.04
-            llvm_cache_key: ${{ needs.build_llvm_libraries.outputs.cache_key }}
-    steps:
-      - name: install SGX SDK and necessary libraries
-        run: |
-          mkdir -p /opt/intel
-          cd /opt/intel
-          wget https://download.01.org/intel-sgx/sgx-linux/2.15/distro/ubuntu20.04-server/sgx_linux_x64_sdk_2.15.100.3.bin
-          chmod +x sgx_linux_x64_sdk_2.15.100.3.bin
-          echo 'yes' | ./sgx_linux_x64_sdk_2.15.100.3.bin
-          echo 'deb [arch=amd64] https://download.01.org/intel-sgx/sgx_repo/ubuntu focal main' | sudo tee /etc/apt/sources.list.d/intel-sgx.list
-          wget -qO - https://download.01.org/intel-sgx/sgx_repo/ubuntu/intel-sgx-deb.key | sudo apt-key add -
-          sudo apt update
-          sudo apt install -y libsgx-launch libsgx-urts
-          source /opt/intel/sgxsdk/environment
-
-      - name: checkout
-        uses: actions/checkout@v3
-
-      - name: Get LLVM libraries
-        id: retrieve_llvm_libs
-        uses: actions/cache@v3
-        with:
-          path: |
-            ./core/deps/llvm/build/bin
-            ./core/deps/llvm/build/include
-            ./core/deps/llvm/build/lib
-            ./core/deps/llvm/build/libexec
-            ./core/deps/llvm/build/share
-          key: ${{ matrix.llvm_cache_key }}
-
-      - name: Quit if cache miss
-        if: steps.retrieve_llvm_libs.outputs.cache-hit != 'true'
-        run: echo "::error::can not get prebuilt llvm libraries" && exit 1
-
-      - name: Build wamrc
-        run: |
-          mkdir build && cd build
-          cmake ..
-          cmake --build . --config Release --parallel 4
-        working-directory: wamr-compiler
-
-  build_samples_wasm_c_api:
-    needs: [build_iwasm]
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        make_options: [
+        iwasm_make_options_run_mode: [
             # Running modes supported
+            $AOT_BUILD_OPTIONS,
             $CLASSIC_INTERP_BUILD_OPTIONS,
             $FAST_INTERP_BUILD_OPTIONS,
+            $FAST_JIT_BUILD_OPTIONS,
             # Running modes unsupported
-            #$LLVM_EAGER_JIT_BUILD_OPTIONS,
             #$LLVM_LAZY_JIT_BUILD_OPTIONS,
-            #$AOT_BUILD_OPTIONS,
+            #$LLVM_EAGER_JIT_BUILD_OPTIONS,
           ]
         os: [ubuntu-20.04]
         wasi_sdk_release:
@@ -200,51 +157,15 @@ jobs:
           [
             "https://github.com/WebAssembly/wabt/releases/download/1.0.31/wabt-1.0.31-ubuntu.tar.gz",
           ]
-    steps:
-      - name: checkout
-        uses: actions/checkout@v3
-
-      - name: download and install wabt
-        run: |
-          cd /opt
-          sudo wget ${{ matrix.wabt_release }}
-          sudo tar -xzf wabt-1.0.31-*.tar.gz
-          sudo mv wabt-1.0.31 wabt
-
-      - name: install SGX SDK and necessary libraries
-        run: |
-          mkdir -p /opt/intel
-          cd /opt/intel
-          wget https://download.01.org/intel-sgx/sgx-linux/2.15/distro/ubuntu20.04-server/sgx_linux_x64_sdk_2.15.100.3.bin
-          chmod +x sgx_linux_x64_sdk_2.15.100.3.bin
-          echo 'yes' | ./sgx_linux_x64_sdk_2.15.100.3.bin
-          echo 'deb [arch=amd64] https://download.01.org/intel-sgx/sgx_repo/ubuntu focal main' | sudo tee /etc/apt/sources.list.d/intel-sgx.list
-          wget -qO - https://download.01.org/intel-sgx/sgx_repo/ubuntu/intel-sgx-deb.key | sudo apt-key add -
-          sudo apt update
-          sudo apt install -y libsgx-launch libsgx-urts
-          source /opt/intel/sgxsdk/environment
-
-      - name: Build Sample [wasm-c-api]
-        run: |
-          cmake -S . -B build ${{ matrix.make_options }}
-          cmake --build build --config Release --parallel 4
-          ctest --test-dir build
-        working-directory: samples/wasm-c-api
-
-  build_samples_others:
-    needs: [build_iwasm]
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-20.04]
-        wasi_sdk_release:
-          [
-            "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-19/wasi-sdk-19.0-linux.tar.gz",
-          ]
-        wabt_release:
-          [
-            "https://github.com/WebAssembly/wabt/releases/download/1.0.31/wabt-1.0.31-ubuntu.tar.gz",
+        iwasm_make_options_feature: [
+            # Features to be tested: IPFS
+            "-DWAMR_BUILD_SGX_IPFS=1",
           ]
+        platform: [linux-sgx]
+        include:
+          - os: ubuntu-20.04
+            llvm_cache_key: ${{ needs.build_llvm_libraries.outputs.cache_key }}
+
     steps:
       - name: checkout
         uses: actions/checkout@v3
@@ -290,76 +211,85 @@ jobs:
           wget -qO - https://download.01.org/intel-sgx/sgx_repo/ubuntu/intel-sgx-deb.key | sudo apt-key add -
           sudo apt update
           sudo apt install -y libsgx-launch libsgx-urts
-          source /opt/intel/sgxsdk/environment
 
-      - name: Build Sample [basic]
-        run: |
-          cd samples/basic
-          ./build.sh
-          ./run.sh
-
-      - name: Build Sample [file]
+      - name: Build iwasm for testing samples
         run: |
-          cd samples/file
           mkdir build && cd build
-          cmake ..
+          cmake .. ${{ matrix.iwasm_make_options_run_mode }} ${{ matrix.iwasm_make_options_feature }}
           cmake --build . --config Release --parallel 4
-          ./src/iwasm -f wasm-app/file.wasm -d .
+          cd ../enclave-sample
+          make
+        working-directory: product-mini/platforms/${{ matrix.platform }}
 
-      - name: Build Sample [multi-thread]
-        run: |
-          cd samples/multi-thread
-          mkdir build && cd build
-          cmake ..
-          cmake --build . --config Release --parallel 4
-          ./iwasm wasm-apps/test.wasm
+      - name: Get LLVM libraries
+        if: matrix.iwasm_make_options_run_mode == '$AOT_BUILD_OPTIONS'
+        id: retrieve_llvm_libs
+        uses: actions/cache@v3
+        with:
+          path: |
+            ./core/deps/llvm/build/bin
+            ./core/deps/llvm/build/include
+            ./core/deps/llvm/build/lib
+            ./core/deps/llvm/build/libexec
+            ./core/deps/llvm/build/share
+          key: ${{ matrix.llvm_cache_key }}
+          fail-on-cache-miss: true
 
-      - name: Build Sample [multi-module]
+      - name: Build wamrc only for testing samples in aot mode
+        if: matrix.iwasm_make_options_run_mode == '$AOT_BUILD_OPTIONS'
         run: |
-          cd samples/multi-module
           mkdir build && cd build
           cmake ..
           cmake --build . --config Release --parallel 4
-          ./multi_module
+          cp wamrc `pwd`/../../product-mini/platforms/${{ matrix.platform }}/enclave-sample
+        working-directory: wamr-compiler
 
-      - name: Build Sample [spawn-thread]
+      - name: Build Sample [file]
         run: |
-          cd samples/spawn-thread
+          cd samples/file
           mkdir build && cd build
           cmake ..
           cmake --build . --config Release --parallel 4
-          ./spawn_thread
+          cp wasm-app/file.wasm `pwd`/../../../product-mini/platforms/${{ matrix.platform }}/enclave-sample
 
-      - name: Build Sample [ref-types]
+      - name: Test Sample [file] in non-aot mode
+        if: matrix.iwasm_make_options_run_mode != '$AOT_BUILD_OPTIONS'
         run: |
-          cd samples/ref-types
-          mkdir build && cd build
-          cmake ..
-          cmake --build . --config Release --parallel 4
-          ./hello
+          source /opt/intel/sgxsdk/environment
+          ./iwasm --dir=. file.wasm
+        working-directory: product-mini/platforms/${{ matrix.platform }}/enclave-sample
 
-      - name: Build Sample [wasi-threads]
+      - name: Test Sample [file] in aot mode
+        if: matrix.iwasm_make_options_run_mode == '$AOT_BUILD_OPTIONS'
         run: |
-          cd samples/wasi-threads
-          mkdir build && cd build
-          cmake -DWASI_SYSROOT=`pwd`/../../../core/deps/wasi-libc/sysroot ..
-          cmake --build . --config Release --parallel 4
-          ./iwasm wasm-apps/no_pthread.wasm
+          source /opt/intel/sgxsdk/environment
+          ./wamrc -sgx -o file.aot file.wasm
+          ./iwasm --dir=. file.aot
+        working-directory: product-mini/platforms/${{ matrix.platform }}/enclave-sample
 
   spec_test_default:
-    needs: [build_iwasm, build_llvm_libraries, build_wamrc]
+    needs: [build_iwasm, build_llvm_libraries]
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        running_mode: ["classic-interp", "fast-interp", "aot"]
-        test_option: ["-x -p -s spec -b -P", "-x -p -s spec -S -b -P"]
+        running_mode: ["classic-interp", "fast-interp", "aot", "fast-jit"]
+        test_option: ["-x -p -s spec -b -P", "-x -p -s spec -S -b -P", "-x -p -s spec -X -b -P"]
         llvm_cache_key: ["${{ needs.build_llvm_libraries.outputs.cache_key }}"]
-        # classic-interp and fast-interp don't support simd
         exclude:
+          # classic-interp, fast-interp and fast-jit don't support simd
           - running_mode: "classic-interp"
             test_option: "-x -p -s spec -S -b -P"
           - running_mode: "fast-interp"
             test_option: "-x -p -s spec -S -b -P"
+          - running_mode: "fast-jit"
+            test_option: "-x -p -s spec -S -b -P"
+          # classic-interp, fast-interp and fast-jit don't support XIP
+          - running_mode: "classic-interp"
+            test_option: "-x -p -s spec -X -b -P"
+          - running_mode: "fast-interp"
+            test_option: "-x -p -s spec -X -b -P"
+          - running_mode: "fast-jit"
+            test_option: "-x -p -s spec -X -b -P"
 
     steps:
       - name: checkout

+ 622 - 0
.github/workflows/nightly_run.yml

@@ -0,0 +1,622 @@
+# Copyright (C) 2023 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+name: nightly_run
+
+on:
+  # midnight UTC
+  schedule:
+    - cron: "0 0 * * *"
+  # allow to be triggered manually
+  workflow_dispatch:
+
+# Cancel any in-flight jobs for the same PR/branch so there's only one active
+# at a time
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  # For BUILD
+  AOT_BUILD_OPTIONS: "           -DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_FAST_JIT=0 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0"
+  CLASSIC_INTERP_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_FAST_JIT=0 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0"
+  FAST_INTERP_BUILD_OPTIONS: "   -DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=1 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_FAST_JIT=0 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0"
+  FAST_JIT_BUILD_OPTIONS: "      -DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_FAST_JIT=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0"
+  LLVM_LAZY_JIT_BUILD_OPTIONS: " -DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_FAST_JIT=0 -DWAMR_BUILD_JIT=1 -DWAMR_BUILD_LAZY_JIT=1"
+  LLVM_EAGER_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_FAST_JIT=0 -DWAMR_BUILD_JIT=1 -DWAMR_BUILD_LAZY_JIT=0"
+  MULTI_TIER_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_FAST_JIT=1 -DWAMR_BUILD_JIT=1 -DWAMR_BUILD_LAZY_JIT=1"
+  # For Spec Test
+  DEFAULT_TEST_OPTIONS: "-s spec -b -P"
+  MULTI_MODULES_TEST_OPTIONS: "-s spec -b -M -P"
+  SIMD_TEST_OPTIONS: "-s spec -b -S -P"
+  THREADS_TEST_OPTIONS: "-s spec -b -p -P"
+  X86_32_TARGET_TEST_OPTIONS: "-m x86_32 -P"
+  WASI_TEST_OPTIONS: "-s wasi_certification -w"
+
+jobs:
+  build_llvm_libraries_on_ubuntu_2004:
+    uses: ./.github/workflows/build_llvm_libraries.yml
+    with:
+      os: "ubuntu-20.04"
+      arch: "X86"
+
+  build_llvm_libraries_on_ubuntu_2204:
+    uses: ./.github/workflows/build_llvm_libraries.yml
+    with:
+      os: "ubuntu-22.04"
+      arch: "X86"
+  
+  build_wamrc:
+    needs:
+      [build_llvm_libraries_on_ubuntu_2004, build_llvm_libraries_on_ubuntu_2204]
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        include:
+          - os: ubuntu-20.04
+            llvm_cache_key: ${{ needs.build_llvm_libraries_on_ubuntu_2004.outputs.cache_key }}
+          - os: ubuntu-22.04
+            llvm_cache_key: ${{ needs.build_llvm_libraries_on_ubuntu_2204.outputs.cache_key }}
+    steps:
+      - name: checkout
+        uses: actions/checkout@v3
+
+      # since jobs.id can't contain the dot character
+      # it is hard to use `format` to assemble the cache key
+      - name: Get LLVM libraries
+        id: retrieve_llvm_libs
+        uses: actions/cache@v3
+        with:
+          path: |
+            ./core/deps/llvm/build/bin
+            ./core/deps/llvm/build/include
+            ./core/deps/llvm/build/lib
+            ./core/deps/llvm/build/libexec
+            ./core/deps/llvm/build/share
+          key: ${{ matrix.llvm_cache_key }}
+
+      - name: Quit if cache miss
+        if: steps.retrieve_llvm_libs.outputs.cache-hit != 'true'
+        run: echo "::error::can not get prebuilt llvm libraries" && exit 1
+
+      - name: Build wamrc
+        run: |
+          mkdir build && cd build
+          cmake ..
+          cmake --build . --config Release --parallel 4
+        working-directory: wamr-compiler
+
+  build_iwasm:
+    needs:
+      [build_llvm_libraries_on_ubuntu_2004, build_llvm_libraries_on_ubuntu_2204]
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        make_options_run_mode: [
+            # Running mode
+            $AOT_BUILD_OPTIONS,
+            $CLASSIC_INTERP_BUILD_OPTIONS,
+            $FAST_INTERP_BUILD_OPTIONS,
+            $FAST_JIT_BUILD_OPTIONS,
+            $LLVM_LAZY_JIT_BUILD_OPTIONS,
+            $LLVM_EAGER_JIT_BUILD_OPTIONS,
+            $MULTI_TIER_JIT_BUILD_OPTIONS,
+          ]
+        make_options_feature: [
+            # Features
+            "-DWAMR_BUILD_CUSTOM_NAME_SECTION=1",
+            "-DWAMR_BUILD_DEBUG_AOT=1",
+            "-DWAMR_BUILD_DEBUG_INTERP=1",
+            "-DWAMR_BUILD_DUMP_CALL_STACK=1",
+            "-DWAMR_BUILD_LIB_PTHREAD=1",
+            "-DWAMR_BUILD_LIB_WASI_THREADS=1",
+            "-DWAMR_BUILD_LOAD_CUSTOM_SECTION=1",
+            "-DWAMR_BUILD_MINI_LOADER=1",
+            "-DWAMR_BUILD_MEMORY_PROFILING=1",
+            "-DWAMR_BUILD_MULTI_MODULE=1",
+            "-DWAMR_BUILD_PERF_PROFILING=1",
+            "-DWAMR_BUILD_REF_TYPES=1",
+            "-DWAMR_BUILD_SIMD=1",
+            "-DWAMR_BUILD_TAIL_CALL=1",
+            "-DWAMR_DISABLE_HW_BOUND_CHECK=1",
+          ]
+        os: [ubuntu-20.04, ubuntu-22.04]
+        platform: [android, linux]
+        exclude:
+          # incompatible feature and platform
+          # incompatible mode and feature
+          # MULTI_MODULE only on INTERP mode
+          - make_options_run_mode: $AOT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1"
+          - make_options_run_mode: $FAST_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1"
+          - make_options_run_mode: $LLVM_LAZY_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1"
+          - make_options_run_mode: $LLVM_EAGER_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1"
+          - make_options_run_mode: $MULTI_TIER_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1"
+          # SIMD only on JIT/AOT mode
+          - make_options_run_mode: $CLASSIC_INTERP_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_SIMD=1"
+          - make_options_run_mode: $FAST_INTERP_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_SIMD=1"
+          # DEBUG_INTERP only on CLASSIC INTERP mode
+          - make_options_run_mode: $AOT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1"
+          - make_options_run_mode: $FAST_INTERP_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1"
+          - make_options_run_mode: $FAST_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1"
+          - make_options_run_mode: $LLVM_LAZY_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1"
+          - make_options_run_mode: $LLVM_EAGER_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1"
+          - make_options_run_mode: $MULTI_TIER_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1"
+          # DEBUG_AOT only on JIT/AOT mode
+          - make_options_run_mode: $CLASSIC_INTERP_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_AOT=1"
+          - make_options_run_mode: $FAST_INTERP_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_AOT=1"
+          # TODO: DEBUG_AOT on JIT
+          - make_options_run_mode: $FAST_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_AOT=1"
+          - make_options_run_mode: $LLVM_LAZY_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_AOT=1"
+          - make_options_run_mode: $LLVM_EAGER_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_AOT=1"
+          - make_options_run_mode: $MULTI_TIER_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_AOT=1"
+          # MINI_LOADER only on INTERP mode
+          - make_options_run_mode: $AOT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_MINI_LOADER=1"
+          - make_options_run_mode: $FAST_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_MINI_LOADER=1"
+          - make_options_run_mode: $LLVM_LAZY_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_MINI_LOADER=1"
+          - make_options_run_mode: $LLVM_EAGER_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_MINI_LOADER=1"
+          - make_options_run_mode: $MULTI_TIER_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_MINI_LOADER=1"
+          # Fast-JIT and Multi-Tier-JIT mode don't support android(X86-32)
+          - make_options_run_mode: $FAST_JIT_BUILD_OPTIONS
+            platform: android
+          - make_options_run_mode: $MULTI_TIER_JIT_BUILD_OPTIONS
+            platform: android
+          # only test android on ubuntu latest
+          - os: ubuntu-20.04
+            platform: android
+        include:
+          - os: ubuntu-20.04
+            llvm_cache_key: ${{ needs.build_llvm_libraries_on_ubuntu_2004.outputs.cache_key }}
+          - os: ubuntu-22.04
+            llvm_cache_key: ${{ needs.build_llvm_libraries_on_ubuntu_2204.outputs.cache_key }}
+    steps:
+      - name: checkout
+        uses: actions/checkout@v3
+
+      # only download llvm cache when needed
+      - name: Get LLVM libraries
+        id: retrieve_llvm_libs
+        if: endsWith(matrix.make_options_run_mode, '_JIT_BUILD_OPTIONS')
+        uses: actions/cache@v3
+        with:
+          path: |
+            ./core/deps/llvm/build/bin
+            ./core/deps/llvm/build/include
+            ./core/deps/llvm/build/lib
+            ./core/deps/llvm/build/libexec
+            ./core/deps/llvm/build/share
+          key: ${{ matrix.llvm_cache_key }}
+
+      - name: Quit if cache miss
+        if: endsWith(matrix.make_options_run_mode, '_JIT_BUILD_OPTIONS') && (steps.retrieve_llvm_libs.outputs.cache-hit != 'true')
+        run: echo "::error::can not get prebuilt llvm libraries" && exit 1
+
+      - name: Build iwasm
+        run: |
+          mkdir build && cd build
+          cmake .. ${{ matrix.make_options_run_mode }} ${{ matrix.make_options_feature }}
+          cmake --build . --config Release --parallel 4
+        working-directory: product-mini/platforms/${{ matrix.platform }}
+
+  build_iwasm_linux_gcc4_8:
+    runs-on: ubuntu-latest
+    container:
+      image: ubuntu:14.04
+    strategy:
+      matrix:
+        make_options_run_mode: [
+            # Running mode
+            $CLASSIC_INTERP_BUILD_OPTIONS,
+            $FAST_INTERP_BUILD_OPTIONS,
+            $FAST_JIT_BUILD_OPTIONS,
+          ]
+        make_options_feature: [
+            # Features
+            "-DWAMR_BUILD_CUSTOM_NAME_SECTION=1",
+            "-DWAMR_BUILD_DEBUG_AOT=1",
+            "-DWAMR_BUILD_DEBUG_INTERP=1",
+            "-DWAMR_BUILD_DUMP_CALL_STACK=1",
+            "-DWAMR_BUILD_LIB_PTHREAD=1",
+            "-DWAMR_BUILD_LIB_WASI_THREADS=1",
+            "-DWAMR_BUILD_LOAD_CUSTOM_SECTION=1",
+            "-DWAMR_BUILD_MINI_LOADER=1",
+            "-DWAMR_BUILD_MEMORY_PROFILING=1",
+            "-DWAMR_BUILD_MULTI_MODULE=1",
+            "-DWAMR_BUILD_PERF_PROFILING=1",
+            "-DWAMR_BUILD_REF_TYPES=1",
+            "-DWAMR_BUILD_SIMD=1",
+            "-DWAMR_BUILD_TAIL_CALL=1",
+            "-DWAMR_DISABLE_HW_BOUND_CHECK=1",
+          ]
+        exclude:
+          # incompatible feature and platform
+          # incompatible mode and feature
+          # MULTI_MODULE only on INTERP mode
+          - make_options_run_mode: $FAST_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1"
+          # SIMD only on JIT/AOT mode
+          - make_options_run_mode: $CLASSIC_INTERP_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_SIMD=1"
+          - make_options_run_mode: $FAST_INTERP_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_SIMD=1"
+          # DEBUG_INTERP only on CLASSIC INTERP mode
+          - make_options_run_mode: $FAST_INTERP_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1"
+          - make_options_run_mode: $FAST_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1"
+          # DEBUG_AOT only on JIT/AOT mode
+          - make_options_run_mode: $CLASSIC_INTERP_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_AOT=1"
+          - make_options_run_mode: $FAST_INTERP_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_AOT=1"
+          # TODO: DEBUG_AOT on JIT
+          - make_options_run_mode: $FAST_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_DEBUG_AOT=1"
+          # MINI_LOADER only on INTERP mode
+          - make_options_run_mode: $FAST_JIT_BUILD_OPTIONS
+            make_options_feature: "-DWAMR_BUILD_MINI_LOADER=1"
+    steps:
+      - name: checkout
+        uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: apt update && apt install -y make g++-4.8 gcc-4.8 wget git
+
+      - name: Install cmake
+        run: |
+          wget https://github.com/Kitware/CMake/releases/download/v3.26.1/cmake-3.26.1-linux-x86_64.tar.gz -O cmake.tar.gz
+          tar xzf cmake.tar.gz
+          cp cmake-3.26.1-linux-x86_64/bin/cmake /usr/local/bin
+          cp -r cmake-3.26.1-linux-x86_64/share/cmake-3.26/ /usr/local/share/
+      - name: Build iwasm
+        run: |
+          mkdir build && cd build
+          cmake .. ${{ matrix.make_options_run_mode }} ${{ matrix.make_options_feature }} -DCMAKE_C_COMPILER=gcc-4.8 -DCMAKE_CXX_COMPILER=g++-4.8
+          cmake --build . --config Release --parallel 4
+        working-directory: product-mini/platforms/linux
+
+  build_samples_wasm_c_api:
+    needs:
+      [
+        build_iwasm,
+        build_llvm_libraries_on_ubuntu_2004,
+        build_llvm_libraries_on_ubuntu_2204,
+        build_wamrc,
+      ]
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        sanitizer: ["", "ubsan", "asan"]
+        make_options: [
+            # Running mode
+            $AOT_BUILD_OPTIONS,
+            $CLASSIC_INTERP_BUILD_OPTIONS,
+            $FAST_INTERP_BUILD_OPTIONS,
+            $FAST_JIT_BUILD_OPTIONS,
+            $LLVM_LAZY_JIT_BUILD_OPTIONS,
+            $LLVM_EAGER_JIT_BUILD_OPTIONS,
+            $MULTI_TIER_JIT_BUILD_OPTIONS,
+          ]
+        os: [ubuntu-20.04, ubuntu-22.04]
+        wasi_sdk_release:
+          [
+            "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-20/wasi-sdk-20.0-linux.tar.gz",
+          ]
+        wabt_release:
+          [
+            "https://github.com/WebAssembly/wabt/releases/download/1.0.31/wabt-1.0.31-ubuntu.tar.gz",
+          ]
+        include:
+          - os: ubuntu-20.04
+            llvm_cache_key: ${{ needs.build_llvm_libraries_on_ubuntu_2004.outputs.cache_key }}
+          - os: ubuntu-22.04
+            llvm_cache_key: ${{ needs.build_llvm_libraries_on_ubuntu_2204.outputs.cache_key }}
+        exclude:
+          - make_options: $MULTI_TIER_JIT_BUILD_OPTIONS
+            sanitizer: asan
+    steps:
+      - name: checkout
+        uses: actions/checkout@v3
+
+      - name: Get LLVM libraries
+        id: retrieve_llvm_libs
+        if: (!endsWith(matrix.make_options, '_INTERP_BUILD_OPTIONS'))
+        uses: actions/cache@v3
+        with:
+          path: |
+            ./core/deps/llvm/build/bin
+            ./core/deps/llvm/build/include
+            ./core/deps/llvm/build/lib
+            ./core/deps/llvm/build/libexec
+            ./core/deps/llvm/build/share
+          key: ${{ matrix.llvm_cache_key }}
+
+      - name: Quit if cache miss
+        if: (!endsWith(matrix.make_options, '_INTERP_BUILD_OPTIONS')) && (steps.retrieve_llvm_libs.outputs.cache-hit != 'true')
+        run: echo "::error::can not get prebuilt llvm libraries" && exit 1
+
+      - name: download and install wabt
+        run: |
+          cd /opt
+          sudo wget ${{ matrix.wabt_release }}
+          sudo tar -xzf wabt-1.0.31-*.tar.gz
+          sudo mv wabt-1.0.31 wabt
+      - name: Build wamrc
+        if: (!endsWith(matrix.make_options, '_INTERP_BUILD_OPTIONS'))
+        run: |
+          mkdir build && cd build
+          cmake -D WAMR_BUILD_SANITIZER="${{matrix.sanitizer}}" ..
+          cmake --build . --config Release --parallel 4
+        working-directory: wamr-compiler
+
+      - name: Build Sample [wasm-c-api]
+        run: |
+          VERBOSE=1
+          cmake -S . -B build ${{ matrix.make_options }} -D WAMR_BUILD_SANITIZER="${{matrix.sanitizer}}"
+          cmake --build build --config Release --parallel 4
+          ctest --test-dir build --output-on-failure
+        working-directory: samples/wasm-c-api
+
+  build_samples_others:
+    needs: [build_iwasm]
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-20.04, ubuntu-22.04]
+        wasi_sdk_release:
+          [
+            "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-20/wasi-sdk-20.0-linux.tar.gz",
+          ]
+        wabt_release:
+          [
+            "https://github.com/WebAssembly/wabt/releases/download/1.0.31/wabt-1.0.31-ubuntu.tar.gz",
+          ]
+    steps:
+      - name: checkout
+        uses: actions/checkout@v3
+
+      - name: download and install wasi-sdk
+        run: |
+          cd /opt
+          sudo wget ${{ matrix.wasi_sdk_release }}
+          sudo tar -xzf wasi-sdk-*.tar.gz
+          sudo mv wasi-sdk-20.0 wasi-sdk
+      - name: download and install wabt
+        run: |
+          cd /opt
+          sudo wget ${{ matrix.wabt_release }}
+          sudo tar -xzf wabt-1.0.31-*.tar.gz
+          sudo mv wabt-1.0.31 wabt
+      - name: Build Sample [basic]
+        run: |
+          cd samples/basic
+          ./build.sh
+          ./run.sh
+      - name: Build Sample [file]
+        run: |
+          cd samples/file
+          mkdir build && cd build
+          cmake ..
+          cmake --build . --config Release --parallel 4
+          ./src/iwasm -f wasm-app/file.wasm -d .
+      - name: Build Sample [multi-thread]
+        run: |
+          cd samples/multi-thread
+          mkdir build && cd build
+          cmake ..
+          cmake --build . --config Release --parallel 4
+          ./iwasm wasm-apps/test.wasm
+      - name: Build Sample [multi-module]
+        run: |
+          cd samples/multi-module
+          mkdir build && cd build
+          cmake ..
+          cmake --build . --config Release --parallel 4
+          ./multi_module
+      - name: Build Sample [spawn-thread]
+        run: |
+          cd samples/spawn-thread
+          mkdir build && cd build
+          cmake ..
+          cmake --build . --config Release --parallel 4
+          ./spawn_thread
+      - name: Build Sample [ref-types]
+        run: |
+          cd samples/ref-types
+          mkdir build && cd build
+          cmake ..
+          cmake --build . --config Release --parallel 4
+          ./hello
+      - name: Build Sample [simple]
+        run: |
+          ./build.sh -p host-interp
+          python3 ./sample_test_run.py $(pwd)/out
+          exit $?
+        working-directory: ./samples/simple
+
+      - name: Build Sample [wasi-threads]
+        run: |
+          cd samples/wasi-threads
+          mkdir build && cd build
+          cmake ..
+          cmake --build . --config Release --parallel 4
+          ./iwasm wasm-apps/no_pthread.wasm
+  test:
+    needs:
+      [
+        build_iwasm,
+        build_llvm_libraries_on_ubuntu_2004,
+        build_llvm_libraries_on_ubuntu_2204,
+        build_wamrc,
+      ]
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-20.04, ubuntu-22.04]
+        sanitizer: ["", "ubsan", "asan"]
+        running_mode:
+          [
+            "classic-interp",
+            "fast-interp",
+            "jit",
+            "aot",
+            "fast-jit",
+            "multi-tier-jit",
+          ]
+        test_option:
+          [
+            $DEFAULT_TEST_OPTIONS,
+            $MULTI_MODULES_TEST_OPTIONS,
+            $SIMD_TEST_OPTIONS,
+            $THREADS_TEST_OPTIONS,
+            $WASI_TEST_OPTIONS,
+          ]
+        wasi_sdk_release:
+          [
+            "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-20/wasi-sdk-20.0-linux.tar.gz",
+          ]
+        include:
+          - os: ubuntu-20.04
+            llvm_cache_key: ${{ needs.build_llvm_libraries_on_ubuntu_2004.outputs.cache_key }}
+            ubuntu_version: "20.04"
+          - os: ubuntu-22.04
+            llvm_cache_key: ${{ needs.build_llvm_libraries_on_ubuntu_2204.outputs.cache_key }}
+            ubuntu_version: "22.04"
+        exclude:
+          # incompatible modes and features
+          - os: ubuntu-20.04
+            sanitizer: asan
+          # asan works only for aot now
+          - running_mode: "classic-interp"
+            sanitizer: asan
+          - running_mode: "fast-interp"
+            sanitizer: asan
+          - running_mode: "jit"
+            sanitizer: asan
+          - running_mode: "fast-jit"
+            sanitizer: asan
+          - running_mode: "multi-tier-jit"
+            sanitizer: asan
+          # classic-interp and fast-interp don't support simd
+          - running_mode: "classic-interp"
+            test_option: $SIMD_TEST_OPTIONS
+          - running_mode: "fast-interp"
+            test_option: $SIMD_TEST_OPTIONS
+          # aot and jit don't support multi module
+          - running_mode: "aot"
+            test_option: $MULTI_MODULES_TEST_OPTIONS
+          - running_mode: "jit"
+            test_option: $MULTI_MODULES_TEST_OPTIONS
+          # fast-jit doesn't support multi module, simd
+          - running_mode: "fast-jit"
+            test_option: $MULTI_MODULES_TEST_OPTIONS
+          - running_mode: "fast-jit"
+            test_option: $SIMD_TEST_OPTIONS
+          # multi-tier-jit doesn't support multi module, simd
+          - running_mode: "multi-tier-jit"
+            test_option: $MULTI_MODULES_TEST_OPTIONS
+          - running_mode: "multi-tier-jit"
+            test_option: $SIMD_TEST_OPTIONS
+    steps:
+      - name: checkout
+        uses: actions/checkout@v3
+
+      - name: download and install wasi-sdk
+        if: matrix.test_option == '$WASI_TEST_OPTIONS'
+        run: |
+          cd /opt
+          sudo wget ${{ matrix.wasi_sdk_release }}
+          sudo tar -xzf wasi-sdk-*.tar.gz
+          sudo mv wasi-sdk-20.0 wasi-sdk
+      - name: set env variable(if llvm are used)
+        if: matrix.running_mode == 'aot' || matrix.running_mode == 'jit' || matrix.running_mode == 'multi-tier-jit'
+        run: echo "USE_LLVM=true" >> $GITHUB_ENV
+
+      - name: set env variable(if x86_32 test needed)
+        if: >
+          (matrix.test_option == '$DEFAULT_TEST_OPTIONS' || matrix.test_option == '$THREADS_TEST_OPTIONS'
+           || matrix.test_option == '$WASI_TEST_OPTIONS')
+          && matrix.running_mode != 'fast-jit' && matrix.running_mode != 'jit' && matrix.running_mode != 'multi-tier-jit'
+        run: echo "TEST_ON_X86_32=true" >> $GITHUB_ENV
+      
+      - name: set sanitizer
+        run: echo "WAMR_BUILD_SANITIZER=${{ matrix.sanitizer }}" >> $GITHUB_ENV
+
+      #only download llvm libraries in jit and aot mode
+      - name: Get LLVM libraries
+        if: env.USE_LLVM == 'true'
+        id: retrieve_llvm_libs
+        uses: actions/cache@v3
+        with:
+          path: |
+            ./core/deps/llvm/build/bin
+            ./core/deps/llvm/build/include
+            ./core/deps/llvm/build/lib
+            ./core/deps/llvm/build/libexec
+            ./core/deps/llvm/build/share
+          key: ${{ matrix.llvm_cache_key }}
+
+      - name: Quit if cache miss
+        if: env.USE_LLVM == 'true' && steps.retrieve_llvm_libs.outputs.cache-hit != 'true'
+        run: echo "::error::can not get prebuilt llvm libraries" && exit 1
+
+      - name: install jq JSON processor
+        if: matrix.running_mode == 'aot' && matrix.test_option == '$WASI_TEST_OPTIONS'
+        run: sudo apt-get update && sudo apt install -y jq
+
+      - name: Build WASI thread tests
+        if: matrix.test_option == '$WASI_TEST_OPTIONS'
+        run: bash build.sh
+        working-directory: ./core/iwasm/libraries/lib-wasi-threads/test/
+
+      - name: build socket api tests
+        if: matrix.test_option == '$WASI_TEST_OPTIONS'
+        run: bash build.sh
+        working-directory: ./core/iwasm/libraries/lib-socket/test/
+
+      - name: run tests
+        timeout-minutes: 10
+        run: ./test_wamr.sh ${{ matrix.test_option }} -t ${{ matrix.running_mode }}
+        working-directory: ./tests/wamr-test-suites
+
+      # only install x32 support libraries when running x86_32 cases
+      - name: install x32 support libraries
+        if: env.TEST_ON_X86_32 == 'true'
+        run:
+          # Add another apt repository as some packages cannot
+          # be downloaded with the github default repository
+          sudo curl -sSL https://packages.microsoft.com/keys/microsoft.asc | sudo tee /etc/apt/trusted.gpg.d/microsoft.asc &&
+          sudo apt-add-repository https://packages.microsoft.com/ubuntu/${{ matrix.ubuntu_version }}/prod &&
+          sudo apt-get update &&
+          sudo apt install -y g++-multilib lib32gcc-9-dev
+
+      - name: run tests x86_32
+        timeout-minutes: 10
+        if: env.TEST_ON_X86_32 == 'true'
+        run: ./test_wamr.sh ${{ env.X86_32_TARGET_TEST_OPTIONS }} ${{ matrix.test_option }} -t ${{ matrix.running_mode }}
+        working-directory: ./tests/wamr-test-suites

+ 1 - 1
.github/workflows/spec_test_on_nuttx.yml

@@ -52,7 +52,7 @@ jobs:
       - name: Install RISC-V Compilers
         if: contains(matrix.nuttx_board_config, 'risc-v')
         run: |
-          curl -L https://static.dev.sifive.com/dev-tools/freedom-tools/v2020.12/riscv64-unknown-elf-toolchain-10.2.0-2020.12.8-x86_64-linux-ubuntu14.tar.gz > riscv.tar.gz
+          curl -L -k https://static.dev.sifive.com/dev-tools/freedom-tools/v2020.12/riscv64-unknown-elf-toolchain-10.2.0-2020.12.8-x86_64-linux-ubuntu14.tar.gz > riscv.tar.gz
           tar xvf riscv.tar.gz
           echo "$PWD/riscv64-unknown-elf-toolchain-10.2.0-2020.12.8-x86_64-linux-ubuntu14/bin" >> $GITHUB_PATH
 

+ 10 - 0
ATTRIBUTIONS.md

@@ -16,6 +16,7 @@ WAMR project reused some components from other open source project:
 - **asmjit**: for the Fast JIT x86-64 codegen implementation
 - **zydis**: for the Fast JIT x86-64 codegen implementation
 - **NuttX ELF headers**: used in core/iwasm/aot/debug/elf_parser.c
+- **Dhrystone**: for the test benchmark dhrystone
 
 The WAMR fast interpreter is a clean room development. We would acknowledge the inspirations by [WASM3](https://github.com/wasm3/wasm3) open source project for the approach of pre-calculated operand stack location.
 
@@ -35,6 +36,7 @@ The WAMR fast interpreter is a clean room development. We would acknowledge the
 | asmjit | unspecified | unspecified | https://github.com/asmjit/asmjit | |
 | zydis | unspecified | e14a07895136182a5b53e181eec3b1c6e0b434de | https://github.com/zyantific/zydis | |
 | NuttX ELF headers | 72313301e23f9c2de969fb64b9a0f67bb4c284df | 10.3.0 | https://github.com/apache/incubator-nuttx | |
+| Dhrystone | 2.1 | 2.1 | https://fossies.org/linux/privat/old/ | |
 
 ## Licenses
 
@@ -81,15 +83,19 @@ The WAMR fast interpreter is a clean room development. We would acknowledge the
 [LICENSE](./tests/wamr-test-suites/spec-test-script/LICENSE)
 
 ### libuv
+
 [LICENSE](./core/iwasm/libraries/libc-uvwasi/LICENSE_LIBUV)
 
 ### uvwasi
+
 [LICENSE](./core/iwasm/libraries/libc-uvwasi/LICENSE_UVWASI)
 
 ### asmjit
+
 [LICENSE](./core/iwasm/fast-jit/cg/LICENSE_ASMJIT)
 
 ### zydis
+
 [LICENSE](./core/iwasm/fast-jit/cg/LICENSE_ZYDIS)
 
 ### NuttX ELF headers
@@ -97,3 +103,7 @@ The WAMR fast interpreter is a clean room development. We would acknowledge the
 [LICENSE](./core/iwasm/aot/debug/LICENSE_NUTTX)
 
 [NOTICE](./core/iwasm/aot/debug/NOTICE_NUTTX)
+
+### Dhrystone
+
+[LICENSE](./tests/benchmarks/dhrystone/LICENSE)

+ 39 - 0
RELEASE_NOTES.md

@@ -1,3 +1,42 @@
+## WAMR-1.2.2
+
+### Breaking Changes
+
+### New Features
+- Implement Fast JIT multi-threading feature (#2134)
+
+### Bug Fixes
+- Update request.ts wasm_response_send signature (#2122)
+- Fix ems allocator unaligned memory access on riscv64 (#2140)
+- libc_wasi_wrapper.c: Fix min func issue for size_t < 8 bytes on some platforms (#2152)
+- Fix three multi-threading and wasm-c-api-imports issues (#2173)
+- Fix build polybench benchmark error with wasi-sdk-19.0 (#2187)
+- Fix wamr-ide debugger ignoring launch config (#2155)
+
+### Enhancements
+- Add test for validating linear memory size updates (#2078)
+- Update Zephyr docs to remove unsupported west subcommand (#2128)
+- Update messages/comments to refer the new place of the version definition (#2133)
+- build_wamr_lldb.yml: sync lldb build options between ubuntu and macos (#2132)
+- build_wamr_vscode_ext.yml: vsce publish only on the official repo (#2130)
+- VSCode-Extension: Download lldb built for ubuntu 20.04 (#2139)
+- Avoid re-installing if Tensorflow is already installed for WASI-NN (#2148)
+- wamrc: Add --stack-usage option (#2158)
+- Fix URL in language-bindings/python/README.md (#2166)
+- Fix URL in embed_wamr.md (#2165)
+- Fix URL in README.md (#2168)
+- Return error when exception was raised after main thread finishes (#2169)
+- wasi-nn: Add external delegation to support several NPU/GPU (#2162)
+- Update document for iwasm/wamrc dependent packages (#2183)
+- Use a manual flag to disable clock_nanosleep on the unsupported platforms (#2176)
+- Fix compile warnings on windows platform (#2208)
+
+### Others
+- CI: Add ubsan checks to samples/wasm-c-api (#2147)
+- CI: More precise trigger paths for github actions (#2157)
+
+---
+
 ## WAMR-1.2.1
 
 ### Breaking Changes

+ 1 - 1
build-scripts/build_llvm.py

@@ -61,7 +61,7 @@ def build_llvm(llvm_dir, platform, backends, projects, use_clang=False, extra_fl
         "-DLLVM_ENABLE_IDE:BOOL=OFF",
         "-DLLVM_ENABLE_LIBEDIT=OFF",
         "-DLLVM_ENABLE_TERMINFO:BOOL=OFF",
-        "-DLLVM_ENABLE_ZLIB:BOOL=OFF",
+        "-DLLVM_ENABLE_ZLIB:BOOL=ON",
         "-DLLVM_INCLUDE_BENCHMARKS:BOOL=OFF",
         "-DLLVM_INCLUDE_DOCS:BOOL=OFF",
         "-DLLVM_INCLUDE_EXAMPLES:BOOL=OFF",

+ 26 - 0
build-scripts/config_common.cmake

@@ -127,6 +127,28 @@ else ()
   unset (LLVM_AVAILABLE_LIBS)
 endif ()
 
+# Sanitizers
+
+set(WAMR_BUILD_SANITIZER $ENV{WAMR_BUILD_SANITIZER})
+
+if (NOT DEFINED WAMR_BUILD_SANITIZER)
+  set(WAMR_BUILD_SANITIZER "")
+elseif (WAMR_BUILD_SANITIZER STREQUAL "ubsan")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=alignment" )
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined")
+elseif (WAMR_BUILD_SANITIZER STREQUAL "asan")
+  if (NOT WAMR_BUILD_JIT EQUAL 1)
+    set (ASAN_OPTIONS "verbosity=2 debug=true ")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all" )
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address")
+  endif()
+elseif (WAMR_BUILD_SANITIZER STREQUAL "tsan") 
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" )
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=thread")
+elseif (NOT (WAMR_BUILD_SANITIZER STREQUAL "") )
+  message(SEND_ERROR "Unsupported sanitizer: ${WAMR_BUILD_SANITIZER}")
+endif()
+
 ########################################
 
 message ("-- Build Configurations:")
@@ -366,3 +388,7 @@ if ("$ENV{COLLECT_CODE_COVERAGE}" STREQUAL "1" OR COLLECT_CODE_COVERAGE EQUAL 1)
   add_definitions (-DCOLLECT_CODE_COVERAGE)
   message ("     Collect code coverage enabled")
 endif ()
+if (WAMR_BUILD_STATIC_PGO EQUAL 1)
+  add_definitions (-DWASM_ENABLE_STATIC_PGO=1)
+  message ("     AOT static PGO enabled")
+endif ()

+ 1 - 1
build-scripts/requirements.txt

@@ -1 +1 @@
-requests==2.28.2
+requests==2.31.0

+ 4 - 0
core/config.h

@@ -445,4 +445,8 @@
 #define WASM_ENABLE_WASM_CACHE 0
 #endif
 
+#ifndef WASM_ENABLE_STATIC_PGO
+#define WASM_ENABLE_STATIC_PGO 0
+#endif
+
 #endif /* end of _CONFIG_H_ */

+ 296 - 7
core/iwasm/aot/aot_loader.c

@@ -1430,8 +1430,28 @@ destroy_object_data_sections(AOTObjectDataSection *data_sections,
     uint32 i;
     AOTObjectDataSection *data_section = data_sections;
     for (i = 0; i < data_section_count; i++, data_section++)
-        if (data_section->data)
+        if (data_section->data) {
+#if WASM_ENABLE_STATIC_PGO != 0
+            if (!strncmp(data_section->name, "__llvm_prf_data", 15)) {
+                LLVMProfileData *data = (LLVMProfileData *)data_section->data;
+                if (data->values) {
+                    uint32 num_value_sites =
+                        data->num_value_sites[0] + data->num_value_sites[1];
+                    uint32 j;
+                    for (j = 0; j < num_value_sites; j++) {
+                        ValueProfNode *node = data->values[j], *node_next;
+                        while (node) {
+                            node_next = node->next;
+                            wasm_runtime_free(node);
+                            node = node_next;
+                        }
+                    }
+                    wasm_runtime_free(data->values);
+                }
+            }
+#endif
             os_munmap(data_section->data, data_section->size);
+        }
     wasm_runtime_free(data_sections);
 }
 
@@ -1900,6 +1920,8 @@ str2uint64(const char *buf, uint64 *p_res)
     return true;
 }
 
+#define R_X86_64_GOTPCREL 9 /* 32 bit signed PC relative offset to GOT */
+
 static bool
 do_text_relocation(AOTModule *module, AOTRelocationGroup *group,
                    char *error_buf, uint32 error_buf_size)
@@ -1937,6 +1959,14 @@ do_text_relocation(AOTModule *module, AOTRelocationGroup *group,
         bh_memcpy_s(symbol, symbol_len, relocation->symbol_name, symbol_len);
         symbol[symbol_len] = '\0';
 
+#if WASM_ENABLE_STATIC_PGO != 0
+        if (!strcmp(symbol, "__llvm_profile_runtime")
+            || !strcmp(symbol, "__llvm_profile_register_function")
+            || !strcmp(symbol, "__llvm_profile_register_names_function")) {
+            continue;
+        }
+#endif
+
         if (!strncmp(symbol, AOT_FUNC_PREFIX, strlen(AOT_FUNC_PREFIX))) {
             p = symbol + strlen(AOT_FUNC_PREFIX);
             if (*p == '\0'
@@ -1945,7 +1975,26 @@ do_text_relocation(AOTModule *module, AOTRelocationGroup *group,
                                 "invalid import symbol %s", symbol);
                 goto check_symbol_fail;
             }
+#if (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) \
+    && !defined(BH_PLATFORM_WINDOWS)
+            if (relocation->relocation_type == R_X86_64_GOTPCREL) {
+                GOTItem *got_item = module->got_item_list;
+                uint32 got_item_idx = 0;
+
+                while (got_item) {
+                    if (got_item->func_idx == func_index)
+                        break;
+                    got_item_idx++;
+                    got_item = got_item->next;
+                }
+                /* Calculate `GOT + G` */
+                symbol_addr = module->got_func_ptrs + got_item_idx;
+            }
+            else
+                symbol_addr = module->func_ptrs[func_index];
+#else
             symbol_addr = module->func_ptrs[func_index];
+#endif
         }
         else if (!strcmp(symbol, ".text")) {
             symbol_addr = module->code;
@@ -1956,7 +2005,13 @@ do_text_relocation(AOTModule *module, AOTRelocationGroup *group,
                  /* ".rodata.cst4/8/16/.." */
                  || !strncmp(symbol, ".rodata.cst", strlen(".rodata.cst"))
                  /* ".rodata.strn.m" */
-                 || !strncmp(symbol, ".rodata.str", strlen(".rodata.str"))) {
+                 || !strncmp(symbol, ".rodata.str", strlen(".rodata.str"))
+#if WASM_ENABLE_STATIC_PGO != 0
+                 || !strncmp(symbol, "__llvm_prf_cnts", 15)
+                 || !strncmp(symbol, "__llvm_prf_data", 15)
+                 || !strncmp(symbol, "__llvm_prf_names", 16)
+#endif
+        ) {
             symbol_addr = get_data_section_addr(module, symbol, NULL);
             if (!symbol_addr) {
                 set_error_buf_v(error_buf, error_buf_size,
@@ -2088,6 +2143,14 @@ do_data_relocation(AOTModule *module, AOTRelocationGroup *group,
     else if (!strcmp(group->section_name, ".rdata")) {
         data_section_name = group->section_name;
     }
+#if WASM_ENABLE_STATIC_PGO != 0
+    else if (!strncmp(group->section_name, ".rel__llvm_prf_data", 19)) {
+        data_section_name = group->section_name + strlen(".rel");
+    }
+    else if (!strncmp(group->section_name, ".rela__llvm_prf_data", 20)) {
+        data_section_name = group->section_name + strlen(".rela");
+    }
+#endif
     else {
         set_error_buf(error_buf, error_buf_size,
                       "invalid data relocation section name");
@@ -2107,6 +2170,49 @@ do_data_relocation(AOTModule *module, AOTRelocationGroup *group,
         if (!strcmp(symbol, ".text")) {
             symbol_addr = module->code;
         }
+#if WASM_ENABLE_STATIC_PGO != 0
+        else if (!strncmp(symbol, AOT_FUNC_PREFIX, strlen(AOT_FUNC_PREFIX))) {
+            char *p = symbol + strlen(AOT_FUNC_PREFIX);
+            uint32 func_index;
+            if (*p == '\0'
+                || (func_index = (uint32)atoi(p)) > module->func_count) {
+                set_error_buf_v(error_buf, error_buf_size,
+                                "invalid relocation symbol %s", symbol);
+                return false;
+            }
+            symbol_addr = module->func_ptrs[func_index];
+        }
+        else if (!strcmp(symbol, "__llvm_prf_cnts")) {
+            uint32 j;
+            for (j = 0; j < module->data_section_count; j++) {
+                if (!strncmp(module->data_sections[j].name, symbol, 15)) {
+                    bh_assert(relocation->relocation_addend + sizeof(uint64)
+                              <= module->data_sections[j].size);
+                    symbol_addr = module->data_sections[j].data;
+                    break;
+                }
+            }
+            if (j == module->data_section_count) {
+                set_error_buf_v(error_buf, error_buf_size,
+                                "invalid relocation symbol %s", symbol);
+                return false;
+            }
+        }
+        else if (!strncmp(symbol, "__llvm_prf_cnts", 15)) {
+            uint32 j;
+            for (j = 0; j < module->data_section_count; j++) {
+                if (!strcmp(module->data_sections[j].name, symbol)) {
+                    symbol_addr = module->data_sections[j].data;
+                    break;
+                }
+            }
+            if (j == module->data_section_count) {
+                set_error_buf_v(error_buf, error_buf_size,
+                                "invalid relocation symbol %s", symbol);
+                return false;
+            }
+        }
+#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */
         else {
             set_error_buf_v(error_buf, error_buf_size,
                             "invalid relocation symbol %s", symbol);
@@ -2154,7 +2260,7 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end,
 {
     AOTRelocationGroup *groups = NULL, *group;
     uint32 symbol_count = 0;
-    uint32 group_count = 0, i, j;
+    uint32 group_count = 0, i, j, got_item_count = 0;
     uint64 size;
     uint32 *symbol_offsets, total_string_len;
     uint8 *symbol_buf, *symbol_buf_end;
@@ -2216,6 +2322,8 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end,
 
         for (j = 0; j < relocation_count; j++) {
             AOTRelocation relocation = { 0 };
+            char group_name_buf[128] = { 0 };
+            char symbol_name_buf[128] = { 0 };
             uint32 symbol_index, offset32;
             int32 addend32;
             uint16 symbol_name_len;
@@ -2244,10 +2352,10 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end,
             symbol_name_len = *(uint16 *)symbol_name;
             symbol_name += sizeof(uint16);
 
-            char group_name_buf[128] = { 0 };
-            char symbol_name_buf[128] = { 0 };
-            memcpy(group_name_buf, group_name, group_name_len);
-            memcpy(symbol_name_buf, symbol_name, symbol_name_len);
+            bh_memcpy_s(group_name_buf, (uint32)sizeof(group_name_buf),
+                        group_name, group_name_len);
+            bh_memcpy_s(symbol_name_buf, (uint32)sizeof(symbol_name_buf),
+                        symbol_name, symbol_name_len);
 
             if ((group_name_len == strlen(".text")
                  || (module->is_indirect_mode
@@ -2309,6 +2417,139 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end,
     }
 #endif /* end of defined(BH_PLATFORM_WINDOWS) */
 
+#if (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) \
+    && !defined(BH_PLATFORM_WINDOWS)
+    buf = symbol_buf_end;
+    read_uint32(buf, buf_end, group_count);
+
+    /* Resolve the relocations of type R_X86_64_GOTPCREL */
+    for (i = 0; i < group_count; i++) {
+        uint32 name_index, relocation_count;
+        uint16 group_name_len;
+        uint8 *group_name;
+
+        /* section name address is 4 bytes aligned. */
+        buf = (uint8 *)align_ptr(buf, sizeof(uint32));
+        read_uint32(buf, buf_end, name_index);
+
+        if (name_index >= symbol_count) {
+            set_error_buf(error_buf, error_buf_size,
+                          "symbol index out of range");
+            goto fail;
+        }
+
+        group_name = symbol_buf + symbol_offsets[name_index];
+        group_name_len = *(uint16 *)group_name;
+        group_name += sizeof(uint16);
+
+        read_uint32(buf, buf_end, relocation_count);
+
+        for (j = 0; j < relocation_count; j++) {
+            AOTRelocation relocation = { 0 };
+            char group_name_buf[128] = { 0 };
+            char symbol_name_buf[128] = { 0 };
+            uint32 symbol_index;
+            uint16 symbol_name_len;
+            uint8 *symbol_name;
+
+            /* relocation offset and addend */
+            buf += sizeof(void *) * 2;
+
+            read_uint32(buf, buf_end, relocation.relocation_type);
+            read_uint32(buf, buf_end, symbol_index);
+
+            if (symbol_index >= symbol_count) {
+                set_error_buf(error_buf, error_buf_size,
+                              "symbol index out of range");
+                goto fail;
+            }
+
+            symbol_name = symbol_buf + symbol_offsets[symbol_index];
+            symbol_name_len = *(uint16 *)symbol_name;
+            symbol_name += sizeof(uint16);
+
+            bh_memcpy_s(group_name_buf, (uint32)sizeof(group_name_buf),
+                        group_name, group_name_len);
+            bh_memcpy_s(symbol_name_buf, (uint32)sizeof(symbol_name_buf),
+                        symbol_name, symbol_name_len);
+
+            if (relocation.relocation_type == R_X86_64_GOTPCREL
+                && !strncmp(symbol_name_buf, AOT_FUNC_PREFIX,
+                            strlen(AOT_FUNC_PREFIX))) {
+                uint32 func_idx =
+                    atoi(symbol_name_buf + strlen(AOT_FUNC_PREFIX));
+                GOTItem *got_item = module->got_item_list;
+
+                if (func_idx >= module->func_count) {
+                    set_error_buf(error_buf, error_buf_size,
+                                  "func index out of range");
+                    goto fail;
+                }
+
+                while (got_item) {
+                    if (got_item->func_idx == func_idx)
+                        break;
+                    got_item = got_item->next;
+                }
+
+                if (!got_item) {
+                    /* Create the got item and append to the list */
+                    got_item = wasm_runtime_malloc(sizeof(GOTItem));
+                    if (!got_item) {
+                        set_error_buf(error_buf, error_buf_size,
+                                      "allocate memory failed");
+                        goto fail;
+                    }
+
+                    got_item->func_idx = func_idx;
+                    got_item->next = NULL;
+                    if (!module->got_item_list) {
+                        module->got_item_list = module->got_item_list_end =
+                            got_item;
+                    }
+                    else {
+                        module->got_item_list_end->next = got_item;
+                        module->got_item_list_end = got_item;
+                    }
+
+                    got_item_count++;
+                }
+            }
+        }
+    }
+
+    if (got_item_count) {
+        GOTItem *got_item = module->got_item_list;
+        uint32 got_item_idx = 0;
+
+        map_prot = MMAP_PROT_READ | MMAP_PROT_WRITE;
+        /* aot code and data in x86_64 must be in range 0 to 2G due to
+           relocation for R_X86_64_32/32S/PC32 */
+        map_flags = MMAP_MAP_32BIT;
+
+        /* Create the GOT for func_ptrs, note that it is different from
+           the .got section of a dynamic object file */
+        size = (uint64)sizeof(void *) * got_item_count;
+        if (size > UINT32_MAX
+            || !(module->got_func_ptrs =
+                     os_mmap(NULL, (uint32)size, map_prot, map_flags))) {
+            set_error_buf(error_buf, error_buf_size, "mmap memory failed");
+            goto fail;
+        }
+
+        while (got_item) {
+            module->got_func_ptrs[got_item_idx++] =
+                module->func_ptrs[got_item->func_idx];
+            got_item = got_item->next;
+        }
+
+        module->got_item_count = got_item_count;
+    }
+#else
+    (void)got_item_count;
+#endif /* (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) && \
+          !defined(BH_PLATFORM_WINDOWS) */
+
     buf = symbol_buf_end;
     read_uint32(buf, buf_end, group_count);
 
@@ -2889,6 +3130,16 @@ load(const uint8 *buf, uint32 size, AOTModule *module, char *error_buf,
            module->code and will be destroyed in aot_unload() */
         destroy_sections(section_list, false);
     }
+
+#if 0
+    {
+        uint32 i;
+        for (i = 0; i < module->func_count; i++) {
+            os_printf("AOT func %u, addr: %p\n", i, module->func_ptrs[i]);
+        }
+    }
+#endif
+
     return ret;
 fail:
     return false;
@@ -2984,9 +3235,27 @@ aot_unload(AOTModule *module)
     }
 #endif
 
+#if (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) \
+    && !defined(BH_PLATFORM_WINDOWS)
+    {
+        GOTItem *got_item = module->got_item_list, *got_item_next;
+
+        if (module->got_func_ptrs) {
+            os_munmap(module->got_func_ptrs,
+                      sizeof(void *) * module->got_item_count);
+        }
+        while (got_item) {
+            got_item_next = got_item->next;
+            wasm_runtime_free(got_item);
+            got_item = got_item_next;
+        }
+    }
+#endif
+
     if (module->data_sections)
         destroy_object_data_sections(module->data_sections,
                                      module->data_section_count);
+
 #if WASM_ENABLE_DEBUG_AOT != 0
     jit_code_entry_destroy(module->elf_hdr);
 #endif
@@ -3033,3 +3302,23 @@ aot_get_custom_section(const AOTModule *module, const char *name, uint32 *len)
     return NULL;
 }
 #endif /* end of WASM_ENABLE_LOAD_CUSTOM_SECTION */
+
+#if WASM_ENABLE_STATIC_PGO != 0
+/* Exported wrapper that forwards to this file's exchange_uint16() helper so
+   that other translation units can use it on a 16-bit field at p_data.
+   NOTE(review): the helper is defined outside this hunk; presumably it
+   reverses the byte order in place - confirm. */
+void
+aot_exchange_uint16(uint8 *p_data)
+{
+    /* a void function must not return an expression in C */
+    exchange_uint16(p_data);
+}
+
+/* Exported wrapper that forwards to this file's exchange_uint32() helper so
+   that other translation units can use it on a 32-bit field at p_data.
+   NOTE(review): the helper is defined outside this hunk; presumably it
+   reverses the byte order in place - confirm. */
+void
+aot_exchange_uint32(uint8 *p_data)
+{
+    /* a void function must not return an expression in C */
+    exchange_uint32(p_data);
+}
+
+/* Exported wrapper that forwards to this file's exchange_uint64() helper so
+   that other translation units can use it on a 64-bit field at p_data.
+   NOTE(review): the helper is defined outside this hunk; presumably it
+   reverses the byte order in place - confirm. */
+void
+aot_exchange_uint64(uint8 *p_data)
+{
+    /* a void function must not return an expression in C */
+    exchange_uint64(p_data);
+}
+#endif

+ 9 - 0
core/iwasm/aot/aot_reloc.h

@@ -121,6 +121,14 @@ typedef struct {
     REG_SYM(aot_intrinsic_i32_rem_s),     \
     REG_SYM(aot_intrinsic_i32_rem_u),     \
 
+/* Symbol table entries mapping the LLVM value-profiling callback names to
+   the runtime's implementations; expands to nothing when
+   WASM_ENABLE_STATIC_PGO is disabled */
+#if WASM_ENABLE_STATIC_PGO != 0
+#define REG_LLVM_PGO_SYM()               \
+    { "__llvm_profile_instrument_target", llvm_profile_instrument_target }, \
+    { "__llvm_profile_instrument_memop", llvm_profile_instrument_memop },
+#else
+#define REG_LLVM_PGO_SYM()
+#endif
+
 #define REG_COMMON_SYMBOLS                \
     REG_SYM(aot_set_exception_with_id),   \
     REG_SYM(aot_invoke_native),           \
@@ -150,6 +158,7 @@ typedef struct {
     REG_REF_TYPES_SYM()                   \
     REG_AOT_TRACE_SYM()                   \
     REG_INTRINSIC_SYM()                   \
+    REG_LLVM_PGO_SYM()                    \
 
 #define CHECK_RELOC_OFFSET(data_size) do {              \
     if (!check_reloc_offset(target_section_size,        \

+ 535 - 0
core/iwasm/aot/aot_runtime.c

@@ -1015,6 +1015,15 @@ execute_post_instantiate_functions(AOTModuleInstance *module_inst,
         }
     }
 
+#if defined(os_writegsbase)
+    {
+        AOTMemoryInstance *memory_inst = aot_get_default_memory(module_inst);
+        if (memory_inst)
+            /* write base addr of linear memory to GS segment register */
+            os_writegsbase(memory_inst->memory_data);
+    }
+#endif
+
     /* Execute start function for both main insance and sub instance */
     if (module->start_function) {
         AOTFunctionInstance start_func = { 0 };
@@ -1453,6 +1462,15 @@ aot_call_function(WASMExecEnv *exec_env, AOTFunctionInstance *function,
     }
     argc = func_type->param_cell_num;
 
+#if defined(os_writegsbase)
+    {
+        AOTMemoryInstance *memory_inst = aot_get_default_memory(module_inst);
+        if (memory_inst)
+            /* write base addr of linear memory to GS segment register */
+            os_writegsbase(memory_inst->memory_data);
+    }
+#endif
+
     /* func pointer was looked up previously */
     bh_assert(function->u.func.func_ptr != NULL);
 
@@ -2834,3 +2852,520 @@ aot_dump_perf_profiling(const AOTModuleInstance *module_inst)
     }
 }
 #endif /* end of WASM_ENABLE_PERF_PROFILING */
+
+#if WASM_ENABLE_STATIC_PGO != 0
+
+/* Value kinds; they index LLVMProfileData.num_value_sites[] */
+/* indirect call target */
+#define IPVK_IndirectCallTarget 0
+/* memory intrinsic functions size */
+#define IPVK_MemOPSize 1
+/* first and last value kind, used as inclusive iteration bounds */
+#define IPVK_First IPVK_IndirectCallTarget
+#define IPVK_Last IPVK_MemOPSize
+
+/* initial and largest number of distinct values tracked per value site */
+#define INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE 24
+#define INSTR_PROF_MAX_NUM_VAL_PER_SITE 255
+
+/* While hasNonDefaultValsPerSite is zero, allocateValueProfileCounters()
+   raises VPMaxNumValsPerSite to INSTR_PROF_MAX_NUM_VAL_PER_SITE */
+static int hasNonDefaultValsPerSite = 0;
+/* current limit of list nodes per value site */
+static uint32 VPMaxNumValsPerSite = INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE;
+
+/* Compare-and-exchange on a pointer slot: delegates to os_atomic_cmpxchg()
+   when the platform defines it; otherwise stores new_val into *ptr only if
+   *ptr currently equals old_val and reports whether the store happened. */
+static bool
+cmpxchg_ptr(void **ptr, void *old_val, void *new_val)
+{
+#if defined(os_atomic_cmpxchg)
+    return os_atomic_cmpxchg(ptr, &old_val, new_val);
+#else
+    /* TODO: add lock when thread-manager is enabled */
+    if (*ptr != old_val)
+        return false;
+
+    *ptr = new_val;
+    return true;
+#endif
+}
+
+/* Allocate the zero-filled table of value-site list heads for Data and
+   publish it in Data->values. Returns 1 on success, and 0 when the table
+   would be too large, the allocation fails, or Data->values was already
+   set by the time the table is published. */
+static int
+allocateValueProfileCounters(LLVMProfileData *Data)
+{
+    ValueProfNode **site_heads;
+    uint64 NumVSites = 0, table_bytes;
+    uint32 kind = IPVK_First;
+
+    /* Without an explicit per-site limit, allow the maximum number of
+       tracked values */
+    if (!hasNonDefaultValsPerSite)
+        VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE;
+
+    /* Sum the value sites over all value kinds */
+    while (kind <= IPVK_Last) {
+        NumVSites += Data->num_value_sites[kind];
+        ++kind;
+    }
+
+    /* A record without any value site is not expected here */
+    bh_assert(NumVSites > 0 && "NumVSites can't be zero");
+
+    table_bytes = (uint64)sizeof(ValueProfNode *) * NumVSites;
+    if (table_bytes > UINT32_MAX)
+        return 0;
+
+    site_heads = (ValueProfNode **)wasm_runtime_malloc((uint32)table_bytes);
+    if (!site_heads)
+        return 0;
+    memset(site_heads, 0, (uint32)table_bytes);
+
+    /* Install the table only if none has been installed in the meantime */
+    if (cmpxchg_ptr((void **)&Data->values, NULL, site_heads))
+        return 1;
+
+    wasm_runtime_free(site_heads);
+    return 0;
+}
+
+/* Allocate one zero-initialized ValueProfNode; returns NULL when the
+   allocation fails. */
+static ValueProfNode *
+allocateOneNode(void)
+{
+    ValueProfNode *new_node =
+        wasm_runtime_malloc((uint32)sizeof(ValueProfNode));
+
+    if (!new_node)
+        return NULL;
+
+    memset(new_node, 0, sizeof(ValueProfNode));
+    return new_node;
+}
+
+/* Add CountValue observations of TargetValue to value site CounterIndex of
+   the profile record Data (treated as an LLVMProfileData *).
+   Each site owns a singly linked list of (value, count) nodes holding at
+   most VPMaxNumValsPerSite entries. Does nothing when Data is NULL, when
+   CountValue is zero, or when the site table cannot be allocated.
+   NOTE(review): CounterIndex is used to index the site table without a
+   range check - confirm that callers guarantee it is in range. */
+static void
+instrumentTargetValueImpl(uint64 TargetValue, void *Data, uint32 CounterIndex,
+                          uint64 CountValue)
+{
+    ValueProfNode **ValueCounters;
+    ValueProfNode *PrevVNode = NULL, *MinCountVNode = NULL, *CurVNode;
+    LLVMProfileData *PData = (LLVMProfileData *)Data;
+    uint64 MinCount = UINT64_MAX;
+    uint8 VDataCount = 0;
+    bool success = false;
+
+    if (!PData)
+        return;
+    if (!CountValue)
+        return;
+    /* The table of per-site list heads is created on first use */
+    if (!PData->values) {
+        if (!allocateValueProfileCounters(PData))
+            return;
+    }
+
+    ValueCounters = (ValueProfNode **)PData->values;
+    CurVNode = ValueCounters[CounterIndex];
+
+    /* Walk the site's list: bump the count on a match, otherwise remember
+       the node with the smallest count and the tail of the list */
+    while (CurVNode) {
+        if (TargetValue == CurVNode->value) {
+            CurVNode->count += CountValue;
+            return;
+        }
+        if (CurVNode->count < MinCount) {
+            MinCount = CurVNode->count;
+            MinCountVNode = CurVNode;
+        }
+        PrevVNode = CurVNode;
+        CurVNode = CurVNode->next;
+        ++VDataCount;
+    }
+
+    /* The list is full: either hand the least-counted node over to the new
+       value, or reduce that node's count by CountValue */
+    if (VDataCount >= VPMaxNumValsPerSite) {
+        if (MinCountVNode->count <= CountValue) {
+            CurVNode = MinCountVNode;
+            CurVNode->value = TargetValue;
+            CurVNode->count = CountValue;
+        }
+        else
+            MinCountVNode->count -= CountValue;
+
+        return;
+    }
+
+    /* There is room left: create a node for the new value */
+    CurVNode = allocateOneNode();
+    if (!CurVNode)
+        return;
+    CurVNode->value = TargetValue;
+    CurVNode->count += CountValue;
+
+    /* Link the node as list head or after the tail, but only if that slot
+       is still empty */
+    if (!ValueCounters[CounterIndex]) {
+        success =
+            cmpxchg_ptr((void **)&ValueCounters[CounterIndex], NULL, CurVNode);
+    }
+    else if (PrevVNode && !PrevVNode->next) {
+        success = cmpxchg_ptr((void **)&PrevVNode->next, 0, CurVNode);
+    }
+
+    /* The node could not be linked, so this observation is dropped */
+    if (!success) {
+        wasm_runtime_free(CurVNode);
+    }
+}
+
+/* Value-profiling entry point: records a single occurrence of target_value
+   at value site counter_idx of the profile record data */
+void
+llvm_profile_instrument_target(uint64 target_value, void *data,
+                               uint32 counter_idx)
+{
+    instrumentTargetValueImpl(target_value, data, counter_idx, 1);
+}
+
+/* Count the bits that are set in u */
+static inline uint32
+popcount64(uint64 u)
+{
+    uint32 bits_set;
+
+    /* every iteration clears the lowest bit that is still set */
+    for (bits_set = 0; u != 0; bits_set++)
+        u &= u - 1;
+
+    return bits_set;
+}
+
+/* Count the leading zero bits of a 64-bit value; yields 64 for zero */
+static inline uint32
+clz64(uint64 type)
+{
+    uint32 zeros;
+
+    if (type == 0)
+        return 64;
+
+    /* shift left until the top bit becomes set */
+    for (zeros = 0; !(type & 0x8000000000000000LL); zeros++)
+        type <<= 1;
+
+    return zeros;
+}
+
+/* Reduce an observed memop size to the representative value of the range
+   it falls into, e.g. 5 -> 5, 22 -> 17, 99 -> 65, 256 -> 256, 1001 -> 513. */
+static uint64
+InstrProfGetRangeRepValue(uint64 Value)
+{
+    /* Sizes up to 8 are tracked one by one */
+    if (Value <= 8)
+        return Value;
+
+    /* Everything from 513 upwards shares the last range */
+    if (Value >= 513)
+        return 513;
+
+    /* An exact power of two represents itself */
+    if (popcount64(Value) == 1)
+        return Value;
+
+    /* Any other size maps to the previous power of two plus one */
+    return (((uint64)1) << (63 - clz64(Value))) + 1;
+}
+
+/* Value-profiling entry point for memop sizes: the observed size is first
+   reduced to its range representative, then recorded once at value site
+   counter_idx of the profile record data */
+void
+llvm_profile_instrument_memop(uint64 target_value, void *data,
+                              uint32 counter_idx)
+{
+    instrumentTargetValueImpl(InstrProfGetRangeRepValue(target_value), data,
+                              counter_idx, 1);
+}
+
+/* Compute the number of bytes needed to dump the PGO profile of
+   module_inst, scanning the module's data sections whose names start with
+   "__llvm_prf_data", "__llvm_prf_cnts" and "__llvm_prf_names".
+   Every p_* argument is optional (may be NULL) and receives an
+   intermediate figure of the computation; note that *p_padding_size is 8,
+   not 0, when the names section is already 8-byte aligned.
+   Returns the total size, or 0 - without writing any p_* argument - when
+   the summed size of the counter sections does not equal
+   (number of counters * 8). */
+static uint32
+get_pgo_prof_data_size(AOTModuleInstance *module_inst, uint32 *p_num_prof_data,
+                       uint32 *p_num_prof_counters, uint32 *p_padding_size,
+                       uint32 *p_prof_counters_size, uint32 *p_prof_names_size,
+                       uint32 *p_value_counters_size, uint8 **p_prof_names)
+{
+    AOTModule *module = (AOTModule *)module_inst->module;
+    LLVMProfileData *prof_data;
+    uint8 *prof_names = NULL;
+    uint32 num_prof_data = 0, num_prof_counters = 0, padding_size, i;
+    uint32 prof_counters_size = 0, prof_names_size = 0;
+    uint32 total_size, total_size_wo_value_counters;
+
+    /* Pass 1: count the data records and their counters, sum the counter
+       bytes and locate the names section */
+    for (i = 0; i < module->data_section_count; i++) {
+        if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) {
+            bh_assert(module->data_sections[i].size == sizeof(LLVMProfileData));
+            num_prof_data++;
+            prof_data = (LLVMProfileData *)module->data_sections[i].data;
+            num_prof_counters += prof_data->num_counters;
+        }
+        else if (!strncmp(module->data_sections[i].name, "__llvm_prf_cnts",
+                          15)) {
+            prof_counters_size += module->data_sections[i].size;
+        }
+        else if (!strncmp(module->data_sections[i].name, "__llvm_prf_names",
+                          16)) {
+            prof_names_size = module->data_sections[i].size;
+            prof_names = module->data_sections[i].data;
+        }
+    }
+
+    /* Each counter occupies 8 bytes; anything else is inconsistent */
+    if (prof_counters_size != num_prof_counters * sizeof(uint64))
+        return 0;
+
+    /* header + one 64-bit data record per data section + counters + names */
+    total_size = sizeof(LLVMProfileRawHeader)
+                 + num_prof_data * sizeof(LLVMProfileData_64)
+                 + prof_counters_size + prof_names_size;
+    /* the names are padded to a multiple of 8 bytes; a padding_size of 8
+       means that no padding is needed */
+    padding_size = sizeof(uint64) - (prof_names_size % sizeof(uint64));
+    if (padding_size != sizeof(uint64))
+        total_size += padding_size;
+
+    /* Total size excluding value counters */
+    total_size_wo_value_counters = total_size;
+
+    /* Pass 2: add the size of the value-profile record of every data
+       section that has at least one value site */
+    for (i = 0; i < module->data_section_count; i++) {
+        if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) {
+            uint32 j, k, num_value_sites, num_value_nodes;
+            ValueProfNode **values, *value_node;
+
+            prof_data = (LLVMProfileData *)module->data_sections[i].data;
+            values = prof_data->values;
+
+            if (prof_data->num_value_sites[0] > 0
+                || prof_data->num_value_sites[1] > 0) {
+                /* TotalSize (uint32) and NumValueKinds (uint32) */
+                total_size += 8;
+                for (j = 0; j < 2; j++) {
+                    if ((num_value_sites = prof_data->num_value_sites[j]) > 0) {
+                        /* ValueKind (uint32) and NumValueSites (uint32) */
+                        total_size += 8;
+                        /* (Value + Counter) group counts of each value site,
+                           each count is one byte */
+                        total_size += align_uint(num_value_sites, 8);
+
+                        /* values walks on across both value kinds; it is
+                           NULL when nothing was ever recorded */
+                        if (values) {
+                            for (k = 0; k < num_value_sites; k++) {
+                                num_value_nodes = 0;
+                                value_node = *values;
+                                while (value_node) {
+                                    num_value_nodes++;
+                                    value_node = value_node->next;
+                                }
+                                if (num_value_nodes) {
+                                    /* (Value + Counter) groups */
+                                    total_size += num_value_nodes * 8 * 2;
+                                }
+                                values++;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if (p_num_prof_data)
+        *p_num_prof_data = num_prof_data;
+    if (p_num_prof_counters)
+        *p_num_prof_counters = num_prof_counters;
+    if (p_padding_size)
+        *p_padding_size = padding_size;
+    if (p_prof_counters_size)
+        *p_prof_counters_size = prof_counters_size;
+    if (p_prof_names_size)
+        *p_prof_names_size = prof_names_size;
+    if (p_value_counters_size)
+        *p_value_counters_size = total_size - total_size_wo_value_counters;
+    if (p_prof_names)
+        *p_prof_names = prof_names;
+
+    return total_size;
+}
+
+/* Public wrapper: returns the size computed by get_pgo_prof_data_size()
+   for module_inst, requesting none of the intermediate figures */
+uint32
+aot_get_pgo_prof_data_size(AOTModuleInstance *module_inst)
+{
+    return get_pgo_prof_data_size(module_inst, NULL, NULL, NULL, NULL, NULL,
+                                  NULL, NULL);
+}
+
+/* Host byte-order probe: the int member is initialized to 1, so the char
+   member, which overlays its first byte, reads 1 only on a little-endian
+   host */
+static union {
+    int a;
+    char b;
+} __ue = { .a = 1 };
+
+/* evaluates to non-zero when the host is little-endian */
+#define is_little_endian() (__ue.b == 1)
+
+/**
+ * Serialize the PGO profile of a module instance into a buffer.
+ *
+ * The output consists of an LLVMProfileRawHeader, one LLVMProfileData_64
+ * record per "__llvm_prf_data" section, the contents of the
+ * "__llvm_prf_cnts" sections, the names section padded to a multiple of
+ * 8 bytes, and finally one value-profile record for every data record that
+ * has value sites. Multi-byte fields are byte-swapped on a big-endian host
+ * so that the output is always little-endian.
+ *
+ * Side effect: the offset_counters field of every profile data record in
+ * the module's data sections is rewritten.
+ *
+ * @param module_inst the module instance whose module holds the sections
+ * @param buf the destination buffer
+ * @param len the size of buf in bytes
+ *
+ * @return the number of bytes written, or 0 (nothing is written) when the
+ *         profile sections are inconsistent or len is too small
+ */
+uint32
+aot_dump_pgo_prof_data_to_buf(AOTModuleInstance *module_inst, char *buf,
+                              uint32 len)
+{
+    AOTModule *module = (AOTModule *)module_inst->module;
+    LLVMProfileRawHeader prof_header = { 0 };
+    LLVMProfileData *prof_data;
+    uint8 *prof_names = NULL;
+    uint32 num_prof_data = 0, num_prof_counters = 0, padding_size, i;
+    uint32 prof_counters_size = 0, prof_names_size = 0;
+    uint32 value_counters_size = 0, value_counters_size_backup = 0;
+    uint32 total_size, size;
+    int64 counters_delta, offset_counters;
+
+    total_size = get_pgo_prof_data_size(module_inst, &num_prof_data,
+                                        &num_prof_counters, &padding_size,
+                                        &prof_counters_size, &prof_names_size,
+                                        &value_counters_size, &prof_names);
+    /* A zero size reports inconsistent profile sections: in that case
+       nothing must be written, whatever len is */
+    if (total_size == 0 || len < total_size)
+        return 0;
+
+    value_counters_size_backup = value_counters_size;
+    value_counters_size = 0;
+
+    /* The counters follow the data records directly, so the distance from
+       the n-th data record to its counters shrinks by one record size per
+       record and grows by the counters already assigned */
+    prof_header.counters_delta = counters_delta =
+        sizeof(LLVMProfileData_64) * num_prof_data;
+    offset_counters = 0;
+    for (i = 0; i < module->data_section_count; i++) {
+        if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) {
+            prof_data = (LLVMProfileData *)module->data_sections[i].data;
+            prof_data->offset_counters = counters_delta + offset_counters;
+            offset_counters += prof_data->num_counters * sizeof(uint64);
+            counters_delta -= sizeof(LLVMProfileData_64);
+        }
+    }
+
+    prof_header.magic = 0xFF6C70726F667281LL;
+    /* Version 8 */
+    prof_header.version = 0x0000000000000008LL;
+    /* with VARIANT_MASK_IR_PROF (IR Instrumentation) */
+    prof_header.version |= 0x1ULL << 56;
+    /* with VARIANT_MASK_MEMPROF (Memory Profile) */
+    prof_header.version |= 0x1ULL << 62;
+    prof_header.num_prof_data = num_prof_data;
+    prof_header.num_prof_counters = num_prof_counters;
+    prof_header.names_size = prof_names_size;
+    prof_header.value_kind_last = 1;
+
+    /* the header fields that are not listed here are zero */
+    if (!is_little_endian()) {
+        aot_exchange_uint64((uint8 *)&prof_header.magic);
+        aot_exchange_uint64((uint8 *)&prof_header.version);
+        aot_exchange_uint64((uint8 *)&prof_header.num_prof_data);
+        aot_exchange_uint64((uint8 *)&prof_header.num_prof_counters);
+        aot_exchange_uint64((uint8 *)&prof_header.names_size);
+        aot_exchange_uint64((uint8 *)&prof_header.counters_delta);
+        aot_exchange_uint64((uint8 *)&prof_header.value_kind_last);
+    }
+
+    size = sizeof(LLVMProfileRawHeader);
+    bh_memcpy_s(buf, size, &prof_header, size);
+    buf += size;
+
+    for (i = 0; i < module->data_section_count; i++) {
+        if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) {
+            LLVMProfileData_64 *prof_data_64 = (LLVMProfileData_64 *)buf;
+
+            /* Convert LLVMProfileData to LLVMProfileData_64, the pointer width
+               in the output file is always 8 bytes */
+            prof_data = (LLVMProfileData *)module->data_sections[i].data;
+            prof_data_64->func_md5 = prof_data->func_md5;
+            prof_data_64->func_hash = prof_data->func_hash;
+            prof_data_64->offset_counters = prof_data->offset_counters;
+            prof_data_64->func_ptr = prof_data->func_ptr;
+            prof_data_64->values = (uint64)(uintptr_t)prof_data->values;
+            prof_data_64->num_counters = prof_data->num_counters;
+            prof_data_64->num_value_sites[0] = prof_data->num_value_sites[0];
+            prof_data_64->num_value_sites[1] = prof_data->num_value_sites[1];
+
+            /* swap every field exactly once */
+            if (!is_little_endian()) {
+                aot_exchange_uint64((uint8 *)&prof_data_64->func_md5);
+                aot_exchange_uint64((uint8 *)&prof_data_64->func_hash);
+                aot_exchange_uint64((uint8 *)&prof_data_64->offset_counters);
+                aot_exchange_uint64((uint8 *)&prof_data_64->func_ptr);
+                aot_exchange_uint64((uint8 *)&prof_data_64->values);
+                aot_exchange_uint32((uint8 *)&prof_data_64->num_counters);
+                aot_exchange_uint16((uint8 *)&prof_data_64->num_value_sites[0]);
+                aot_exchange_uint16((uint8 *)&prof_data_64->num_value_sites[1]);
+            }
+            buf += sizeof(LLVMProfileData_64);
+        }
+    }
+
+    for (i = 0; i < module->data_section_count; i++) {
+        if (!strncmp(module->data_sections[i].name, "__llvm_prf_cnts", 15)) {
+            size = module->data_sections[i].size;
+            bh_memcpy_s(buf, size, module->data_sections[i].data, size);
+            buf += size;
+        }
+    }
+
+    if (prof_names && prof_names_size > 0) {
+        size = prof_names_size;
+        bh_memcpy_s(buf, size, prof_names, size);
+        buf += size;
+        /* pad the names with zero bytes up to a multiple of 8 bytes */
+        padding_size = sizeof(uint64) - (prof_names_size % sizeof(uint64));
+        if (padding_size != sizeof(uint64)) {
+            char padding_buf[8] = { 0 };
+            bh_memcpy_s(buf, padding_size, padding_buf, padding_size);
+            buf += padding_size;
+        }
+    }
+
+    for (i = 0; i < module->data_section_count; i++) {
+        if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) {
+            uint32 j, k, num_value_sites, num_value_nodes;
+            ValueProfNode **values, **values_tmp, *value_node;
+
+            prof_data = (LLVMProfileData *)module->data_sections[i].data;
+            /* both cursors walk on across the two value kinds */
+            values = values_tmp = prof_data->values;
+
+            if (prof_data->num_value_sites[0] > 0
+                || prof_data->num_value_sites[1] > 0) {
+                uint32 *buf_total_size = (uint32 *)buf;
+
+                buf += 4; /* emit TotalSize later */
+                /* NumValueKinds */
+                *(uint32 *)buf = (prof_data->num_value_sites[0] > 0
+                                  && prof_data->num_value_sites[1] > 0)
+                                     ? 2
+                                     : 1;
+                if (!is_little_endian())
+                    aot_exchange_uint32((uint8 *)buf);
+                buf += 4;
+
+                for (j = 0; j < 2; j++) {
+                    if ((num_value_sites = prof_data->num_value_sites[j]) > 0) {
+                        /* ValueKind */
+                        *(uint32 *)buf = j;
+                        if (!is_little_endian())
+                            aot_exchange_uint32((uint8 *)buf);
+                        buf += 4;
+                        /* NumValueSites */
+                        *(uint32 *)buf = num_value_sites;
+                        if (!is_little_endian())
+                            aot_exchange_uint32((uint8 *)buf);
+                        buf += 4;
+
+                        /* one byte per site: the number of its nodes */
+                        for (k = 0; k < num_value_sites; k++) {
+                            num_value_nodes = 0;
+                            if (values_tmp) {
+                                value_node = *values_tmp;
+                                while (value_node) {
+                                    num_value_nodes++;
+                                    value_node = value_node->next;
+                                }
+                                values_tmp++;
+                            }
+                            /* a site holds at most this many nodes, and
+                               that maximum still fits into one byte */
+                            bh_assert(num_value_nodes
+                                      <= INSTR_PROF_MAX_NUM_VAL_PER_SITE);
+                            *(uint8 *)buf++ = (uint8)num_value_nodes;
+                        }
+                        if (num_value_sites % 8) {
+                            uint32 align_bytes = 8 - (num_value_sites % 8);
+
+                            /* zero the alignment bytes so that the output
+                               does not depend on the buffer's old contents */
+                            memset(buf, 0, align_bytes);
+                            buf += align_bytes;
+                        }
+
+                        /* the (value, count) pairs of every site */
+                        for (k = 0; k < num_value_sites; k++) {
+                            if (values) {
+                                value_node = *values;
+                                while (value_node) {
+                                    *(uint64 *)buf = value_node->value;
+                                    if (!is_little_endian())
+                                        aot_exchange_uint64((uint8 *)buf);
+                                    buf += 8;
+                                    *(uint64 *)buf = value_node->count;
+                                    if (!is_little_endian())
+                                        aot_exchange_uint64((uint8 *)buf);
+                                    buf += 8;
+                                    value_node = value_node->next;
+                                }
+                                values++;
+                            }
+                        }
+                    }
+                }
+
+                /* TotalSize is a 32-bit field: swapping it as 64 bits would
+                   also scramble the NumValueKinds field that follows it */
+                *(uint32 *)buf_total_size =
+                    (uint8 *)buf - (uint8 *)buf_total_size;
+                if (!is_little_endian())
+                    aot_exchange_uint32((uint8 *)buf_total_size);
+                value_counters_size += (uint8 *)buf - (uint8 *)buf_total_size;
+            }
+        }
+    }
+
+    bh_assert(value_counters_size == value_counters_size_backup);
+    (void)value_counters_size_backup;
+
+    return total_size;
+}
+#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */

+ 98 - 0
core/iwasm/aot/aot_runtime.h

@@ -41,6 +41,10 @@ typedef struct AOTObjectDataSection {
     char *name;
     uint8 *data;
     uint32 size;
+#if WASM_ENABLE_WAMR_COMPILER != 0 || WASM_ENABLE_JIT != 0
+    bool is_name_allocated;
+    bool is_data_allocated;
+#endif
 } AOTObjectDataSection;
 
 /* Relocation info */
@@ -51,6 +55,9 @@ typedef struct AOTRelocation {
     char *symbol_name;
     /* index in the symbol offset field */
     uint32 symbol_index;
+#if WASM_ENABLE_WAMR_COMPILER != 0 || WASM_ENABLE_JIT != 0
+    bool is_symbol_name_allocated;
+#endif
 } AOTRelocation;
 
 /* Relocation Group */
@@ -60,6 +67,9 @@ typedef struct AOTRelocationGroup {
     uint32 name_index;
     uint32 relocation_count;
     AOTRelocation *relocations;
+#if WASM_ENABLE_WAMR_COMPILER != 0 || WASM_ENABLE_JIT != 0
+    bool is_section_name_allocated;
+#endif
 } AOTRelocationGroup;
 
 /* AOT function instance */
@@ -108,6 +118,13 @@ typedef struct AOTUnwindInfo {
 #define PLT_ITEM_SIZE 12
 #endif
 
+#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)
+/* Node of a singly linked list that records one function index which
+   needs an entry in the module's GOT of function pointers */
+typedef struct GOTItem {
+    /* index of the AOT function */
+    uint32 func_idx;
+    /* next node, NULL at the end of the list */
+    struct GOTItem *next;
+} GOTItem, *GOTItemList;
+#endif
+
 typedef struct AOTModule {
     uint32 module_type;
 
@@ -204,6 +221,13 @@ typedef struct AOTModule {
     bool rtl_func_table_registered;
 #endif
 
+#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)
+    uint32 got_item_count;
+    GOTItemList got_item_list;
+    GOTItemList got_item_list_end;
+    void **got_func_ptrs;
+#endif
+
     /* data sections in AOT object file, including .data, .rodata
        and .rodata.cstN. */
     AOTObjectDataSection *data_sections;
@@ -294,6 +318,54 @@ typedef struct AOTFrame {
 #endif
 } AOTFrame;
 
+#if WASM_ENABLE_STATIC_PGO != 0
+/* Header placed at the start of the buffer written by
+   aot_dump_pgo_prof_data_to_buf(); that function fills in magic, version,
+   num_prof_data, num_prof_counters, names_size, counters_delta and
+   value_kind_last and leaves the remaining fields zero */
+typedef struct LLVMProfileRawHeader {
+    uint64 magic;
+    uint64 version;
+    uint64 binary_ids_size;
+    /* number of profile data records */
+    uint64 num_prof_data;
+    uint64 padding_bytes_before_counters;
+    /* total number of 8-byte counters */
+    uint64 num_prof_counters;
+    uint64 padding_bytes_after_counters;
+    /* size of the names section in bytes, without padding */
+    uint64 names_size;
+    uint64 counters_delta;
+    uint64 names_delta;
+    uint64 value_kind_last;
+} LLVMProfileRawHeader;
+
+/* One (value, count) entry in the singly linked list of a value site */
+typedef struct ValueProfNode {
+    /* the profiled value */
+    uint64 value;
+    /* how often the value was recorded */
+    uint64 count;
+    /* next entry of the same value site, NULL at the end of the list */
+    struct ValueProfNode *next;
+} ValueProfNode;
+
+/* The profiling data of data sections created by aot compiler and
+   used when profiling, the width of pointer can be 8 bytes (64-bit)
+   or 4 bytes (32-bit) */
+typedef struct LLVMProfileData {
+    uint64 func_md5;
+    uint64 func_hash;
+    /* rewritten by aot_dump_pgo_prof_data_to_buf() when dumping */
+    uint64 offset_counters;
+    uintptr_t func_ptr;
+    /* table of per-site list heads, NULL until the first value is
+       recorded for this record */
+    ValueProfNode **values;
+    /* number of 8-byte counters that belong to this record */
+    uint32 num_counters;
+    /* number of value sites per value kind: [0] indirect call target,
+       [1] memop size */
+    uint16 num_value_sites[2];
+} LLVMProfileData;
+
+/* The profiling data record as written to the output file: the width of
+   a pointer is fixed to 8 bytes, on the assumption that wamrc and
+   llvm-profdata are always used in 64-bit mode */
+typedef struct LLVMProfileData_64 {
+    uint64 func_md5;
+    uint64 func_hash;
+    uint64 offset_counters;
+    uint64 func_ptr;
+    uint64 values;
+    uint32 num_counters;
+    uint16 num_value_sites[2];
+} LLVMProfileData_64;
+#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */
+
 /**
  * Load a AOT module from aot file buffer
  * @param buf the byte buffer which contains the AOT file data
@@ -564,6 +636,32 @@ aot_dump_perf_profiling(const AOTModuleInstance *module_inst);
 const uint8 *
 aot_get_custom_section(const AOTModule *module, const char *name, uint32 *len);
 
+#if WASM_ENABLE_STATIC_PGO != 0
+void
+llvm_profile_instrument_target(uint64 target_value, void *data,
+                               uint32 counter_idx);
+
+void
+llvm_profile_instrument_memop(uint64 target_value, void *data,
+                              uint32 counter_idx);
+
+uint32
+aot_get_pgo_prof_data_size(AOTModuleInstance *module_inst);
+
+uint32
+aot_dump_pgo_prof_data_to_buf(AOTModuleInstance *module_inst, char *buf,
+                              uint32 len);
+
+void
+aot_exchange_uint16(uint8 *p_data);
+
+void
+aot_exchange_uint32(uint8 *p_data);
+
+void
+aot_exchange_uint64(uint8 *p_data);
+#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */
+
 #ifdef __cplusplus
 } /* end of extern "C" */
 #endif

+ 6 - 0
core/iwasm/aot/arch/aot_reloc_x86_32.c

@@ -8,6 +8,9 @@
 #define R_386_32 1    /* Direct 32 bit  */
 #define R_386_PC32 2  /* PC relative 32 bit */
 #define R_386_PLT32 4 /* 32-bit address ProcedureLinkageTable */
+#define R_386_TLS_GD_32                      \
+    24 /*  Direct 32 bit for general dynamic \
+           thread local data */
 
 #if !defined(_WIN32) && !defined(_WIN32_)
 /* clang-format off */
@@ -110,6 +113,9 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr,
 {
     switch (reloc_type) {
         case R_386_32:
+#if WASM_ENABLE_STATIC_PGO != 0
+        case R_386_TLS_GD_32:
+#endif
         {
             intptr_t value;
 

+ 18 - 5
core/iwasm/aot/arch/aot_reloc_x86_64.c

@@ -6,11 +6,13 @@
 #include "aot_reloc.h"
 
 #if !defined(BH_PLATFORM_WINDOWS)
-#define R_X86_64_64 1    /* Direct 64 bit  */
-#define R_X86_64_PC32 2  /* PC relative 32 bit signed */
-#define R_X86_64_PLT32 4 /* 32 bit PLT address */
-#define R_X86_64_32 10   /* Direct 32 bit zero extended */
-#define R_X86_64_32S 11  /* Direct 32 bit sign extended */
+#define R_X86_64_64 1       /* Direct 64 bit  */
+#define R_X86_64_PC32 2     /* PC relative 32 bit signed */
+#define R_X86_64_PLT32 4    /* 32 bit PLT address */
+#define R_X86_64_GOTPCREL 9 /* 32 bit signed PC relative offset to GOT */
+#define R_X86_64_32 10      /* Direct 32 bit zero extended */
+#define R_X86_64_32S 11     /* Direct 32 bit sign extended */
+#define R_X86_64_PC64 24    /* PC relative 64 bit */
 #else
 #ifndef IMAGE_REL_AMD64_ADDR64
 #define IMAGE_REL_AMD64_ADDR64 1 /* The 64-bit VA of the relocation target */
@@ -164,6 +166,7 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr,
 #endif
 #if !defined(BH_PLATFORM_WINDOWS)
         case R_X86_64_PC32:
+        case R_X86_64_GOTPCREL: /* GOT + G has been calculated as symbol_addr */
         {
             intptr_t target_addr = (intptr_t) /* S + A - P */
                 ((uintptr_t)symbol_addr + reloc_addend
@@ -182,6 +185,16 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr,
             *(int32 *)(target_section_addr + reloc_offset) = (int32)target_addr;
             break;
         }
+        case R_X86_64_PC64:
+        {
+            intptr_t target_addr = (intptr_t) /* S + A - P */
+                ((uintptr_t)symbol_addr + reloc_addend
+                 - (uintptr_t)(target_section_addr + reloc_offset));
+
+            CHECK_RELOC_OFFSET(sizeof(int64));
+            *(int64 *)(target_section_addr + reloc_offset) = (int64)target_addr;
+            break;
+        }
         case R_X86_64_32:
         case R_X86_64_32S:
         {

+ 6 - 1
core/iwasm/common/wasm_memory.c

@@ -624,6 +624,11 @@ wasm_enlarge_memory_internal(WASMModuleInstance *module, uint32 inc_page_count)
 #endif
 #endif
 
+#if defined(os_writegsbase)
+    /* write base addr of linear memory to GS segment register */
+    os_writegsbase(memory_data_new);
+#endif
+
     return ret;
 }
 #else
@@ -756,4 +761,4 @@ wasm_get_linear_memory_size(WASMMemoryInstance *memory, void *node)
 #endif
     return linear_mem_size;
 }
-#endif
+#endif

+ 35 - 1
core/iwasm/common/wasm_runtime_common.c

@@ -130,7 +130,7 @@ static JitCompOptions jit_options = { 0 };
 #endif
 
 #if WASM_ENABLE_JIT != 0
-static LLVMJITOptions llvm_jit_options = { 3, 3 };
+static LLVMJITOptions llvm_jit_options = { 3, 3, 0 };
 #endif
 
 static RunningMode runtime_running_mode = Mode_Default;
@@ -554,6 +554,7 @@ wasm_runtime_full_init(RuntimeInitArgs *init_args)
 #if WASM_ENABLE_JIT != 0
     llvm_jit_options.size_level = init_args->llvm_jit_size_level;
     llvm_jit_options.opt_level = init_args->llvm_jit_opt_level;
+    llvm_jit_options.segue_flags = init_args->segue_flags;
 #endif
 
     if (!wasm_runtime_env_init()) {
@@ -4212,6 +4213,12 @@ static V128FuncPtr invokeNative_V128 = (V128FuncPtr)(uintptr_t)invokeNative;
           || defined(BUILD_TARGET_RISCV64_LP64) */
 #endif /* end of defined(_WIN32) || defined(_WIN32_) */
 
+/* ASAN is not designed to work with custom stack unwinding or other low-level
+   things. Ignore a function that does some low-level magic (e.g. walking
+   through the thread's stack bypassing the frame boundaries). */
+#if defined(__GNUC__)
+__attribute__((no_sanitize_address))
+#endif
 bool
 wasm_runtime_invoke_native(WASMExecEnv *exec_env, void *func_ptr,
                            const WASMType *func_type, const char *signature,
@@ -5026,6 +5033,33 @@ wasm_runtime_dump_call_stack_to_buf(wasm_exec_env_t exec_env, char *buf,
 }
 #endif /* end of WASM_ENABLE_DUMP_CALL_STACK */
 
+#if WASM_ENABLE_STATIC_PGO != 0
+uint32
+wasm_runtime_get_pgo_prof_data_size(WASMModuleInstanceCommon *module_inst)
+{
+#if WASM_ENABLE_AOT != 0
+    if (module_inst->module_type == Wasm_Module_AoT) {
+        AOTModuleInstance *aot_inst = (AOTModuleInstance *)module_inst;
+        return aot_get_pgo_prof_data_size(aot_inst);
+    }
+#endif
+    return 0;
+}
+
+uint32
+wasm_runtime_dump_pgo_prof_data_to_buf(WASMModuleInstanceCommon *module_inst,
+                                       char *buf, uint32 len)
+{
+#if WASM_ENABLE_AOT != 0
+    if (module_inst->module_type == Wasm_Module_AoT) {
+        AOTModuleInstance *aot_inst = (AOTModuleInstance *)module_inst;
+        return aot_dump_pgo_prof_data_to_buf(aot_inst, buf, len);
+    }
+#endif
+    return 0;
+}
+#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */
+
 bool
 wasm_runtime_get_table_elem_type(const WASMModuleCommon *module_comm,
                                  uint32 table_idx, uint8 *out_elem_type,

+ 1 - 0
core/iwasm/common/wasm_runtime_common.h

@@ -420,6 +420,7 @@ typedef struct wasm_frame_t {
 typedef struct LLVMJITOptions {
     uint32 opt_level;
     uint32 size_level;
+    uint32 segue_flags;
 } LLVMJITOptions;
 #endif
 

+ 1 - 1
core/iwasm/common/wasm_shared_memory.c

@@ -384,7 +384,7 @@ wasm_runtime_atomic_wait(WASMModuleInstanceCommon *module, void *address,
 
     /* unit of timeout is nsec, convert it to usec */
     timeout_left = (uint64)timeout / 1000;
-    timeout_1sec = 1e6;
+    timeout_1sec = (uint64)1e6;
 
     while (1) {
         if (timeout < 0) {

+ 8 - 0
core/iwasm/compilation/aot_compiler.h

@@ -239,6 +239,13 @@ check_type_compatible(uint8 src_type, uint8 dst_type)
 #define FUNC_REF_TYPE comp_ctx->basic_types.funcref_type
 #define EXTERN_REF_TYPE comp_ctx->basic_types.externref_type
 
+#define INT8_PTR_TYPE_GS comp_ctx->basic_types.int8_ptr_type_gs
+#define INT16_PTR_TYPE_GS comp_ctx->basic_types.int16_ptr_type_gs
+#define INT32_PTR_TYPE_GS comp_ctx->basic_types.int32_ptr_type_gs
+#define INT64_PTR_TYPE_GS comp_ctx->basic_types.int64_ptr_type_gs
+#define F32_PTR_TYPE_GS comp_ctx->basic_types.float32_ptr_type_gs
+#define F64_PTR_TYPE_GS comp_ctx->basic_types.float64_ptr_type_gs
+
 #define I32_CONST(v) LLVMConstInt(I32_TYPE, v, true)
 #define I64_CONST(v) LLVMConstInt(I64_TYPE, v, true)
 #define F32_CONST(v) LLVMConstReal(F32_TYPE, v)
@@ -272,6 +279,7 @@ check_type_compatible(uint8 src_type, uint8 dst_type)
 
 #define V128_TYPE comp_ctx->basic_types.v128_type
 #define V128_PTR_TYPE comp_ctx->basic_types.v128_ptr_type
+#define V128_PTR_TYPE_GS comp_ctx->basic_types.v128_ptr_type_gs
 #define V128_i8x16_TYPE comp_ctx->basic_types.i8x16_vec_type
 #define V128_i16x8_TYPE comp_ctx->basic_types.i16x8_vec_type
 #define V128_i32x4_TYPE comp_ctx->basic_types.i32x4_vec_type

+ 284 - 32
core/iwasm/compilation/aot_emit_aot_file.c

@@ -111,6 +111,8 @@ typedef struct AOTSymbolList {
 
 /* AOT object data */
 typedef struct AOTObjectData {
+    AOTCompContext *comp_ctx;
+
     LLVMMemoryBufferRef mem_buf;
     LLVMBinaryRef binary;
 
@@ -119,6 +121,12 @@ typedef struct AOTObjectData {
     void *text;
     uint32 text_size;
 
+    void *text_unlikely;
+    uint32 text_unlikely_size;
+
+    void *text_hot;
+    uint32 text_hot_size;
+
     /* literal data and size */
     void *literal;
     uint32 literal_size;
@@ -558,8 +566,10 @@ get_init_data_section_size(AOTCompContext *comp_ctx, AOTCompData *comp_data,
 static uint32
 get_text_section_size(AOTObjectData *obj_data)
 {
-    return (sizeof(uint32) + obj_data->literal_size + obj_data->text_size + 3)
-           & ~3;
+    return sizeof(uint32) + align_uint(obj_data->literal_size, 4)
+           + align_uint(obj_data->text_size, 4)
+           + align_uint(obj_data->text_unlikely_size, 4)
+           + align_uint(obj_data->text_hot_size, 4);
 }
 
 static uint32
@@ -1702,12 +1712,28 @@ aot_emit_text_section(uint8 *buf, uint8 *buf_end, uint32 *p_offset,
     EMIT_U32(AOT_SECTION_TYPE_TEXT);
     EMIT_U32(section_size);
     EMIT_U32(obj_data->literal_size);
-    if (obj_data->literal_size > 0)
+
+    if (obj_data->literal_size > 0) {
         EMIT_BUF(obj_data->literal, obj_data->literal_size);
-    EMIT_BUF(obj_data->text, obj_data->text_size);
+        while (offset & 3)
+            EMIT_BUF(&placeholder, 1);
+    }
 
-    while (offset & 3)
-        EMIT_BUF(&placeholder, 1);
+    if (obj_data->text_size > 0) {
+        EMIT_BUF(obj_data->text, obj_data->text_size);
+        while (offset & 3)
+            EMIT_BUF(&placeholder, 1);
+    }
+    if (obj_data->text_unlikely_size > 0) {
+        EMIT_BUF(obj_data->text_unlikely, obj_data->text_unlikely_size);
+        while (offset & 3)
+            EMIT_BUF(&placeholder, 1);
+    }
+    if (obj_data->text_hot_size > 0) {
+        EMIT_BUF(obj_data->text_hot, obj_data->text_hot_size);
+        while (offset & 3)
+            EMIT_BUF(&placeholder, 1);
+    }
 
     if (offset - *p_offset != section_size + sizeof(uint32) * 2) {
         aot_set_last_error("emit text section failed.");
@@ -2211,11 +2237,23 @@ aot_resolve_text(AOTObjectData *obj_data)
         }
         while (
             !LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) {
-            if ((name = (char *)LLVMGetSectionName(sec_itr))
-                && !strcmp(name, ".text")) {
-                obj_data->text = (char *)LLVMGetSectionContents(sec_itr);
-                obj_data->text_size = (uint32)LLVMGetSectionSize(sec_itr);
-                break;
+            if ((name = (char *)LLVMGetSectionName(sec_itr))) {
+                if (!strcmp(name, ".text")) {
+                    obj_data->text = (char *)LLVMGetSectionContents(sec_itr);
+                    obj_data->text_size = (uint32)LLVMGetSectionSize(sec_itr);
+                }
+                else if (!strcmp(name, ".text.unlikely.")) {
+                    obj_data->text_unlikely =
+                        (char *)LLVMGetSectionContents(sec_itr);
+                    obj_data->text_unlikely_size =
+                        (uint32)LLVMGetSectionSize(sec_itr);
+                }
+                else if (!strcmp(name, ".text.hot.")) {
+                    obj_data->text_hot =
+                        (char *)LLVMGetSectionContents(sec_itr);
+                    obj_data->text_hot_size =
+                        (uint32)LLVMGetSectionSize(sec_itr);
+                }
             }
             LLVMMoveToNextSection(sec_itr);
         }
@@ -2253,7 +2291,8 @@ static bool
 get_relocations_count(LLVMSectionIteratorRef sec_itr, uint32 *p_count);
 
 static bool
-is_data_section(LLVMSectionIteratorRef sec_itr, char *section_name)
+is_data_section(AOTObjectData *obj_data, LLVMSectionIteratorRef sec_itr,
+                char *section_name)
 {
     uint32 relocation_count = 0;
 
@@ -2265,7 +2304,11 @@ is_data_section(LLVMSectionIteratorRef sec_itr, char *section_name)
             || !strncmp(section_name, ".rodata.str", strlen(".rodata.str"))
             || (!strcmp(section_name, ".rdata")
                 && get_relocations_count(sec_itr, &relocation_count)
-                && relocation_count > 0));
+                && relocation_count > 0)
+            || (obj_data->comp_ctx->enable_llvm_pgo
+                && (!strncmp(section_name, "__llvm_prf_cnts", 15)
+                    || !strncmp(section_name, "__llvm_prf_data", 15)
+                    || !strncmp(section_name, "__llvm_prf_names", 16))));
 }
 
 static bool
@@ -2281,7 +2324,7 @@ get_object_data_sections_count(AOTObjectData *obj_data, uint32 *p_count)
     }
     while (!LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) {
         if ((name = (char *)LLVMGetSectionName(sec_itr))
-            && (is_data_section(sec_itr, name))) {
+            && (is_data_section(obj_data, sec_itr, name))) {
             count++;
         }
         LLVMMoveToNextSection(sec_itr);
@@ -2306,6 +2349,9 @@ aot_resolve_object_data_sections(AOTObjectData *obj_data)
     }
 
     if (sections_count > 0) {
+        uint32 llvm_prf_cnts_idx = 0, llvm_prf_data_idx = 0;
+        char buf[32];
+
         size = (uint32)sizeof(AOTObjectDataSection) * sections_count;
         if (!(data_section = obj_data->data_sections =
                   wasm_runtime_malloc(size))) {
@@ -2322,10 +2368,46 @@ aot_resolve_object_data_sections(AOTObjectData *obj_data)
         while (
             !LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) {
             if ((name = (char *)LLVMGetSectionName(sec_itr))
-                && (is_data_section(sec_itr, name))) {
+                && (is_data_section(obj_data, sec_itr, name))) {
                 data_section->name = name;
-                data_section->data = (uint8 *)LLVMGetSectionContents(sec_itr);
-                data_section->size = (uint32)LLVMGetSectionSize(sec_itr);
+                if (obj_data->comp_ctx->enable_llvm_pgo
+                    && !strcmp(name, "__llvm_prf_cnts")) {
+                    snprintf(buf, sizeof(buf), "%s%u", name,
+                             llvm_prf_cnts_idx++);
+                    size = strlen(buf) + 1;
+                    if (!(data_section->name = wasm_runtime_malloc(size))) {
+                        aot_set_last_error(
+                            "allocate memory for data section name failed.");
+                        return false;
+                    }
+                    bh_memcpy_s(data_section->name, size, buf, size);
+                    data_section->is_name_allocated = true;
+                }
+                else if (obj_data->comp_ctx->enable_llvm_pgo
+                         && !strcmp(name, "__llvm_prf_data")) {
+                    snprintf(buf, sizeof(buf), "%s%u", name,
+                             llvm_prf_data_idx++);
+                    size = strlen(buf) + 1;
+                    if (!(data_section->name = wasm_runtime_malloc(size))) {
+                        aot_set_last_error(
+                            "allocate memory for data section name failed.");
+                        return false;
+                    }
+                    bh_memcpy_s(data_section->name, size, buf, size);
+                    data_section->is_name_allocated = true;
+                }
+
+                if (obj_data->comp_ctx->enable_llvm_pgo
+                    && !strcmp(name, "__llvm_prf_names")) {
+                    data_section->data = (uint8 *)aot_compress_aot_func_names(
+                        obj_data->comp_ctx, &data_section->size);
+                    data_section->is_data_allocated = true;
+                }
+                else {
+                    data_section->data =
+                        (uint8 *)LLVMGetSectionContents(sec_itr);
+                    data_section->size = (uint32)LLVMGetSectionSize(sec_itr);
+                }
                 data_section++;
             }
             LLVMMoveToNextSection(sec_itr);
@@ -2365,9 +2447,36 @@ aot_resolve_functions(AOTCompContext *comp_ctx, AOTObjectData *obj_data)
             && str_starts_with(name, prefix)) {
             func_index = (uint32)atoi(name + strlen(prefix));
             if (func_index < obj_data->func_count) {
+                LLVMSectionIteratorRef contain_section;
+                char *contain_section_name;
+
                 func = obj_data->funcs + func_index;
                 func->func_name = name;
-                func->text_offset = LLVMGetSymbolAddress(sym_itr);
+
+                if (!(contain_section = LLVMObjectFileCopySectionIterator(
+                          obj_data->binary))) {
+                    aot_set_last_error("llvm get section iterator failed.");
+                    LLVMDisposeSymbolIterator(sym_itr);
+                    return false;
+                }
+                LLVMMoveToContainingSection(contain_section, sym_itr);
+                contain_section_name =
+                    (char *)LLVMGetSectionName(contain_section);
+                LLVMDisposeSectionIterator(contain_section);
+
+                if (!strcmp(contain_section_name, ".text.unlikely.")) {
+                    func->text_offset = align_uint(obj_data->text_size, 4)
+                                        + LLVMGetSymbolAddress(sym_itr);
+                }
+                else if (!strcmp(contain_section_name, ".text.hot.")) {
+                    func->text_offset =
+                        align_uint(obj_data->text_size, 4)
+                        + align_uint(obj_data->text_unlikely_size, 4)
+                        + LLVMGetSymbolAddress(sym_itr);
+                }
+                else {
+                    func->text_offset = LLVMGetSymbolAddress(sym_itr);
+                }
             }
         }
         LLVMMoveToNextSymbol(sym_itr);
@@ -2478,9 +2587,86 @@ aot_resolve_object_relocation_group(AOTObjectData *obj_data,
         }
 
         /* set relocation fields */
-        relocation->relocation_offset = offset;
         relocation->relocation_type = (uint32)type;
         relocation->symbol_name = (char *)LLVMGetSymbolName(rel_sym);
+        relocation->relocation_offset = offset;
+        if (!strcmp(group->section_name, ".rela.text.unlikely.")
+            || !strcmp(group->section_name, ".rel.text.unlikely.")) {
+            relocation->relocation_offset += align_uint(obj_data->text_size, 4);
+        }
+        else if (!strcmp(group->section_name, ".rela.text.hot.")
+                 || !strcmp(group->section_name, ".rel.text.hot.")) {
+            relocation->relocation_offset +=
+                align_uint(obj_data->text_size, 4)
+                + align_uint(obj_data->text_unlikely_size, 4);
+        }
+        if (!strcmp(relocation->symbol_name, ".text.unlikely.")) {
+            relocation->symbol_name = ".text";
+            relocation->relocation_addend += align_uint(obj_data->text_size, 4);
+        }
+        if (!strcmp(relocation->symbol_name, ".text.hot.")) {
+            relocation->symbol_name = ".text";
+            relocation->relocation_addend +=
+                align_uint(obj_data->text_size, 4)
+                + align_uint(obj_data->text_unlikely_size, 4);
+        }
+
+        if (obj_data->comp_ctx->enable_llvm_pgo
+            && (!strcmp(relocation->symbol_name, "__llvm_prf_cnts")
+                || !strcmp(relocation->symbol_name, "__llvm_prf_data"))) {
+            LLVMSectionIteratorRef sec_itr;
+            char buf[32], *section_name;
+            uint32 prof_section_idx = 0;
+
+            if (!(sec_itr =
+                      LLVMObjectFileCopySectionIterator(obj_data->binary))) {
+                aot_set_last_error("llvm get section iterator failed.");
+                LLVMDisposeSymbolIterator(rel_sym);
+                goto fail;
+            }
+            while (!LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary,
+                                                         sec_itr)) {
+                section_name = (char *)LLVMGetSectionName(sec_itr);
+                if (section_name
+                    && !strcmp(section_name, relocation->symbol_name)) {
+                    if (LLVMGetSectionContainsSymbol(sec_itr, rel_sym))
+                        break;
+                    prof_section_idx++;
+                }
+                LLVMMoveToNextSection(sec_itr);
+            }
+            LLVMDisposeSectionIterator(sec_itr);
+
+            if (!strcmp(group->section_name, ".rela.text")
+                || !strcmp(group->section_name, ".rel.text")) {
+                snprintf(buf, sizeof(buf), "%s%u", relocation->symbol_name,
+                         prof_section_idx);
+                size = strlen(buf) + 1;
+                if (!(relocation->symbol_name = wasm_runtime_malloc(size))) {
+                    aot_set_last_error(
+                        "allocate memory for relocation symbol name failed.");
+                    LLVMDisposeSymbolIterator(rel_sym);
+                    goto fail;
+                }
+                bh_memcpy_s(relocation->symbol_name, size, buf, size);
+                relocation->is_symbol_name_allocated = true;
+            }
+            else if (!strncmp(group->section_name, ".rela__llvm_prf_data", 20)
+                     || !strncmp(group->section_name, ".rel__llvm_prf_data",
+                                 19)) {
+                snprintf(buf, sizeof(buf), "%s%u", relocation->symbol_name,
+                         prof_section_idx);
+                size = strlen(buf) + 1;
+                if (!(relocation->symbol_name = wasm_runtime_malloc(size))) {
+                    aot_set_last_error(
+                        "allocate memory for relocation symbol name failed.");
+                    LLVMDisposeSymbolIterator(rel_sym);
+                    goto fail;
+                }
+                bh_memcpy_s(relocation->symbol_name, size, buf, size);
+                relocation->is_symbol_name_allocated = true;
+            }
+        }
 
         /* for ".LCPIxxx", ".LJTIxxx", ".LBBxxx" and switch lookup table
          * relocation, transform the symbol name to real section name and set
@@ -2525,10 +2711,14 @@ fail:
 }
 
 static bool
-is_relocation_section_name(char *section_name)
+is_relocation_section_name(AOTObjectData *obj_data, char *section_name)
 {
     return (!strcmp(section_name, ".rela.text")
             || !strcmp(section_name, ".rel.text")
+            || !strcmp(section_name, ".rela.text.unlikely.")
+            || !strcmp(section_name, ".rel.text.unlikely.")
+            || !strcmp(section_name, ".rela.text.hot.")
+            || !strcmp(section_name, ".rel.text.hot.")
             || !strcmp(section_name, ".rela.literal")
             || !strcmp(section_name, ".rela.data")
             || !strcmp(section_name, ".rel.data")
@@ -2536,6 +2726,9 @@ is_relocation_section_name(char *section_name)
             || !strcmp(section_name, ".rel.sdata")
             || !strcmp(section_name, ".rela.rodata")
             || !strcmp(section_name, ".rel.rodata")
+            || (obj_data->comp_ctx->enable_llvm_pgo
+                && (!strcmp(section_name, ".rela__llvm_prf_data")
+                    || !strcmp(section_name, ".rel__llvm_prf_data")))
             /* ".rela.rodata.cst4/8/16/.." */
             || !strncmp(section_name, ".rela.rodata.cst",
                         strlen(".rela.rodata.cst"))
@@ -2545,14 +2738,15 @@ is_relocation_section_name(char *section_name)
 }
 
 static bool
-is_relocation_section(LLVMSectionIteratorRef sec_itr)
+is_relocation_section(AOTObjectData *obj_data, LLVMSectionIteratorRef sec_itr)
 {
     uint32 count = 0;
     char *name = (char *)LLVMGetSectionName(sec_itr);
     if (name) {
-        if (is_relocation_section_name(name))
+        if (is_relocation_section_name(obj_data, name))
             return true;
-        else if ((!strcmp(name, ".text") || !strcmp(name, ".rdata"))
+        else if ((!strcmp(name, ".text") || !strcmp(name, ".text.unlikely.")
+                  || !strcmp(name, ".text.hot.") || !strcmp(name, ".rdata"))
                  && get_relocations_count(sec_itr, &count) && count > 0)
             return true;
     }
@@ -2570,7 +2764,7 @@ get_relocation_groups_count(AOTObjectData *obj_data, uint32 *p_count)
         return false;
     }
     while (!LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) {
-        if (is_relocation_section(sec_itr)) {
+        if (is_relocation_section(obj_data, sec_itr)) {
             count++;
         }
         LLVMMoveToNextSection(sec_itr);
@@ -2586,7 +2780,7 @@ aot_resolve_object_relocation_groups(AOTObjectData *obj_data)
 {
     LLVMSectionIteratorRef sec_itr;
     AOTRelocationGroup *relocation_group;
-    uint32 group_count;
+    uint32 group_count, llvm_prf_data_idx = 0;
     char *name;
     uint32 size;
 
@@ -2612,14 +2806,50 @@ aot_resolve_object_relocation_groups(AOTObjectData *obj_data)
         return false;
     }
     while (!LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) {
-        if (is_relocation_section(sec_itr)) {
+        if (is_relocation_section(obj_data, sec_itr)) {
             name = (char *)LLVMGetSectionName(sec_itr);
             relocation_group->section_name = name;
+
+            if (obj_data->comp_ctx->enable_llvm_pgo
+                && (!strcmp(name, ".rela__llvm_prf_data")
+                    || !strcmp(name, ".rel__llvm_prf_data"))) {
+                char buf[32];
+                snprintf(buf, sizeof(buf), "%s%u", name, llvm_prf_data_idx);
+                size = strlen(buf) + 1;
+                if (!(relocation_group->section_name =
+                          wasm_runtime_malloc(size))) {
+                    aot_set_last_error(
+                        "allocate memory for section name failed.");
+                    LLVMDisposeSectionIterator(sec_itr);
+                    return false;
+                }
+                bh_memcpy_s(relocation_group->section_name, size, buf, size);
+                relocation_group->is_section_name_allocated = true;
+            }
+
             if (!aot_resolve_object_relocation_group(obj_data, relocation_group,
                                                      sec_itr)) {
                 LLVMDisposeSectionIterator(sec_itr);
                 return false;
             }
+
+            if (obj_data->comp_ctx->enable_llvm_pgo
+                && (!strcmp(name, ".rela__llvm_prf_data")
+                    || !strcmp(name, ".rel__llvm_prf_data"))) {
+                llvm_prf_data_idx++;
+            }
+
+            if (!strcmp(relocation_group->section_name, ".rela.text.unlikely.")
+                || !strcmp(relocation_group->section_name, ".rela.text.hot.")) {
+                relocation_group->section_name = ".rela.text";
+            }
+            else if (!strcmp(relocation_group->section_name,
+                             ".rel.text.unlikely.")
+                     || !strcmp(relocation_group->section_name,
+                                ".rel.text.hot.")) {
+                relocation_group->section_name = ".rel.text";
+            }
+
             relocation_group++;
         }
         LLVMMoveToNextSection(sec_itr);
@@ -2633,12 +2863,21 @@ static void
 destroy_relocation_groups(AOTRelocationGroup *relocation_groups,
                           uint32 relocation_group_count)
 {
-    uint32 i;
+    uint32 i, j;
     AOTRelocationGroup *relocation_group = relocation_groups;
 
-    for (i = 0; i < relocation_group_count; i++, relocation_group++)
-        if (relocation_group->relocations)
+    for (i = 0; i < relocation_group_count; i++, relocation_group++) {
+        if (relocation_group->relocations) {
+            for (j = 0; j < relocation_group->relocation_count; j++) {
+                if (relocation_group->relocations[j].is_symbol_name_allocated)
+                    wasm_runtime_free(
+                        relocation_group->relocations[j].symbol_name);
+            }
             wasm_runtime_free(relocation_group->relocations);
+        }
+        if (relocation_group->is_section_name_allocated)
+            wasm_runtime_free(relocation_group->section_name);
+    }
     wasm_runtime_free(relocation_groups);
 }
 
@@ -2664,8 +2903,20 @@ aot_obj_data_destroy(AOTObjectData *obj_data)
         LLVMDisposeMemoryBuffer(obj_data->mem_buf);
     if (obj_data->funcs)
         wasm_runtime_free(obj_data->funcs);
-    if (obj_data->data_sections)
+    if (obj_data->data_sections) {
+        uint32 i;
+        for (i = 0; i < obj_data->data_sections_count; i++) {
+            if (obj_data->data_sections[i].name
+                && obj_data->data_sections[i].is_name_allocated) {
+                wasm_runtime_free(obj_data->data_sections[i].name);
+            }
+            if (obj_data->data_sections[i].data
+                && obj_data->data_sections[i].is_data_allocated) {
+                wasm_runtime_free(obj_data->data_sections[i].data);
+            }
+        }
         wasm_runtime_free(obj_data->data_sections);
+    }
     if (obj_data->relocation_groups)
         destroy_relocation_groups(obj_data->relocation_groups,
                                   obj_data->relocation_group_count);
@@ -2688,6 +2939,7 @@ aot_obj_data_create(AOTCompContext *comp_ctx)
         return false;
     }
     memset(obj_data, 0, sizeof(AOTObjectData));
+    obj_data->comp_ctx = comp_ctx;
 
     bh_print_time("Begin to emit object file");
     if (comp_ctx->external_llc_compiler || comp_ctx->external_asm_compiler) {
@@ -2821,8 +3073,8 @@ aot_obj_data_create(AOTCompContext *comp_ctx)
     if (!aot_resolve_target_info(comp_ctx, obj_data)
         || !aot_resolve_text(obj_data) || !aot_resolve_literal(obj_data)
         || !aot_resolve_object_data_sections(obj_data)
-        || !aot_resolve_object_relocation_groups(obj_data)
-        || !aot_resolve_functions(comp_ctx, obj_data))
+        || !aot_resolve_functions(comp_ctx, obj_data)
+        || !aot_resolve_object_relocation_groups(obj_data))
         goto fail;
 
     return obj_data;

+ 1 - 5
core/iwasm/compilation/aot_emit_function.c

@@ -868,10 +868,6 @@ aot_compile_op_call(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 #if LLVM_VERSION_MAJOR >= 14
         LLVMTypeRef llvm_func_type;
 #endif
-        bool recursive_call =
-            (func_ctx == func_ctxes[func_idx - import_func_count]) ? true
-                                                                   : false;
-
         if (comp_ctx->is_indirect_mode) {
             LLVMTypeRef func_ptr_type;
 
@@ -971,7 +967,7 @@ aot_compile_op_call(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
         /* Check whether there was exception thrown when executing
            the function */
-        if (!tail_call && !recursive_call && comp_ctx->enable_bound_check
+        if (!tail_call && comp_ctx->enable_bound_check
             && !check_exception_thrown(comp_ctx, func_ctx))
             goto fail;
     }

+ 185 - 48
core/iwasm/compilation/aot_emit_memory.c

@@ -81,7 +81,7 @@ get_memory_curr_page_count(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);
 
 LLVMValueRef
 aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
-                          uint32 offset, uint32 bytes)
+                          uint32 offset, uint32 bytes, bool enable_segue)
 {
     LLVMValueRef offset_const = I32_CONST(offset);
     LLVMValueRef addr, maddr, offset1, cmp1, cmp2, cmp;
@@ -162,11 +162,20 @@ aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
             /* inside memory space */
             offset1 = I32_CONST((uint32)mem_offset);
             CHECK_LLVM_CONST(offset1);
-            if (!(maddr = LLVMBuildInBoundsGEP2(comp_ctx->builder, INT8_TYPE,
-                                                mem_base_addr, &offset1, 1,
-                                                "maddr"))) {
-                aot_set_last_error("llvm build add failed.");
-                goto fail;
+            if (!enable_segue) {
+                if (!(maddr = LLVMBuildInBoundsGEP2(comp_ctx->builder,
+                                                    INT8_TYPE, mem_base_addr,
+                                                    &offset1, 1, "maddr"))) {
+                    aot_set_last_error("llvm build add failed.");
+                    goto fail;
+                }
+            }
+            else {
+                if (!(maddr = LLVMBuildIntToPtr(comp_ctx->builder, offset1,
+                                                INT8_PTR_TYPE_GS, "maddr"))) {
+                    aot_set_last_error("llvm build IntToPtr failed.");
+                    goto fail;
+                }
             }
             return maddr;
         }
@@ -244,11 +253,29 @@ aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         }
     }
 
-    /* maddr = mem_base_addr + offset1 */
-    if (!(maddr = LLVMBuildInBoundsGEP2(comp_ctx->builder, INT8_TYPE,
+    if (!enable_segue) {
+        /* maddr = mem_base_addr + offset1 */
+        if (!(maddr =
+                  LLVMBuildInBoundsGEP2(comp_ctx->builder, INT8_TYPE,
                                         mem_base_addr, &offset1, 1, "maddr"))) {
-        aot_set_last_error("llvm build add failed.");
-        goto fail;
+            aot_set_last_error("llvm build add failed.");
+            goto fail;
+        }
+    }
+    else {
+        LLVMValueRef maddr_base;
+
+        if (!(maddr_base = LLVMBuildIntToPtr(comp_ctx->builder, addr,
+                                             INT8_PTR_TYPE_GS, "maddr_base"))) {
+            aot_set_last_error("llvm build int to ptr failed.");
+            goto fail;
+        }
+        if (!(maddr = LLVMBuildInBoundsGEP2(comp_ctx->builder, INT8_TYPE,
+                                            maddr_base, &offset_const, 1,
+                                            "maddr"))) {
+            aot_set_last_error("llvm build inboundgep failed.");
+            goto fail;
+        }
     }
     return maddr;
 fail:
@@ -388,13 +415,18 @@ aot_compile_op_i32_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 {
     LLVMValueRef maddr, value = NULL;
     LLVMTypeRef data_type;
+    bool enable_segue = comp_ctx->enable_segue_i32_load;
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            enable_segue)))
         return false;
 
     switch (bytes) {
         case 4:
-            BUILD_PTR_CAST(INT32_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT32_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT32_PTR_TYPE_GS);
 #if WASM_ENABLE_SHARED_MEMORY != 0
             if (atomic)
                 BUILD_ATOMIC_LOAD(align, I32_TYPE);
@@ -405,11 +437,17 @@ aot_compile_op_i32_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         case 2:
         case 1:
             if (bytes == 2) {
-                BUILD_PTR_CAST(INT16_PTR_TYPE);
+                if (!enable_segue)
+                    BUILD_PTR_CAST(INT16_PTR_TYPE);
+                else
+                    BUILD_PTR_CAST(INT16_PTR_TYPE_GS);
                 data_type = INT16_TYPE;
             }
             else {
-                BUILD_PTR_CAST(INT8_PTR_TYPE);
+                if (!enable_segue)
+                    BUILD_PTR_CAST(INT8_PTR_TYPE);
+                else
+                    BUILD_PTR_CAST(INT8_PTR_TYPE_GS);
                 data_type = INT8_TYPE;
             }
 
@@ -447,13 +485,18 @@ aot_compile_op_i64_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 {
     LLVMValueRef maddr, value = NULL;
     LLVMTypeRef data_type;
+    bool enable_segue = comp_ctx->enable_segue_i64_load;
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            enable_segue)))
         return false;
 
     switch (bytes) {
         case 8:
-            BUILD_PTR_CAST(INT64_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT64_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT64_PTR_TYPE_GS);
 #if WASM_ENABLE_SHARED_MEMORY != 0
             if (atomic)
                 BUILD_ATOMIC_LOAD(align, I64_TYPE);
@@ -465,15 +508,24 @@ aot_compile_op_i64_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         case 2:
         case 1:
             if (bytes == 4) {
-                BUILD_PTR_CAST(INT32_PTR_TYPE);
+                if (!enable_segue)
+                    BUILD_PTR_CAST(INT32_PTR_TYPE);
+                else
+                    BUILD_PTR_CAST(INT32_PTR_TYPE_GS);
                 data_type = I32_TYPE;
             }
             else if (bytes == 2) {
-                BUILD_PTR_CAST(INT16_PTR_TYPE);
+                if (!enable_segue)
+                    BUILD_PTR_CAST(INT16_PTR_TYPE);
+                else
+                    BUILD_PTR_CAST(INT16_PTR_TYPE_GS);
                 data_type = INT16_TYPE;
             }
             else {
-                BUILD_PTR_CAST(INT8_PTR_TYPE);
+                if (!enable_segue)
+                    BUILD_PTR_CAST(INT8_PTR_TYPE);
+                else
+                    BUILD_PTR_CAST(INT8_PTR_TYPE_GS);
                 data_type = INT8_TYPE;
             }
 
@@ -509,12 +561,18 @@ aot_compile_op_f32_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                         uint32 align, uint32 offset)
 {
     LLVMValueRef maddr, value;
+    bool enable_segue = comp_ctx->enable_segue_f32_load;
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4,
+                                            enable_segue)))
         return false;
 
-    BUILD_PTR_CAST(F32_PTR_TYPE);
+    if (!enable_segue)
+        BUILD_PTR_CAST(F32_PTR_TYPE);
+    else
+        BUILD_PTR_CAST(F32_PTR_TYPE_GS);
     BUILD_LOAD(F32_TYPE);
+
     PUSH_F32(value);
     return true;
 fail:
@@ -526,12 +584,18 @@ aot_compile_op_f64_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                         uint32 align, uint32 offset)
 {
     LLVMValueRef maddr, value;
+    bool enable_segue = comp_ctx->enable_segue_f64_load;
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8,
+                                            enable_segue)))
         return false;
 
-    BUILD_PTR_CAST(F64_PTR_TYPE);
+    if (!enable_segue)
+        BUILD_PTR_CAST(F64_PTR_TYPE);
+    else
+        BUILD_PTR_CAST(F64_PTR_TYPE_GS);
     BUILD_LOAD(F64_TYPE);
+
     PUSH_F64(value);
     return true;
 fail:
@@ -543,22 +607,33 @@ aot_compile_op_i32_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                          uint32 align, uint32 offset, uint32 bytes, bool atomic)
 {
     LLVMValueRef maddr, value;
+    bool enable_segue = comp_ctx->enable_segue_i32_store;
 
     POP_I32(value);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            enable_segue)))
         return false;
 
     switch (bytes) {
         case 4:
-            BUILD_PTR_CAST(INT32_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT32_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT32_PTR_TYPE_GS);
             break;
         case 2:
-            BUILD_PTR_CAST(INT16_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT16_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT16_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT16_TYPE);
             break;
         case 1:
-            BUILD_PTR_CAST(INT8_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT8_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT8_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT8_TYPE);
             break;
         default:
@@ -582,26 +657,40 @@ aot_compile_op_i64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                          uint32 align, uint32 offset, uint32 bytes, bool atomic)
 {
     LLVMValueRef maddr, value;
+    bool enable_segue = comp_ctx->enable_segue_i64_store;
 
     POP_I64(value);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            enable_segue)))
         return false;
 
     switch (bytes) {
         case 8:
-            BUILD_PTR_CAST(INT64_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT64_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT64_PTR_TYPE_GS);
             break;
         case 4:
-            BUILD_PTR_CAST(INT32_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT32_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT32_PTR_TYPE_GS);
             BUILD_TRUNC(value, I32_TYPE);
             break;
         case 2:
-            BUILD_PTR_CAST(INT16_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT16_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT16_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT16_TYPE);
             break;
         case 1:
-            BUILD_PTR_CAST(INT8_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT8_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT8_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT8_TYPE);
             break;
         default:
@@ -625,13 +714,18 @@ aot_compile_op_f32_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                          uint32 align, uint32 offset)
 {
     LLVMValueRef maddr, value;
+    bool enable_segue = comp_ctx->enable_segue_f32_store;
 
     POP_F32(value);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4,
+                                            enable_segue)))
         return false;
 
-    BUILD_PTR_CAST(F32_PTR_TYPE);
+    if (!enable_segue)
+        BUILD_PTR_CAST(F32_PTR_TYPE);
+    else
+        BUILD_PTR_CAST(F32_PTR_TYPE_GS);
     BUILD_STORE();
     return true;
 fail:
@@ -643,13 +737,18 @@ aot_compile_op_f64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                          uint32 align, uint32 offset)
 {
     LLVMValueRef maddr, value;
+    bool enable_segue = comp_ctx->enable_segue_f64_store;
 
     POP_F64(value);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8,
+                                            enable_segue)))
         return false;
 
-    BUILD_PTR_CAST(F64_PTR_TYPE);
+    if (!enable_segue)
+        BUILD_PTR_CAST(F64_PTR_TYPE);
+    else
+        BUILD_PTR_CAST(F64_PTR_TYPE_GS);
     BUILD_STORE();
     return true;
 fail:
@@ -1140,13 +1239,19 @@ aot_compile_op_atomic_rmw(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                           uint32 offset, uint32 bytes)
 {
     LLVMValueRef maddr, value, result;
+    bool enable_segue = (op_type == VALUE_TYPE_I32)
+                            ? comp_ctx->enable_segue_i32_load
+                                  && comp_ctx->enable_segue_i32_store
+                            : comp_ctx->enable_segue_i64_load
+                                  && comp_ctx->enable_segue_i64_store;
 
     if (op_type == VALUE_TYPE_I32)
         POP_I32(value);
     else
         POP_I64(value);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            enable_segue)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1154,19 +1259,31 @@ aot_compile_op_atomic_rmw(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     switch (bytes) {
         case 8:
-            BUILD_PTR_CAST(INT64_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT64_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT64_PTR_TYPE_GS);
             break;
         case 4:
-            BUILD_PTR_CAST(INT32_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT32_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT32_PTR_TYPE_GS);
             if (op_type == VALUE_TYPE_I64)
                 BUILD_TRUNC(value, I32_TYPE);
             break;
         case 2:
-            BUILD_PTR_CAST(INT16_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT16_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT16_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT16_TYPE);
             break;
         case 1:
-            BUILD_PTR_CAST(INT8_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT8_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT8_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT8_TYPE);
             break;
         default:
@@ -1208,6 +1325,11 @@ aot_compile_op_atomic_cmpxchg(AOTCompContext *comp_ctx,
                               uint32 align, uint32 offset, uint32 bytes)
 {
     LLVMValueRef maddr, value, expect, result;
+    bool enable_segue = (op_type == VALUE_TYPE_I32)
+                            ? comp_ctx->enable_segue_i32_load
+                                  && comp_ctx->enable_segue_i32_store
+                            : comp_ctx->enable_segue_i64_load
+                                  && comp_ctx->enable_segue_i64_store;
 
     if (op_type == VALUE_TYPE_I32) {
         POP_I32(value);
@@ -1218,7 +1340,8 @@ aot_compile_op_atomic_cmpxchg(AOTCompContext *comp_ctx,
         POP_I64(expect);
     }
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            enable_segue)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1226,22 +1349,34 @@ aot_compile_op_atomic_cmpxchg(AOTCompContext *comp_ctx,
 
     switch (bytes) {
         case 8:
-            BUILD_PTR_CAST(INT64_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT64_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT64_PTR_TYPE_GS);
             break;
         case 4:
-            BUILD_PTR_CAST(INT32_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT32_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT32_PTR_TYPE_GS);
             if (op_type == VALUE_TYPE_I64) {
                 BUILD_TRUNC(value, I32_TYPE);
                 BUILD_TRUNC(expect, I32_TYPE);
             }
             break;
         case 2:
-            BUILD_PTR_CAST(INT16_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT16_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT16_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT16_TYPE);
             BUILD_TRUNC(expect, INT16_TYPE);
             break;
         case 1:
-            BUILD_PTR_CAST(INT8_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT8_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT8_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT8_TYPE);
             BUILD_TRUNC(expect, INT8_TYPE);
             break;
@@ -1318,7 +1453,8 @@ aot_compile_op_atomic_wait(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     CHECK_LLVM_CONST(is_wait64);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            false)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1393,7 +1529,8 @@ aot_compiler_op_atomic_notify(AOTCompContext *comp_ctx,
 
     POP_I32(count);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            false)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))

+ 1 - 1
core/iwasm/compilation/aot_emit_memory.h

@@ -53,7 +53,7 @@ aot_compile_op_f64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
 LLVMValueRef
 aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
-                          uint32 offset, uint32 bytes);
+                          uint32 offset, uint32 bytes, bool enable_segue);
 
 bool
 aot_compile_op_memory_size(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);

+ 1 - 1
core/iwasm/compilation/aot_emit_variable.c

@@ -112,7 +112,7 @@ static bool
 compile_global(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                uint32 global_idx, bool is_set, bool is_aux_stack)
 {
-    AOTCompData *comp_data = comp_ctx->comp_data;
+    const AOTCompData *comp_data = comp_ctx->comp_data;
     uint32 import_global_count = comp_data->import_global_count;
     uint32 global_base_offset;
     uint32 global_offset;

+ 105 - 18
core/iwasm/compilation/aot_llvm.c

@@ -15,7 +15,7 @@
 #endif
 
 LLVMTypeRef
-wasm_type_to_llvm_type(AOTLLVMTypes *llvm_types, uint8 wasm_type)
+wasm_type_to_llvm_type(const AOTLLVMTypes *llvm_types, uint8 wasm_type)
 {
     switch (wasm_type) {
         case VALUE_TYPE_I32:
@@ -42,8 +42,8 @@ wasm_type_to_llvm_type(AOTLLVMTypes *llvm_types, uint8 wasm_type)
  * Add LLVM function
  */
 static LLVMValueRef
-aot_add_llvm_func(AOTCompContext *comp_ctx, LLVMModuleRef module,
-                  AOTFuncType *aot_func_type, uint32 func_index,
+aot_add_llvm_func(const AOTCompContext *comp_ctx, LLVMModuleRef module,
+                  const AOTFuncType *aot_func_type, uint32 func_index,
                   LLVMTypeRef *p_func_type)
 {
     LLVMValueRef func = NULL;
@@ -177,8 +177,9 @@ free_block_memory(AOTBlock *block)
  * Create first AOTBlock, or function block for the function
  */
 static AOTBlock *
-aot_create_func_block(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
-                      AOTFunc *func, AOTFuncType *aot_func_type)
+aot_create_func_block(const AOTCompContext *comp_ctx,
+                      const AOTFuncContext *func_ctx, const AOTFunc *func,
+                      const AOTFuncType *aot_func_type)
 {
     AOTBlock *aot_block;
     uint32 param_count = aot_func_type->param_count,
@@ -266,7 +267,8 @@ create_argv_buf(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 }
 
 static bool
-create_native_stack_bound(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+create_native_stack_bound(const AOTCompContext *comp_ctx,
+                          AOTFuncContext *func_ctx)
 {
     LLVMValueRef stack_bound_offset = I32_FOUR, stack_bound_addr;
 
@@ -288,7 +290,8 @@ create_native_stack_bound(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 }
 
 static bool
-create_native_stack_top_min(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+create_native_stack_top_min(const AOTCompContext *comp_ctx,
+                            AOTFuncContext *func_ctx)
 {
     LLVMValueRef offset = I32_NINE;
 
@@ -303,7 +306,7 @@ create_native_stack_top_min(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 }
 
 static bool
-create_aux_stack_info(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+create_aux_stack_info(const AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 {
     LLVMValueRef aux_stack_bound_offset = I32_SIX, aux_stack_bound_addr;
     LLVMValueRef aux_stack_bottom_offset = I32_SEVEN, aux_stack_bottom_addr;
@@ -355,7 +358,7 @@ create_aux_stack_info(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 }
 
 static bool
-create_native_symbol(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+create_native_symbol(const AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 {
     LLVMValueRef native_symbol_offset = I32_EIGHT, native_symbol_addr;
 
@@ -384,8 +387,9 @@ create_native_symbol(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 }
 
 static bool
-create_local_variables(AOTCompData *comp_data, AOTCompContext *comp_ctx,
-                       AOTFuncContext *func_ctx, AOTFunc *func)
+create_local_variables(const AOTCompData *comp_data,
+                       const AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                       const AOTFunc *func)
 {
     AOTFuncType *aot_func_type = comp_data->func_types[func->func_type_index];
     char local_name[32];
@@ -475,7 +479,7 @@ create_local_variables(AOTCompData *comp_data, AOTCompContext *comp_ctx,
 }
 
 static bool
-create_memory_info(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+create_memory_info(const AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                    LLVMTypeRef int8_ptr_type, uint32 func_index)
 {
     LLVMValueRef offset, mem_info_base;
@@ -807,7 +811,7 @@ create_memory_info(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 }
 
 static bool
-create_cur_exception(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+create_cur_exception(const AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 {
     LLVMValueRef offset;
 
@@ -823,7 +827,8 @@ create_cur_exception(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 }
 
 static bool
-create_func_type_indexes(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+create_func_type_indexes(const AOTCompContext *comp_ctx,
+                         AOTFuncContext *func_ctx)
 {
     LLVMValueRef offset, func_type_indexes_ptr;
     LLVMTypeRef int32_ptr_type;
@@ -861,7 +866,7 @@ create_func_type_indexes(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 }
 
 static bool
-create_func_ptrs(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+create_func_ptrs(const AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 {
     LLVMValueRef offset;
 
@@ -903,7 +908,7 @@ create_func_ptrs(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
  * Create function compiler context
  */
 static AOTFuncContext *
-aot_create_func_context(AOTCompData *comp_data, AOTCompContext *comp_ctx,
+aot_create_func_context(const AOTCompData *comp_data, AOTCompContext *comp_ctx,
                         AOTFunc *func, uint32 func_index)
 {
     AOTFuncContext *func_ctx;
@@ -1059,7 +1064,7 @@ aot_destroy_func_contexts(AOTFuncContext **func_ctxes, uint32 count)
  * Create function compiler contexts
  */
 static AOTFuncContext **
-aot_create_func_contexts(AOTCompData *comp_data, AOTCompContext *comp_ctx)
+aot_create_func_contexts(const AOTCompData *comp_data, AOTCompContext *comp_ctx)
 {
     AOTFuncContext **func_ctxes;
     uint64 size;
@@ -1127,6 +1132,28 @@ aot_set_llvm_basic_types(AOTLLVMTypes *basic_types, LLVMContextRef context)
     basic_types->v128_type = basic_types->i64x2_vec_type;
     basic_types->v128_ptr_type = LLVMPointerType(basic_types->v128_type, 0);
 
+    basic_types->int8_ptr_type_gs =
+        LLVMPointerType(basic_types->int8_type, 256);
+    basic_types->int16_ptr_type_gs =
+        LLVMPointerType(basic_types->int16_type, 256);
+    basic_types->int32_ptr_type_gs =
+        LLVMPointerType(basic_types->int32_type, 256);
+    basic_types->int64_ptr_type_gs =
+        LLVMPointerType(basic_types->int64_type, 256);
+    basic_types->float32_ptr_type_gs =
+        LLVMPointerType(basic_types->float32_type, 256);
+    basic_types->float64_ptr_type_gs =
+        LLVMPointerType(basic_types->float64_type, 256);
+    basic_types->v128_ptr_type_gs =
+        LLVMPointerType(basic_types->v128_type, 256);
+    if (!basic_types->int8_ptr_type_gs || !basic_types->int16_ptr_type_gs
+        || !basic_types->int32_ptr_type_gs || !basic_types->int64_ptr_type_gs
+        || !basic_types->float32_ptr_type_gs
+        || !basic_types->float64_ptr_type_gs
+        || !basic_types->v128_ptr_type_gs) {
+        return false;
+    }
+
     basic_types->i1x2_vec_type = LLVMVectorType(basic_types->int1_type, 2);
 
     basic_types->funcref_type = LLVMInt32TypeInContext(context);
@@ -1536,7 +1563,7 @@ aot_compiler_destroy(void)
 }
 
 AOTCompContext *
-aot_create_comp_context(AOTCompData *comp_data, aot_comp_option_t option)
+aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option)
 {
     AOTCompContext *comp_ctx, *ret = NULL;
     LLVMTargetRef target;
@@ -1643,6 +1670,12 @@ aot_create_comp_context(AOTCompData *comp_data, aot_comp_option_t option)
     if (option->disable_llvm_lto)
         comp_ctx->disable_llvm_lto = true;
 
+    if (option->enable_llvm_pgo)
+        comp_ctx->enable_llvm_pgo = true;
+
+    if (option->use_prof_file)
+        comp_ctx->use_prof_file = option->use_prof_file;
+
     if (option->enable_stack_estimation)
         comp_ctx->enable_stack_estimation = true;
 
@@ -2007,6 +2040,7 @@ aot_create_comp_context(AOTCompData *comp_data, aot_comp_option_t option)
         os_printf("Create AoT compiler with:\n");
         os_printf("  target:        %s\n", comp_ctx->target_arch);
         os_printf("  target cpu:    %s\n", cpu);
+        os_printf("  target triple: %s\n", triple_norm);
         os_printf("  cpu features:  %s\n", features);
         os_printf("  opt level:     %d\n", opt_level);
         os_printf("  size level:    %d\n", size_level);
@@ -2025,6 +2059,8 @@ aot_create_comp_context(AOTCompData *comp_data, aot_comp_option_t option)
                 break;
         }
 
+        LLVMSetTarget(comp_ctx->module, triple_norm);
+
         if (!LLVMTargetHasTargetMachine(target)) {
             snprintf(buf, sizeof(buf),
                      "no target machine for this target (%s).", triple_norm);
@@ -2065,6 +2101,37 @@ aot_create_comp_context(AOTCompData *comp_data, aot_comp_option_t option)
         }
     }
 
+    triple = LLVMGetTargetMachineTriple(comp_ctx->target_machine);
+    if (!triple) {
+        aot_set_last_error("get target machine triple failed.");
+        goto fail;
+    }
+    if (strstr(triple, "linux") && !strcmp(comp_ctx->target_arch, "x86_64")) {
+        if (option->segue_flags) {
+            if (option->segue_flags & (1 << 0))
+                comp_ctx->enable_segue_i32_load = true;
+            if (option->segue_flags & (1 << 1))
+                comp_ctx->enable_segue_i64_load = true;
+            if (option->segue_flags & (1 << 2))
+                comp_ctx->enable_segue_f32_load = true;
+            if (option->segue_flags & (1 << 3))
+                comp_ctx->enable_segue_f64_load = true;
+            if (option->segue_flags & (1 << 4))
+                comp_ctx->enable_segue_v128_load = true;
+            if (option->segue_flags & (1 << 8))
+                comp_ctx->enable_segue_i32_store = true;
+            if (option->segue_flags & (1 << 9))
+                comp_ctx->enable_segue_i64_store = true;
+            if (option->segue_flags & (1 << 10))
+                comp_ctx->enable_segue_f32_store = true;
+            if (option->segue_flags & (1 << 11))
+                comp_ctx->enable_segue_f64_store = true;
+            if (option->segue_flags & (1 << 12))
+                comp_ctx->enable_segue_v128_store = true;
+        }
+    }
+    LLVMDisposeMessage(triple);
+
     if (option->enable_simd && strcmp(comp_ctx->target_arch, "x86_64") != 0
         && strncmp(comp_ctx->target_arch, "aarch64", 7) != 0) {
         /* Disable simd if it isn't supported by target arch */
@@ -2768,3 +2835,23 @@ aot_load_const_from_table(AOTCompContext *comp_ctx, LLVMValueRef base,
     (void)const_type;
     return const_value;
 }
+
+bool
+aot_set_cond_br_weights(AOTCompContext *comp_ctx, LLVMValueRef cond_br,
+                        int32 weights_true, int32 weights_false)
+{
+    LLVMMetadataRef md_nodes[3], meta_data;
+    LLVMValueRef meta_data_as_value;
+
+    md_nodes[0] = LLVMMDStringInContext2(comp_ctx->context, "branch_weights",
+                                         strlen("branch_weights"));
+    md_nodes[1] = LLVMValueAsMetadata(I32_CONST(weights_true));
+    md_nodes[2] = LLVMValueAsMetadata(I32_CONST(weights_false));
+
+    meta_data = LLVMMDNodeInContext2(comp_ctx->context, md_nodes, 3);
+    meta_data_as_value = LLVMMetadataAsValue(comp_ctx->context, meta_data);
+
+    LLVMSetMetadata(cond_br, 2, meta_data_as_value);
+
+    return true;
+}

+ 40 - 3
core/iwasm/compilation/aot_llvm.h

@@ -214,6 +214,14 @@ typedef struct AOTLLVMTypes {
     LLVMTypeRef f32x4_vec_type;
     LLVMTypeRef f64x2_vec_type;
 
+    LLVMTypeRef int8_ptr_type_gs;
+    LLVMTypeRef int16_ptr_type_gs;
+    LLVMTypeRef int32_ptr_type_gs;
+    LLVMTypeRef int64_ptr_type_gs;
+    LLVMTypeRef float32_ptr_type_gs;
+    LLVMTypeRef float64_ptr_type_gs;
+    LLVMTypeRef v128_ptr_type_gs;
+
     LLVMTypeRef i1x2_vec_type;
 
     LLVMTypeRef meta_data_type;
@@ -275,7 +283,7 @@ typedef struct AOTLLVMConsts {
  * Compiler context
  */
 typedef struct AOTCompContext {
-    AOTCompData *comp_data;
+    const AOTCompData *comp_data;
 
     /* LLVM variables required to emit LLVM IR */
     LLVMContextRef context;
@@ -341,6 +349,25 @@ typedef struct AOTCompContext {
     /* Disable LLVM link time optimization */
     bool disable_llvm_lto;
 
+    /* Enable LLVM PGO (Profile-Guided Optimization) */
+    bool enable_llvm_pgo;
+
+    /* Use profile file collected by LLVM PGO */
+    char *use_prof_file;
+
+    /* Enable using the segment register as the base addr
+       of linear memory for load/store operations */
+    bool enable_segue_i32_load;
+    bool enable_segue_i64_load;
+    bool enable_segue_f32_load;
+    bool enable_segue_f64_load;
+    bool enable_segue_v128_load;
+    bool enable_segue_i32_store;
+    bool enable_segue_i64_store;
+    bool enable_segue_f32_store;
+    bool enable_segue_f64_store;
+    bool enable_segue_v128_store;
+
     /* Whether optimize the JITed code */
     bool optimize;
 
@@ -407,12 +434,15 @@ typedef struct AOTCompOption {
     bool enable_aux_stack_frame;
     bool disable_llvm_intrinsics;
     bool disable_llvm_lto;
+    bool enable_llvm_pgo;
     bool enable_stack_estimation;
+    char *use_prof_file;
     uint32 opt_level;
     uint32 size_level;
     uint32 output_format;
     uint32 bounds_checks;
     uint32 stack_bounds_checks;
+    uint32 segue_flags;
     char **custom_sections;
     uint32 custom_sections_count;
     const char *stack_usage_file;
@@ -425,7 +455,7 @@ void
 aot_compiler_destroy(void);
 
 AOTCompContext *
-aot_create_comp_context(AOTCompData *comp_data, aot_comp_option_t option);
+aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option);
 
 void
 aot_destroy_comp_context(AOTCompContext *comp_ctx);
@@ -464,7 +494,7 @@ void
 aot_block_destroy(AOTBlock *block);
 
 LLVMTypeRef
-wasm_type_to_llvm_type(AOTLLVMTypes *llvm_types, uint8 wasm_type);
+wasm_type_to_llvm_type(const AOTLLVMTypes *llvm_types, uint8 wasm_type);
 
 bool
 aot_checked_addr_list_add(AOTFuncContext *func_ctx, uint32 local_idx,
@@ -519,6 +549,13 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module);
 void
 aot_handle_llvm_errmsg(const char *string, LLVMErrorRef err);
 
+char *
+aot_compress_aot_func_names(AOTCompContext *comp_ctx, uint32 *p_size);
+
+bool
+aot_set_cond_br_weights(AOTCompContext *comp_ctx, LLVMValueRef cond_br,
+                        int32 weights_true, int32 weights_false);
+
 #ifdef __cplusplus
 } /* end of extern "C" */
 #endif

+ 60 - 7
core/iwasm/compilation/aot_llvm_extra.cpp

@@ -44,6 +44,7 @@
 #if LLVM_VERSION_MAJOR >= 12
 #include <llvm/Analysis/AliasAnalysis.h>
 #endif
+#include <llvm/ProfileData/InstrProf.h>
 
 #include <cstring>
 #include "../aot/aot_runtime.h"
@@ -232,14 +233,26 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module)
     PTO.SLPVectorization = true;
     PTO.LoopUnrolling = true;
 
+    Optional<PGOOptions> PGO = None;
+    if (comp_ctx->enable_llvm_pgo) {
+        /* Disable static counter allocation for value profiler,
+           it will be allocated by runtime */
+        const char *argv[] = { "", "-vp-static-alloc=false" };
+        cl::ParseCommandLineOptions(2, argv);
+        PGO = PGOOptions("", "", "", PGOOptions::IRInstr);
+    }
+    else if (comp_ctx->use_prof_file) {
+        PGO = PGOOptions(comp_ctx->use_prof_file, "", "", PGOOptions::IRUse);
+    }
+
 #ifdef DEBUG_PASS
     PassInstrumentationCallbacks PIC;
-    PassBuilder PB(TM, PTO, None, &PIC);
+    PassBuilder PB(TM, PTO, PGO, &PIC);
 #else
 #if LLVM_VERSION_MAJOR == 12
-    PassBuilder PB(false, TM, PTO);
+    PassBuilder PB(false, TM, PTO, PGO);
 #else
-    PassBuilder PB(TM, PTO);
+    PassBuilder PB(TM, PTO, PGO);
 #endif
 #endif
 
@@ -334,8 +347,16 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module)
         FPM.addPass(SLPVectorizerPass());
         FPM.addPass(LoadStoreVectorizerPass());
 
+        if (comp_ctx->enable_llvm_pgo || comp_ctx->use_prof_file) {
+            LICMOptions licm_opt;
+            /* LICM pass: loop invariant code motion, attempting to remove
+               as much code from the body of a loop as possible. Experiments
+               show it is good to enable it when pgo is enabled. */
+            FPM.addPass(
+                createFunctionToLoopPassAdaptor(LICMPass(licm_opt), true));
+        }
+
         /*
-        FPM.addPass(createFunctionToLoopPassAdaptor(LICMPass()));
         FPM.addPass(createFunctionToLoopPassAdaptor(LoopRotatePass()));
         FPM.addPass(createFunctionToLoopPassAdaptor(SimpleLoopUnswitchPass()));
         */
@@ -344,9 +365,10 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module)
 
         if (!disable_llvm_lto) {
             /* Apply LTO for AOT mode */
-            if (comp_ctx->comp_data->func_count >= 10)
-                /* Adds the pre-link optimizations if the func count
-                   is large enough */
+            if (comp_ctx->comp_data->func_count >= 10
+                || comp_ctx->enable_llvm_pgo || comp_ctx->use_prof_file)
+                /* Add the pre-link optimizations if the func count
+                   is large enough or PGO is enabled */
                 MPM.addPass(PB.buildLTOPreLinkDefaultPipeline(OL));
             else
                 MPM.addPass(PB.buildLTODefaultPipeline(OL, NULL));
@@ -358,3 +380,34 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module)
 
     MPM.run(*M, MAM);
 }
+
+/**
+ * Build the compressed PGO function name string of the AOT functions
+ *
+ * One name is generated per function context, formed by AOT_FUNC_PREFIX
+ * followed by the function index, and the names are then packed by
+ * LLVM's collectPGOFuncNameStrings() with compression enabled.
+ *
+ * @param comp_ctx the compilation context, its func_ctx_count is read
+ * @param p_size returns the size in bytes of the returned buffer
+ *
+ * @return buffer allocated with wasm_runtime_malloc() which holds the
+ *         packed names, NULL if failed, in which case the error message
+ *         is set with aot_set_last_error() and *p_size isn't written
+ */
+char *
+aot_compress_aot_func_names(AOTCompContext *comp_ctx, uint32 *p_size)
+{
+    std::vector<std::string> NameStrs;
+    std::string Result;
+    char buf[32], *compressed_str;
+    uint32 compressed_str_len, i;
+
+    for (i = 0; i < comp_ctx->func_ctx_count; i++) {
+        /* the index is unsigned, so it is formatted with %u */
+        snprintf(buf, sizeof(buf), "%s%u", AOT_FUNC_PREFIX, i);
+        std::string str(buf);
+        NameStrs.push_back(str);
+    }
+
+    /* Error::operator bool() only marks a successful Error as checked,
+       and destroying a failed Error which wasn't handled aborts in LLVM
+       builds with error checking enabled, so use errorToBool() which
+       consumes the Error in both the success and the failure state */
+    if (llvm::errorToBool(collectPGOFuncNameStrings(NameStrs, true, Result))) {
+        aot_set_last_error("collect pgo func name strings failed");
+        return NULL;
+    }
+
+    compressed_str_len = (uint32)Result.size();
+    if (!(compressed_str = (char *)wasm_runtime_malloc(compressed_str_len))) {
+        aot_set_last_error("allocate memory failed");
+        return NULL;
+    }
+
+    bh_memcpy_s(compressed_str, compressed_str_len, Result.c_str(),
+                compressed_str_len);
+    *p_size = compressed_str_len;
+    return compressed_str;
+}

+ 52 - 22
core/iwasm/compilation/simd/simd_load_store.c

@@ -14,12 +14,12 @@
 static LLVMValueRef
 simd_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, uint32 align,
           uint32 offset, uint32 data_length, LLVMTypeRef ptr_type,
-          LLVMTypeRef data_type)
+          LLVMTypeRef data_type, bool enable_segue)
 {
     LLVMValueRef maddr, data;
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset,
-                                            data_length))) {
+                                            data_length, enable_segue))) {
         HANDLE_FAILURE("aot_check_memory_overflow");
         return NULL;
     }
@@ -44,10 +44,12 @@ bool
 aot_compile_simd_v128_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                            uint32 align, uint32 offset)
 {
+    bool enable_segue = comp_ctx->enable_segue_v128_load;
+    LLVMTypeRef v128_ptr_type = enable_segue ? V128_PTR_TYPE_GS : V128_PTR_TYPE;
     LLVMValueRef result;
 
     if (!(result = simd_load(comp_ctx, func_ctx, align, offset, 16,
-                             V128_PTR_TYPE, V128_TYPE))) {
+                             v128_ptr_type, V128_TYPE, enable_segue))) {
         return false;
     }
 
@@ -75,6 +77,7 @@ aot_compile_simd_load_extend(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         LLVMVectorType(I32_TYPE, 2),   LLVMVectorType(I32_TYPE, 2),
     };
     LLVMTypeRef sub_vector_type, sub_vector_ptr_type;
+    bool enable_segue = comp_ctx->enable_segue_v128_load;
 
     bh_assert(opcode_index < 6);
 
@@ -82,13 +85,15 @@ aot_compile_simd_load_extend(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     /* to vector ptr type */
     if (!sub_vector_type
-        || !(sub_vector_ptr_type = LLVMPointerType(sub_vector_type, 0))) {
+        || !(sub_vector_ptr_type =
+                 LLVMPointerType(sub_vector_type, enable_segue ? 256 : 0))) {
         HANDLE_FAILURE("LLVMPointerType");
         return false;
     }
 
-    if (!(sub_vector = simd_load(comp_ctx, func_ctx, align, offset, 8,
-                                 sub_vector_ptr_type, sub_vector_type))) {
+    if (!(sub_vector =
+              simd_load(comp_ctx, func_ctx, align, offset, 8,
+                        sub_vector_ptr_type, sub_vector_type, enable_segue))) {
         return false;
     }
 
@@ -118,6 +123,9 @@ aot_compile_simd_load_splat(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     LLVMValueRef element, result;
     LLVMTypeRef element_ptr_types[] = { INT8_PTR_TYPE, INT16_PTR_TYPE,
                                         INT32_PTR_TYPE, INT64_PTR_TYPE };
+    LLVMTypeRef element_ptr_types_gs[] = { INT8_PTR_TYPE_GS, INT16_PTR_TYPE_GS,
+                                           INT32_PTR_TYPE_GS,
+                                           INT64_PTR_TYPE_GS };
     LLVMTypeRef element_data_types[] = { INT8_TYPE, INT16_TYPE, I32_TYPE,
                                          I64_TYPE };
     uint32 data_lengths[] = { 1, 2, 4, 8 };
@@ -133,13 +141,16 @@ aot_compile_simd_load_splat(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         LLVM_CONST(i32x4_zero),
         LLVM_CONST(i32x2_zero),
     };
+    bool enable_segue = comp_ctx->enable_segue_v128_load;
 
     bh_assert(opcode_index < 4);
 
-    if (!(element = simd_load(comp_ctx, func_ctx, align, offset,
-                              data_lengths[opcode_index],
-                              element_ptr_types[opcode_index],
-                              element_data_types[opcode_index]))) {
+    if (!(element = simd_load(
+              comp_ctx, func_ctx, align, offset, data_lengths[opcode_index],
+              comp_ctx->enable_segue_v128_load
+                  ? element_ptr_types_gs[opcode_index]
+                  : element_ptr_types[opcode_index],
+              element_data_types[opcode_index], enable_segue))) {
         return false;
     }
 
@@ -170,11 +181,15 @@ aot_compile_simd_load_lane(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     uint32 data_lengths[] = { 1, 2, 4, 8 };
     LLVMTypeRef element_ptr_types[] = { INT8_PTR_TYPE, INT16_PTR_TYPE,
                                         INT32_PTR_TYPE, INT64_PTR_TYPE };
+    LLVMTypeRef element_ptr_types_gs[] = { INT8_PTR_TYPE_GS, INT16_PTR_TYPE_GS,
+                                           INT32_PTR_TYPE_GS,
+                                           INT64_PTR_TYPE_GS };
     LLVMTypeRef element_data_types[] = { INT8_TYPE, INT16_TYPE, I32_TYPE,
                                          I64_TYPE };
     LLVMTypeRef vector_types[] = { V128_i8x16_TYPE, V128_i16x8_TYPE,
                                    V128_i32x4_TYPE, V128_i64x2_TYPE };
     LLVMValueRef lane = simd_lane_id_to_llvm_value(comp_ctx, lane_id);
+    bool enable_segue = comp_ctx->enable_segue_v128_load;
 
     bh_assert(opcode_index < 4);
 
@@ -183,10 +198,12 @@ aot_compile_simd_load_lane(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         return false;
     }
 
-    if (!(element = simd_load(comp_ctx, func_ctx, align, offset,
-                              data_lengths[opcode_index],
-                              element_ptr_types[opcode_index],
-                              element_data_types[opcode_index]))) {
+    if (!(element = simd_load(
+              comp_ctx, func_ctx, align, offset, data_lengths[opcode_index],
+              comp_ctx->enable_segue_v128_load
+                  ? element_ptr_types_gs[opcode_index]
+                  : element_ptr_types[opcode_index],
+              element_data_types[opcode_index], enable_segue))) {
         return false;
     }
 
@@ -207,6 +224,8 @@ aot_compile_simd_load_zero(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     uint32 opcode_index = opcode - SIMD_v128_load32_zero;
     uint32 data_lengths[] = { 4, 8 };
     LLVMTypeRef element_ptr_types[] = { INT32_PTR_TYPE, INT64_PTR_TYPE };
+    LLVMTypeRef element_ptr_types_gs[] = { INT32_PTR_TYPE_GS,
+                                           INT64_PTR_TYPE_GS };
     LLVMTypeRef element_data_types[] = { I32_TYPE, I64_TYPE };
     LLVMValueRef zero[] = {
         LLVM_CONST(i32x4_vec_zero),
@@ -222,13 +241,16 @@ aot_compile_simd_load_zero(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
           LLVM_CONST(i32_six) },
         { LLVM_CONST(i32_zero), LLVM_CONST(i32_two) },
     };
+    bool enable_segue = comp_ctx->enable_segue_v128_load;
 
     bh_assert(opcode_index < 2);
 
-    if (!(element = simd_load(comp_ctx, func_ctx, align, offset,
-                              data_lengths[opcode_index],
-                              element_ptr_types[opcode_index],
-                              element_data_types[opcode_index]))) {
+    if (!(element = simd_load(
+              comp_ctx, func_ctx, align, offset, data_lengths[opcode_index],
+              comp_ctx->enable_segue_v128_load
+                  ? element_ptr_types_gs[opcode_index]
+                  : element_ptr_types[opcode_index],
+              element_data_types[opcode_index], enable_segue))) {
         return false;
     }
 
@@ -260,12 +282,12 @@ aot_compile_simd_load_zero(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 static bool
 simd_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, uint32 align,
            uint32 offset, uint32 data_length, LLVMValueRef value,
-           LLVMTypeRef value_ptr_type)
+           LLVMTypeRef value_ptr_type, bool enable_segue)
 {
     LLVMValueRef maddr, result;
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset,
-                                            data_length)))
+                                            data_length, enable_segue)))
         return false;
 
     if (!(maddr = LLVMBuildBitCast(comp_ctx->builder, maddr, value_ptr_type,
@@ -288,12 +310,14 @@ bool
 aot_compile_simd_v128_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                             uint32 align, uint32 offset)
 {
+    bool enable_segue = comp_ctx->enable_segue_v128_store;
+    LLVMTypeRef v128_ptr_type = enable_segue ? V128_PTR_TYPE_GS : V128_PTR_TYPE;
     LLVMValueRef value;
 
     POP_V128(value);
 
     return simd_store(comp_ctx, func_ctx, align, offset, 16, value,
-                      V128_PTR_TYPE);
+                      v128_ptr_type, enable_segue);
 fail:
     return false;
 }
@@ -307,10 +331,14 @@ aot_compile_simd_store_lane(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     uint32 data_lengths[] = { 1, 2, 4, 8 };
     LLVMTypeRef element_ptr_types[] = { INT8_PTR_TYPE, INT16_PTR_TYPE,
                                         INT32_PTR_TYPE, INT64_PTR_TYPE };
+    LLVMTypeRef element_ptr_types_gs[] = { INT8_PTR_TYPE_GS, INT16_PTR_TYPE_GS,
+                                           INT32_PTR_TYPE_GS,
+                                           INT64_PTR_TYPE_GS };
     uint32 opcode_index = opcode - SIMD_v128_store8_lane;
     LLVMTypeRef vector_types[] = { V128_i8x16_TYPE, V128_i16x8_TYPE,
                                    V128_i32x4_TYPE, V128_i64x2_TYPE };
     LLVMValueRef lane = simd_lane_id_to_llvm_value(comp_ctx, lane_id);
+    bool enable_segue = comp_ctx->enable_segue_v128_store;
 
     bh_assert(opcode_index < 4);
 
@@ -327,5 +355,7 @@ aot_compile_simd_store_lane(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     return simd_store(comp_ctx, func_ctx, align, offset,
                       data_lengths[opcode_index], element,
-                      element_ptr_types[opcode_index]);
+                      enable_segue ? element_ptr_types_gs[opcode_index]
+                                   : element_ptr_types[opcode_index],
+                      enable_segue);
 }

+ 3 - 0
core/iwasm/include/aot_export.h

@@ -55,12 +55,15 @@ typedef struct AOTCompOption {
     bool enable_aux_stack_frame;
     bool disable_llvm_intrinsics;
     bool disable_llvm_lto;
+    bool enable_llvm_pgo;
     bool enable_stack_estimation;
+    char *use_prof_file;
     uint32_t opt_level;
     uint32_t size_level;
     uint32_t output_format;
     uint32_t bounds_checks;
     uint32_t stack_bounds_checks;
+    uint32_t segue_flags;
     char **custom_sections;
     uint32_t custom_sections_count;
     const char *stack_usage_file;

+ 31 - 4
core/iwasm/include/wasm_export.h

@@ -167,6 +167,8 @@ typedef struct RuntimeInitArgs {
     /* LLVM JIT opt and size level */
     uint32_t llvm_jit_opt_level;
     uint32_t llvm_jit_size_level;
+    /* Segue optimization flags for LLVM JIT */
+    uint32_t segue_flags;
 } RuntimeInitArgs;
 
 #ifndef WASM_VALKIND_T_DEFINED
@@ -1329,6 +1331,30 @@ WASM_RUNTIME_API_EXTERN uint32_t
 wasm_runtime_dump_call_stack_to_buf(wasm_exec_env_t exec_env, char *buf,
                                     uint32_t len);
 
+/**
+ * Get the size required to store the LLVM PGO profile data
+ *
+ * @param module_inst the WASM module instance
+ *
+ * @return size required to store the contents, 0 means error
+ */
+WASM_RUNTIME_API_EXTERN uint32_t
+wasm_runtime_get_pgo_prof_data_size(wasm_module_inst_t module_inst);
+
+/**
+ * Dump the LLVM PGO profile data to buffer
+ *
+ * @param module_inst the WASM module instance
+ * @param buf buffer to store the dumped content
+ * @param len length of the buffer
+ *
+ * @return bytes dumped to the buffer, 0 means error and data in buf
+ *         may be invalid
+ */
+WASM_RUNTIME_API_EXTERN uint32_t
+wasm_runtime_dump_pgo_prof_data_to_buf(wasm_module_inst_t module_inst,
+                                       char *buf, uint32_t len);
+
 /**
  * Get a custom section by name
  *
@@ -1351,20 +1377,21 @@ WASM_RUNTIME_API_EXTERN void
 wasm_runtime_get_version(uint32_t *major, uint32_t *minor, uint32_t *patch);
 
 /**
- * Check whether an import func `(import <module_name> <func_name> (func ...))` is linked or not
- * with runtime registered natvie functions
+ * Check whether an import func `(import <module_name> <func_name> (func ...))`
+ * is linked or not with runtime registered native functions
  */
 WASM_RUNTIME_API_EXTERN bool
 wasm_runtime_is_import_func_linked(const char *module_name,
                                    const char *func_name);
 
 /**
- * Check whether an import global `(import <module_name> <global_name> (global ...))` is linked or not
- * with runtime registered natvie globals
+ * Check whether an import global `(import <module_name> <global_name> (global ...))`
+ * is linked or not with runtime registered native globals
  */
 WASM_RUNTIME_API_EXTERN bool
 wasm_runtime_is_import_global_linked(const char *module_name,
                                      const char *global_name);
+
 /* clang-format on */
 
 #ifdef __cplusplus

+ 18 - 3
core/iwasm/interpreter/wasm_interp_classic.c

@@ -270,7 +270,7 @@ local_copysignf(float x, float y)
 {
     union {
         float f;
-        uint32_t i;
+        uint32 i;
     } ux = { x }, uy = { y };
     ux.i &= 0x7fffffff;
     ux.i |= uy.i & 0x80000000;
@@ -282,9 +282,9 @@ local_copysign(double x, double y)
 {
     union {
         double f;
-        uint64_t i;
+        uint64 i;
     } ux = { x }, uy = { y };
-    ux.i &= -1ULL / 2;
+    ux.i &= UINT64_MAX / 2;
     ux.i |= uy.i & 1ULL << 63;
     return ux.f;
 }
@@ -3986,6 +3986,12 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module,
 }
 
 #if WASM_ENABLE_FAST_JIT != 0
+/* ASAN is not designed to work with custom stack unwinding or other
+   low-level things, so ignore (do not instrument) a function that does
+   some low-level magic, e.g. walking through the thread's stack and
+   bypassing the frame boundaries */
+#if defined(__GNUC__)
+__attribute__((no_sanitize_address))
+#endif
 static void
 fast_jit_call_func_bytecode(WASMModuleInstance *module_inst,
                             WASMExecEnv *exec_env,
@@ -4225,6 +4231,15 @@ wasm_interp_call_wasm(WASMModuleInstance *module_inst, WASMExecEnv *exec_env,
 
     wasm_exec_env_set_cur_frame(exec_env, frame);
 
+#if defined(os_writegsbase)
+    {
+        WASMMemoryInstance *memory_inst = wasm_get_default_memory(module_inst);
+        if (memory_inst)
+            /* write base addr of linear memory to GS segment register */
+            os_writegsbase(memory_inst->memory_data);
+    }
+#endif
+
     if (function->is_import_func) {
 #if WASM_ENABLE_MULTI_MODULE != 0
         if (function->import_module_inst) {

+ 12 - 3
core/iwasm/interpreter/wasm_interp_fast.c

@@ -232,7 +232,7 @@ local_copysignf(float x, float y)
 {
     union {
         float f;
-        uint32_t i;
+        uint32 i;
     } ux = { x }, uy = { y };
     ux.i &= 0x7fffffff;
     ux.i |= uy.i & 0x80000000;
@@ -244,9 +244,9 @@ local_copysign(double x, double y)
 {
     union {
         double f;
-        uint64_t i;
+        uint64 i;
     } ux = { x }, uy = { y };
-    ux.i &= -1ULL / 2;
+    ux.i &= UINT64_MAX / 2;
     ux.i |= uy.i & 1ULL << 63;
     return ux.f;
 }
@@ -3979,6 +3979,15 @@ wasm_interp_call_wasm(WASMModuleInstance *module_inst, WASMExecEnv *exec_env,
 
     wasm_exec_env_set_cur_frame(exec_env, frame);
 
+#if defined(os_writegsbase)
+    {
+        WASMMemoryInstance *memory_inst = wasm_get_default_memory(module_inst);
+        if (memory_inst)
+            /* write base addr of linear memory to GS segment register */
+            os_writegsbase(memory_inst->memory_data);
+    }
+#endif
+
     if (function->is_import_func) {
 #if WASM_ENABLE_MULTI_MODULE != 0
         if (function->import_module_inst) {

+ 2 - 1
core/iwasm/interpreter/wasm_loader.c

@@ -3000,7 +3000,7 @@ init_llvm_jit_functions_stage1(WASMModule *module, char *error_buf,
     if (module->function_count == 0)
         return true;
 
-#if WASM_ENABLE_FAST_JIT != 0 && WASM_ENABLE_LLVM_JIT != 0
+#if WASM_ENABLE_FAST_JIT != 0 && WASM_ENABLE_LAZY_JIT != 0
     if (os_mutex_init(&module->tierup_wait_lock) != 0) {
         set_error_buf(error_buf, error_buf_size, "init jit tierup lock failed");
         return false;
@@ -3035,6 +3035,7 @@ init_llvm_jit_functions_stage1(WASMModule *module, char *error_buf,
     llvm_jit_options = wasm_runtime_get_llvm_jit_options();
     option.opt_level = llvm_jit_options.opt_level;
     option.size_level = llvm_jit_options.size_level;
+    option.segue_flags = llvm_jit_options.segue_flags;
 
 #if WASM_ENABLE_BULK_MEMORY != 0
     option.enable_bulk_memory = true;

+ 2 - 1
core/iwasm/interpreter/wasm_mini_loader.c

@@ -1843,7 +1843,7 @@ init_llvm_jit_functions_stage1(WASMModule *module, char *error_buf,
     if (module->function_count == 0)
         return true;
 
-#if WASM_ENABLE_FAST_JIT != 0 && WASM_ENABLE_LLVM_JIT != 0
+#if WASM_ENABLE_FAST_JIT != 0 && WASM_ENABLE_LAZY_JIT != 0
     if (os_mutex_init(&module->tierup_wait_lock) != 0) {
         set_error_buf(error_buf, error_buf_size, "init jit tierup lock failed");
         return false;
@@ -1876,6 +1876,7 @@ init_llvm_jit_functions_stage1(WASMModule *module, char *error_buf,
     option.is_jit_mode = true;
     option.opt_level = llvm_jit_options.opt_level;
     option.size_level = llvm_jit_options.size_level;
+    option.segue_flags = llvm_jit_options.segue_flags;
 
 #if WASM_ENABLE_BULK_MEMORY != 0
     option.enable_bulk_memory = true;

+ 0 - 1
core/iwasm/libraries/lib-pthread/lib_pthread_wrapper.c

@@ -561,7 +561,6 @@ pthread_create_wrapper(wasm_exec_env_t exec_env,
 #if WASM_ENABLE_LIBC_WASI != 0
     WASIContext *wasi_ctx;
 #endif
-    CApiFuncImport **new_c_api_func_imports = NULL;
 
     bh_assert(module);
     bh_assert(module_inst);

+ 1 - 1
core/iwasm/libraries/libc-wasi/sandboxed-system-primitives/src/ssp_config.h

@@ -41,7 +41,7 @@
 #endif
 
 #if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__EMSCRIPTEN__) \
-    && !defined(ESP_PLATFORM)
+    && !defined(ESP_PLATFORM) && !defined(DISABLE_CLOCK_NANOSLEEP)
 #define CONFIG_HAS_CLOCK_NANOSLEEP 1
 #else
 #define CONFIG_HAS_CLOCK_NANOSLEEP 0

+ 12 - 0
core/shared/platform/common/posix/posix_thread.c

@@ -492,6 +492,12 @@ destroy_stack_guard_pages()
 }
 #endif /* end of WASM_DISABLE_STACK_HW_BOUND_CHECK == 0 */
 
+/* ASAN is not designed to work with custom stack unwinding or other
+   low-level things, so ignore (do not instrument) a function that does
+   some low-level magic, e.g. walking through the thread's stack and
+   bypassing the frame boundaries */
+#if defined(__GNUC__)
+__attribute__((no_sanitize_address))
+#endif
 static void
 mask_signals(int how)
 {
@@ -506,6 +512,12 @@ mask_signals(int how)
 static os_thread_local_attribute struct sigaction prev_sig_act_SIGSEGV;
 static os_thread_local_attribute struct sigaction prev_sig_act_SIGBUS;
 
+/* ASAN is not designed to work with custom stack unwinding or other
+   low-level things, so ignore (do not instrument) a function that does
+   some low-level magic, e.g. walking through the thread's stack and
+   bypassing the frame boundaries */
+#if defined(__GNUC__)
+__attribute__((no_sanitize_address))
+#endif
 static void
 signal_callback(int sig_num, siginfo_t *sig_info, void *sig_ucontext)
 {

+ 1 - 0
core/shared/platform/include/platform_api_extension.h

@@ -130,6 +130,7 @@ os_thread_exit(void *retval);
 #define os_memory_order_release memory_order_release
 #define os_memory_order_seq_cst memory_order_seq_cst
 #define os_atomic_thread_fence atomic_thread_fence
+#define os_atomic_cmpxchg atomic_compare_exchange_strong
 #endif
 
 #endif /* end of os_atomic_thread_fence */

+ 14 - 0
core/shared/platform/linux/platform_internal.h

@@ -63,6 +63,20 @@ typedef sem_t korp_sem;
 
 #define bh_socket_t int
 
+#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)
+/* Write base_addr into the GS segment base with the wrgsbase instruction.
+   The macro argument is parenthesized so that an expression argument
+   (e.g. ptr + offset) is converted as a whole instead of having the
+   casts applied to its first operand only. */
+#define os_writegsbase(base_addr)                                 \
+    do {                                                          \
+        uint64 __gs_value = (uint64)(uintptr_t)(base_addr);       \
+        asm volatile("wrgsbase %0" ::"r"(__gs_value) : "memory"); \
+    } while (0)
+#if 0
+/* _writegsbase_u64 also works, but need to add -mfsgsbase flag for gcc */
+#include <immintrin.h>
+#define os_writegsbase(base_addr) \
+    _writegsbase_u64(((uint64)(uintptr_t)(base_addr)))
+#endif
+#endif
+
 #if WASM_DISABLE_HW_BOUND_CHECK == 0
 #if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)            \
     || defined(BUILD_TARGET_AARCH64) || defined(BUILD_TARGET_RISCV64_LP64D) \

+ 1 - 0
core/shared/platform/windows/platform_internal.h

@@ -26,6 +26,7 @@
 #include <malloc.h>
 #include <process.h>
 #include <winsock2.h>
+#include <ws2tcpip.h>
 #include <windows.h>
 #include <basetsd.h>
 

+ 1 - 1
core/shared/platform/windows/shared_platform.cmake

@@ -5,7 +5,7 @@ set (PLATFORM_SHARED_DIR ${CMAKE_CURRENT_LIST_DIR})
 
 add_definitions(-DBH_PLATFORM_WINDOWS)
 add_definitions(-DHAVE_STRUCT_TIMESPEC)
-
+add_definitions(-D_WINSOCK_DEPRECATED_NO_WARNINGS)
 
 include_directories(${PLATFORM_SHARED_DIR})
 include_directories(${PLATFORM_SHARED_DIR}/../include)

+ 1 - 1
core/version.h

@@ -7,5 +7,5 @@
 #define _WAMR_VERSION_H_
 #define WAMR_VERSION_MAJOR 1
 #define WAMR_VERSION_MINOR 2
-#define WAMR_VERSION_PATCH 1
+#define WAMR_VERSION_PATCH 2
 #endif

+ 3 - 1
doc/build_wasm_app.md

@@ -394,7 +394,7 @@ Examples: wamrc -o test.aot test.wasm
 
 ### Usage example
 ``` bash
-WAMRC_LLC_COMPILER=<path/to/your/compiler/driver> ./wamrc -o test.aot test.wasm
+WAMRC_LLC_COMPILER=/usr/local/opt/llvm@14/bin/clang WAMRC_LLC_FLAGS="--target=x86_64-pc-linux-gnu -mcmodel=medium -c -O3" ./wamrc -o test.aot test.wasm
 ```
 
 > Note: `wamrc` will verify whether the specified file exists and is executable. If the verification fails, `wamrc` will report a warning and fall back to the normal pipeline. Since the verification is based on the file, you **must specify the absolute path to the binary** even if it's in `$PATH`
@@ -403,6 +403,8 @@ WAMRC_LLC_COMPILER=<path/to/your/compiler/driver> ./wamrc -o test.aot test.wasm
 
 > Note: the `LLC` and `ASM` in the env name just means this compiler will be used to compile the `LLVM IR file`/`assembly file` to object file, usually passing the compiler driver is the simplest way. (e.g. for LLVM toolchain, you don't need to pass `/usr/bin/llc`, using `/usr/bin/clang` is OK)
 
+> Note: You might need to set `WAMRC_LLC_FLAGS`/`WAMRC_ASM_FLAGS` to match whatever the `wamrc` command would automatically do. In the above example, `-mcmodel=medium` corresponds to `wamrc --size-level=1`, which is the default of `wamrc` on macOS.
+
 Run WASM app in WAMR mini product build
 =======================================
 

+ 2 - 0
language-bindings/python/README.md

@@ -4,6 +4,8 @@ The WAMR Python package contains a set of high-level bindings for WAMR API and W
 
 ## Installation
 
+* **Notice**: This Python package needs Python >= `3.9`.
+
 To Install from local source tree in _development mode_ run the following command,
 
 ```bash

+ 1 - 0
language-bindings/python/setup.py

@@ -62,4 +62,5 @@ setup(
         'install': PreInstallCommand,
         'egg_info': PreEggInfoCommand,
     },
+    python_requires='>=3.9'
 )

+ 2 - 0
language-bindings/python/wamr-api/README.md

@@ -1,5 +1,7 @@
 # WAMR API
 
+* **Notice**: The Python package `wamr.wamrapi.wamr` needs Python >= `3.9`.
+
 ## Setup
 
 ### Pre-requisites

+ 0 - 18
language-bindings/python/wamr-api/samples/compile.sh

@@ -135,24 +135,6 @@ if (WAMR_BUILD_TARGET MATCHES "X86_.*" OR WAMR_BUILD_TARGET STREQUAL "AMD_64")
     set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mindirect-branch-register")
     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mindirect-branch-register")
     # UNDEFINED BEHAVIOR, refer to https://en.cppreference.com/w/cpp/language/ub
-    if(CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT WAMR_BUILD_JIT EQUAL 1)
-      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined \
-                                          -fno-sanitize=bounds,bounds-strict,alignment \
-                                          -fno-sanitize-recover")
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined \
-                                              -fno-sanitize=bounds,bounds-strict,alignment \
-                                              -fno-sanitize-recover")
-    endif()
-  else ()
-    # UNDEFINED BEHAVIOR, refer to https://en.cppreference.com/w/cpp/language/ub
-    if(CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT WAMR_BUILD_JIT EQUAL 1)
-      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined \
-                                          -fno-sanitize=bounds,alignment \
-                                          -fno-sanitize-recover")
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined \
-                                              -fno-sanitize=bounds,alignment \
-                                              -fno-sanitize-recover")
-    endif()
   endif ()
 endif ()
 

+ 134 - 5
product-mini/platforms/posix/main.c

@@ -54,6 +54,14 @@ print_help()
 #if WASM_ENABLE_JIT != 0
     printf("  --llvm-jit-size-level=n  Set LLVM JIT size level, default is 3\n");
     printf("  --llvm-jit-opt-level=n   Set LLVM JIT optimization level, default is 3\n");
+#if defined(os_writegsbase)
+    printf("  --enable-segue[=<flags>] Enable using segment register GS as the base address of\n");
+    printf("                           linear memory, which may improve performance, flags can be:\n");
+    printf("                              i32.load, i64.load, f32.load, f64.load, v128.load,\n");
+    printf("                              i32.store, i64.store, f32.store, f64.store, v128.store\n");
+    printf("                           Use comma to separate, e.g. --enable-segue=i32.load,i64.store\n");
+    printf("                           and --enable-segue means all flags are added.\n");
+#endif
 #endif
     printf("  --repl                   Start a very simple REPL (read-eval-print-loop) mode\n"
            "                           that runs commands in the form of \"FUNC ARG...\"\n");
@@ -89,6 +97,9 @@ print_help()
 #if WASM_ENABLE_DEBUG_INTERP != 0
     printf("  -g=ip:port               Set the debug sever address, default is debug disabled\n");
     printf("                             if port is 0, then a random port will be used\n");
+#endif
+#if WASM_ENABLE_STATIC_PGO != 0
+    printf("  --gen-prof-file=<path>   Generate LLVM PGO (Profile-Guided Optimization) profile file\n");
 #endif
     printf("  --version                Show version information\n");
     return 1;
@@ -117,13 +128,13 @@ app_instance_func(wasm_module_inst_t module_inst, const char *func_name)
 }
 
 /**
- * Split a space separated strings into an array of strings
+ * Split a string into an array of strings
  * Returns NULL on failure
  * Memory must be freed by caller
  * Based on: http://stackoverflow.com/a/11198630/471795
  */
 static char **
-split_string(char *str, int *count)
+split_string(char *str, int *count, const char *delimer)
 {
     char **res = NULL, **res1;
     char *p;
@@ -131,7 +142,7 @@ split_string(char *str, int *count)
 
     /* split string and append tokens to 'res' */
     do {
-        p = strtok(str, " ");
+        p = strtok(str, delimer);
         str = NULL;
         res1 = res;
         res = (char **)realloc(res1, sizeof(char *) * (uint32)(idx + 1));
@@ -180,7 +191,7 @@ app_instance_repl(wasm_module_inst_t module_inst)
             printf("exit repl mode\n");
             break;
         }
-        app_argv = split_string(cmd, &app_argc);
+        app_argv = split_string(cmd, &app_argc, " ");
         if (app_argv == NULL) {
             LOG_ERROR("Wasm prepare param failed: split string failed.\n");
             break;
@@ -195,6 +206,59 @@ app_instance_repl(wasm_module_inst_t module_inst)
     return NULL;
 }
 
+#if WASM_ENABLE_JIT != 0
+static uint32
+resolve_segue_flags(char *str_flags)
+{
+    uint32 segue_flags = 0;
+    int32 flag_count, i;
+    char **flag_list;
+
+    flag_list = split_string(str_flags, &flag_count, ",");
+    if (flag_list) {
+        for (i = 0; i < flag_count; i++) {
+            if (!strcmp(flag_list[i], "i32.load")) {
+                segue_flags |= 1 << 0;
+            }
+            else if (!strcmp(flag_list[i], "i64.load")) {
+                segue_flags |= 1 << 1;
+            }
+            else if (!strcmp(flag_list[i], "f32.load")) {
+                segue_flags |= 1 << 2;
+            }
+            else if (!strcmp(flag_list[i], "f64.load")) {
+                segue_flags |= 1 << 3;
+            }
+            else if (!strcmp(flag_list[i], "v128.load")) {
+                segue_flags |= 1 << 4;
+            }
+            else if (!strcmp(flag_list[i], "i32.store")) {
+                segue_flags |= 1 << 8;
+            }
+            else if (!strcmp(flag_list[i], "i64.store")) {
+                segue_flags |= 1 << 9;
+            }
+            else if (!strcmp(flag_list[i], "f32.store")) {
+                segue_flags |= 1 << 10;
+            }
+            else if (!strcmp(flag_list[i], "f64.store")) {
+                segue_flags |= 1 << 11;
+            }
+            else if (!strcmp(flag_list[i], "v128.store")) {
+                segue_flags |= 1 << 12;
+            }
+            else {
+                /* invalid flag */
+                segue_flags = (uint32)-1;
+                break;
+            }
+        }
+        free(flag_list);
+    }
+    return segue_flags;
+}
+#endif /* end of WASM_ENABLE_JIT != 0 */
+
 #if WASM_ENABLE_LIBC_WASI != 0
 static bool
 validate_env_str(char *env)
@@ -352,6 +416,44 @@ moudle_destroyer(uint8 *buffer, uint32 size)
 static char global_heap_buf[WASM_GLOBAL_HEAP_SIZE] = { 0 };
 #endif
 
+#if WASM_ENABLE_STATIC_PGO != 0
+static void
+dump_pgo_prof_data(wasm_module_inst_t module_inst, const char *path)
+{
+    char *buf;
+    uint32 len;
+    FILE *file;
+
+    if (!(len = wasm_runtime_get_pgo_prof_data_size(module_inst))) {
+        printf("failed to get LLVM PGO profile data size\n");
+        return;
+    }
+
+    if (!(buf = wasm_runtime_malloc(len))) {
+        printf("allocate memory failed\n");
+        return;
+    }
+
+    if (len != wasm_runtime_dump_pgo_prof_data_to_buf(module_inst, buf, len)) {
+        printf("failed to dump LLVM PGO profile data\n");
+        wasm_runtime_free(buf);
+        return;
+    }
+
+    if (!(file = fopen(path, "wb"))) {
+        printf("failed to create file %s", path);
+        wasm_runtime_free(buf);
+        return;
+    }
+    fwrite(buf, len, 1, file);
+    fclose(file);
+
+    wasm_runtime_free(buf);
+
+    printf("LLVM raw profile file %s was generated.\n", path);
+}
+#endif
+
 int
 main(int argc, char *argv[])
 {
@@ -367,6 +469,7 @@ main(int argc, char *argv[])
 #if WASM_ENABLE_JIT != 0
     uint32 llvm_jit_size_level = 3;
     uint32 llvm_jit_opt_level = 3;
+    uint32 segue_flags = 0;
 #endif
     wasm_module_t wasm_module = NULL;
     wasm_module_inst_t wasm_module_inst = NULL;
@@ -398,6 +501,9 @@ main(int argc, char *argv[])
     char *ip_addr = NULL;
     int instance_port = 0;
 #endif
+#if WASM_ENABLE_STATIC_PGO != 0
+    const char *gen_prof_file = NULL;
+#endif
 
     /* Process options. */
     for (argc--, argv++; argc > 0 && argv[0][0] == '-'; argc--, argv++) {
@@ -487,7 +593,16 @@ main(int argc, char *argv[])
                 llvm_jit_opt_level = 3;
             }
         }
-#endif
+        else if (!strcmp(argv[0], "--enable-segue")) {
+            /* all flags are enabled */
+            segue_flags = 0x1F1F;
+        }
+        else if (!strncmp(argv[0], "--enable-segue=", 15)) {
+            segue_flags = resolve_segue_flags(argv[0] + 15);
+            if (segue_flags == (uint32)-1)
+                return print_help();
+        }
+#endif /* end of WASM_ENABLE_JIT != 0 */
 #if WASM_ENABLE_LIBC_WASI != 0
         else if (!strncmp(argv[0], "--dir=", 6)) {
             if (argv[0][6] == '\0')
@@ -592,6 +707,13 @@ main(int argc, char *argv[])
                 return print_help();
             ip_addr = argv[0] + 3;
         }
+#endif
+#if WASM_ENABLE_STATIC_PGO != 0
+        else if (!strncmp(argv[0], "--gen-prof-file=", 16)) {
+            if (argv[0][16] == '\0')
+                return print_help();
+            gen_prof_file = argv[0] + 16;
+        }
 #endif
         else if (!strncmp(argv[0], "--version", 9)) {
             uint32 major, minor, patch;
@@ -632,6 +754,7 @@ main(int argc, char *argv[])
 #if WASM_ENABLE_JIT != 0
     init_args.llvm_jit_size_level = llvm_jit_size_level;
     init_args.llvm_jit_opt_level = llvm_jit_opt_level;
+    init_args.segue_flags = segue_flags;
 #endif
 
 #if WASM_ENABLE_DEBUG_INTERP != 0
@@ -754,6 +877,12 @@ main(int argc, char *argv[])
     }
 #endif
 
+#if WASM_ENABLE_STATIC_PGO != 0 && WASM_ENABLE_AOT != 0
+    if (get_package_type(wasm_file_buf, wasm_file_size) == Wasm_Module_AoT
+        && gen_prof_file)
+        dump_pgo_prof_data(wasm_module_inst, gen_prof_file);
+#endif
+
 #if WASM_ENABLE_DEBUG_INTERP != 0
 fail4:
 #endif

+ 0 - 1
product-mini/platforms/windows/CMakeLists.txt

@@ -102,7 +102,6 @@ include (${WAMR_ROOT_DIR}/build-scripts/runtime_lib.cmake)
 add_library(vmlib ${WAMR_RUNTIME_LIB_SOURCE})
 
 #set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWIN32_LEAN_AND_MEAN")
-set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_WINSOCK_DEPRECATED_NO_WARNINGS")
 if (NOT MINGW)
   set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /SAFESEH:NO")
   set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /SAFESEH:NO")

+ 2 - 2
samples/ref-types/src/hello.c

@@ -142,8 +142,8 @@ set_and_cmp(wasm_exec_env_t exec_env, wasm_module_inst_t inst, int32 i,
     wasm_set_externref(exec_env, inst, i, externref);
     local_set_externref(i, externref);
 
-    wasm_get_externref(exec_env, inst, 0, &wasm_externref);
-    if (!local_chk_externref(exec_env, 0, wasm_externref)) {
+    wasm_get_externref(exec_env, inst, i, &wasm_externref);
+    if (!local_chk_externref(exec_env, i, wasm_externref)) {
         printf("#%d, In host language world Wasm Externref 0x%lx Vs. Native "
                "Externref 0x%lx FAILED\n",
                i, wasm_externref, externref);

+ 0 - 9
samples/wasm-c-api/CMakeLists.txt

@@ -87,15 +87,6 @@ endif()
 set(WAMR_ROOT_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
 include (${WAMR_ROOT_DIR}/build-scripts/runtime_lib.cmake)
 
-if (NOT DEFINED SANITIZER)
-  set(SANITIZER "")
-elseif (SANITIZER STREQUAL "ubsan")
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O2 -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=alignment" )
-  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined")
-elseif (NOT (SANITIZER STREQUAL "") )
-  message(SEND_ERROR "Unsupported sanitizer: ${SANITIZER}")
-endif()
-
 add_library(vmlib STATIC ${WAMR_RUNTIME_LIB_SOURCE})
 if (MSVC)
   target_compile_definitions(vmlib PRIVATE WASM_API_EXTERN=)

+ 1 - 1
test-tools/wamr-ide/VSCode-Extension/package.json

@@ -6,7 +6,7 @@
     },
     "displayName": "WAMR-IDE",
     "description": "An Integrated Development Environment for WASM",
-    "version": "1.1.2",
+    "version": "1.2.1",
     "engines": {
         "vscode": "^1.59.0"
     },

+ 24 - 43
test-tools/wamr-ide/VSCode-Extension/src/debugConfigurationProvider.ts

@@ -7,52 +7,33 @@ import * as vscode from 'vscode';
 import * as os from 'os';
 
 export class WasmDebugConfigurationProvider
-    implements vscode.DebugConfigurationProvider
-{
-    /* default port set as 1234 */
-    private port = 1234;
-    private hostPath!: string;
-    private providerPromise: Thenable<vscode.DebugConfiguration> | undefined =
-        undefined;
+    implements vscode.DebugConfigurationProvider {
+    private wasmDebugConfig = {
+        type: 'wamr-debug',
+        name: 'Attach',
+        request: 'attach',
+        stopOnEntry: true,
+        initCommands: os.platform() === 'win32' || os.platform() === 'darwin' ?
+            /* Windows and macOS hosts need a different debug configuration from Linux */
+            ['platform select remote-linux'] :
+            undefined,
+        attachCommands: [
+            /* default port 1234 */
+            'process connect -p wasm connect://127.0.0.1:1234',
+        ]
+    };
 
-    private wasmDebugConfig!: vscode.DebugConfiguration;
+    public resolveDebugConfiguration(
+        _: vscode.WorkspaceFolder | undefined,
+        debugConfiguration: vscode.DebugConfiguration,
+    ): vscode.ProviderResult<vscode.DebugConfiguration> {
 
-    public resolveDebugConfiguration():
-        | Thenable<vscode.DebugConfiguration>
-        | undefined {
-        if (!this.providerPromise) {
-            this.providerPromise = Promise.resolve(this.wasmDebugConfig);
-            return this.providerPromise;
-        }
-        return this.providerPromise;
-    }
+        this.wasmDebugConfig = {
+            ...this.wasmDebugConfig,
+            ...debugConfiguration
+        };
 
-    public setDebugConfig(hostPath: string, port: number): void {
-        this.port = port;
-        this.hostPath = hostPath;
-        /* linux and windows has different debug configuration */
-        if (os.platform() === 'win32' || os.platform() === 'darwin') {
-            this.wasmDebugConfig = {
-                type: 'wamr-debug',
-                name: 'Attach',
-                request: 'attach',
-                ['stopOnEntry']: true,
-                ['initCommands']: ['platform select remote-linux'],
-                ['attachCommands']: [
-                    'process connect -p wasm connect://127.0.0.1:' + port + '',
-                ],
-            };
-        } else if (os.platform() === 'linux') {
-            this.wasmDebugConfig = {
-                type: 'wamr-debug',
-                name: 'Attach',
-                request: 'attach',
-                ['stopOnEntry']: true,
-                ['attachCommands']: [
-                    'process connect -p wasm connect://127.0.0.1:' + port + '',
-                ],
-            };
-        }
+        return this.wasmDebugConfig;
     }
 
     public getDebugConfig(): vscode.DebugConfiguration {

+ 0 - 1
test-tools/wamr-ide/VSCode-Extension/src/extension.ts

@@ -171,7 +171,6 @@ export async function activate(context: vscode.ExtensionContext) {
 
     /* register debug configuration */
     wasmDebugConfigProvider = new WasmDebugConfigurationProvider();
-    wasmDebugConfigProvider.setDebugConfig(currentPrjDir, 1234);
 
     vscode.debug.registerDebugConfigurationProvider(
         'wamr-debug',

+ 62 - 0
tests/benchmarks/README.md

@@ -0,0 +1,62 @@
+# WAMR test benchmarks
+
+This folder contains test benchmarks for wamr.
+
+## Build and Run
+
+Refer to the `README.md` under each folder for how to build and run the benchmark.
+
+## Install `llvm-profdata`
+
+The tool `llvm-profdata` is used when running the `test_pgo.sh` script under the benchmark folder. There are two ways to install it:
+
+1. Refer to https://apt.llvm.org/, e.g. on Ubuntu 20.04, add the lines below to /etc/apt/sources.list:
+
+```bash
+deb http://apt.llvm.org/focal/ llvm-toolchain-focal main
+deb-src http://apt.llvm.org/focal/ llvm-toolchain-focal main
+# 15
+deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main
+deb-src http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main
+```
+
+Then run `sudo apt update` and `sudo apt install llvm`. After installing:
+
+```bash
+cd /usr/bin
+sudo ln -s llvm-profdata-15 llvm-profdata
+```
+
+2. Build manually
+
+```bash
+git clone --depth 1 --branch release/15.x https://github.com/llvm/llvm-project.git
+cd llvm-project
+mkdir build && cd build
+cmake ../llvm \
+    -DCMAKE_BUILD_TYPE:STRING="Release" \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+    -DLLVM_APPEND_VC_REV:BOOL=ON \
+    -DLLVM_BUILD_EXAMPLES:BOOL=OFF \
+    -DLLVM_BUILD_LLVM_DYLIB:BOOL=OFF \
+    -DLLVM_BUILD_TESTS:BOOL=OFF \
+    -DLLVM_CCACHE_BUILD:BOOL=ON \
+    -DLLVM_ENABLE_BINDINGS:BOOL=OFF \
+    -DLLVM_ENABLE_IDE:BOOL=OFF \
+    -DLLVM_ENABLE_LIBEDIT=OFF \
+    -DLLVM_ENABLE_TERMINFO:BOOL=OFF \
+    -DLLVM_ENABLE_ZLIB:BOOL=ON \
+    -DLLVM_INCLUDE_BENCHMARKS:BOOL=OFF \
+    -DLLVM_INCLUDE_DOCS:BOOL=OFF \
+    -DLLVM_INCLUDE_EXAMPLES:BOOL=OFF \
+    -DLLVM_INCLUDE_UTILS:BOOL=OFF \
+    -DLLVM_INCLUDE_TESTS:BOOL=OFF \
+    -DLLVM_BUILD_TESTS:BOOL=OFF \
+    -DLLVM_OPTIMIZED_TABLEGEN:BOOL=ON \
+    -DLLVM_ENABLE_LIBXML2:BOOL=OFF \
+    -DLLVM_TARGETS_TO_BUILD:STRING="X86" \
+    -DLLVM_INCLUDE_TOOLS:BOOL=ON \
+    -G'Ninja'
+ninja -j 8
+# tool `llvm-profdata` is generated under this folder.
+```

+ 2 - 0
tests/benchmarks/coremark/README.md

@@ -17,3 +17,5 @@ And then run `./build.sh` to build the source code, file `coremark.exe`, `corema
 # Running
 
 Run `./run.sh` to test the benchmark, the native mode, iwasm aot mode and iwasm interpreter mode will be tested respectively.
+
+Run `./test_pgo.sh` to test the benchmark with AOT static PGO (Profile-Guided Optimization) enabled. Please see [here](../README.md#install-llvm-profdata) for how to install the tool `llvm-profdata`, and build `iwasm` with `cmake -DWAMR_BUILD_STATIC_PGO=1`.

+ 7 - 0
tests/benchmarks/coremark/build.sh

@@ -3,6 +3,8 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+PLATFORM=$(uname -s | tr A-Z a-z)
+
 WAMRC="../../../wamr-compiler/build/wamrc"
 
 if [ ! -d coremark ]; then
@@ -32,4 +34,9 @@ cd ..
 echo "Compile coremark.wasm to coremark.aot .."
 ${WAMRC} -o coremark.aot coremark.wasm
 
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Compile coremark.wasm to coremark_segue.aot .."
+    ${WAMRC} --enable-segue -o coremark_segue.aot coremark.wasm
+fi
+
 echo "Done"

+ 10 - 3
tests/benchmarks/coremark/run.sh

@@ -3,14 +3,21 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-IWASM="../../../product-mini/platforms/linux/build/iwasm"
+PLATFORM=$(uname -s | tr A-Z a-z)
+
+IWASM="../../../product-mini/platforms/${PLATFORM}/build/iwasm"
 WAMRC="../../../wamr-compiler/build/wamrc"
 
 echo "Run coremark with native .."
 ./coremark.exe
 
-echo "Run coremark with iwasm mode .."
+echo "Run coremark with iwasm aot mode .."
 ${IWASM} coremark.aot
 
-echo "Run coremakr with iwasm interpreter .."
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Run coremark with iwasm aot-segue mode .."
+    ${IWASM} coremark_segue.aot
+fi
+
+echo "Run coremark with iwasm interpreter mode .."
 ${IWASM} coremark.wasm

+ 50 - 0
tests/benchmarks/coremark/test_pgo.sh

@@ -0,0 +1,50 @@
+#!/bin/sh
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+
+IWASM="../../../product-mini/platforms/${PLATFORM}/build/iwasm"
+WAMRC="../../../wamr-compiler/build/wamrc"
+
+if [ ! -e "coremark.wasm" ]; then
+    echo "coremark.wasm doesn't exist, please run build.sh first"
+    exit
+fi
+
+echo ""
+echo "Compile coremark.wasm to coremark.aot .."
+${WAMRC} -o coremark.aot coremark.wasm
+
+echo ""
+echo "Compile coremark.wasm to coremark_pgo.aot .."
+${WAMRC} --enable-llvm-pgo -o coremark_pgo.aot coremark.wasm
+
+echo ""
+echo "Run coremark_pgo.aot to generate the raw profile data .."
+${IWASM} --gen-prof-file=coremark.profraw coremark_pgo.aot
+
+echo ""
+echo "Merge the raw profile data to coremark.profdata .."
+rm -f coremark.profdata && llvm-profdata merge -output=coremark.profdata coremark.profraw
+
+echo ""
+echo "Compile coremark.wasm to coremark_opt.aot with the profile data .."
+${WAMRC} --use-prof-file=coremark.profdata -o coremark_opt.aot coremark.wasm
+
+echo ""
+echo "Run the coremark native"
+./coremark.exe
+
+echo ""
+echo "Run the original aot file coremark.aot"
+${IWASM} coremark.aot
+
+echo ""
+echo "Run the PGO optimized aot file coremark_opt.aot"
+${IWASM} coremark_opt.aot
+
+# Show the profile data:
+# llvm-profdata show --all-functions --detailed-summary --binary-ids --counts \
+# --hot-func-list --memop-sizes --show-prof-sym-list coremark.profraw

+ 7 - 0
tests/benchmarks/dhrystone/LICENSE

@@ -0,0 +1,7 @@
+Dhrystone
+------------------------------------------------------------------------------
+There is no explicit license defined. The benchmark was originally
+written in Ada by Reinhold P. Weicker and translated to C by Rick Richardson.
+
+The source was obtained from the following site:
+https://fossies.org/linux/privat/old/dhrystone-2.1.tar.gz

+ 24 - 0
tests/benchmarks/dhrystone/build.sh

@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+
+WAMRC_CMD=$PWD/../../../wamr-compiler/build/wamrc
+
+echo "===> compile dhrystone src to dhrystone_native"
+gcc -O3 -o dhrystone_native src/dhry_1.c src/dhry_2.c -I include
+
+echo "===> compile dhrystone src to dhrystone.wasm"
+/opt/wasi-sdk/bin/clang -O3 \
+    -o dhrystone.wasm src/dhry_1.c src/dhry_2.c -I include \
+    -Wl,--export=__heap_base -Wl,--export=__data_end
+
+echo "===> compile dhrystone.wasm to dhrystone.aot"
+${WAMRC_CMD} -o dhrystone.aot dhrystone.wasm
+
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "===> compile dhrystone.wasm to dhrystone_segue.aot"
+    ${WAMRC_CMD} --enable-segue -o dhrystone_segue.aot dhrystone.wasm
+fi

+ 306 - 0
tests/benchmarks/dhrystone/include/dhry.h

@@ -0,0 +1,306 @@
+/*
+ **************************************************************************
+ *                       DHRYSTONE 2.1 BENCHMARK PC VERSION
+ **************************************************************************
+ *
+ *                   "DHRYSTONE" Benchmark Program
+ *                   -----------------------------
+ *
+ *  Version:    C, Version 2.1
+ *
+ *  File:       dhry.h (part 1 of 3)
+ *
+ *  Date:       May 25, 1988
+ *
+ *  Author:     Reinhold P. Weicker
+ *                      Siemens AG, AUT E 51
+ *                      Postfach 3220
+ *                      8520 Erlangen
+ *                      Germany (West)
+ *                              Phone:  [+49]-9131-7-20330
+ *                                      (8-17 Central European Time)
+ *                              Usenet: ..!mcsun!unido!estevax!weicker
+ *
+ *            Original Version (in Ada) published in
+ *            "Communications of the ACM" vol. 27., no. 10 (Oct. 1984),
+ *            pp. 1013 - 1030, together with the statistics
+ *            on which the distribution of statements etc. is based.
+ *
+ *            In this C version, the following C library functions are used:
+ *            - strcpy, strcmp (inside the measurement loop)
+ *            - printf, scanf (outside the measurement loop)
+ *            In addition, Berkeley UNIX system calls "times ()" or "time ()"
+ *            are used for execution time measurement. For measurements
+ *            on other systems, these calls have to be changed.
+ *
+ *  Collection of Results:
+ *              Reinhold Weicker (address see above) and
+ *
+ *              Rick Richardson
+ *              PC Research. Inc.
+ *              94 Apple Orchard Drive
+ *              Tinton Falls, NJ 07724
+ *                      Phone:  (201) 389-8963 (9-17 EST)
+ *                      Usenet: ...!uunet!pcrat!rick
+ *
+ *      Please send results to Rick Richardson and/or Reinhold Weicker.
+ *      Complete information should be given on hardware and software used.
+ *      Hardware information includes: Machine type, CPU, type and size
+ *      of caches; for microprocessors: clock frequency, memory speed
+ *      (number of wait states).
+ *      Software information includes: Compiler (and runtime library)
+ *      manufacturer and version, compilation switches, OS version.
+ *      The Operating System version may give an indication about the
+ *      compiler; Dhrystone itself performs no OS calls in the measurement
+ *      loop.
+ *
+ *      The complete output generated by the program should be mailed
+ *      such that at least some checks for correctness can be made.
+ *
+ **************************************************************************
+ *
+ *  This version has changes made by Roy Longbottom to conform to a common
+ *  format for a series of standard benchmarks for PCs:
+ *
+ *  Running time greater than 5 seconds due to inaccuracy of the PC clock.
+ *
+ *  Automatic adjustment of run time, no manually inserted parameters.
+ *
+ *  Initial display of calibration times to confirm linearity.
+ *
+ *  Display of results within one screen (or at a slow speed as the test
+ *  progresses) so that it can be seen to have run successfully.
+ *
+ *  Facilities to type in details of system used etc.
+ *
+ *  All results and details appended to a results file.
+ *
+ *
+ *  Roy Longbottom
+ *  101323.2241@compuserve.com
+ *
+ **************************************************************************
+ *
+ *  For details of history, changes, other defines, benchmark construction
+ *  statistics see official versions from ftp.nosc.mil/pub/aburto where
+ *  the latest table of results (dhry.tbl) are available. See also
+ *  netlib@ornl.gov
+ *
+ **************************************************************************
+ *
+ * Defines:     The following "Defines" are possible:
+ *              -DREG=register          (default: Not defined)
+ *                      As an approximation to what an average C programmer
+ *                      might do, the "register" storage class is applied
+ *                      (if enabled by -DREG=register)
+ *                      - for local variables, if they are used (dynamically)
+ *                        five or more times
+ *                      - for parameters if they are used (dynamically)
+ *                        six or more times
+ *                      Note that an optimal "register" strategy is
+ *                      compiler-dependent, and that "register" declarations
+ *                      do not necessarily lead to faster execution.
+ *              -DNOSTRUCTASSIGN        (default: Not defined)
+ *                      Define if the C compiler does not support
+ *                      assignment of structures.
+ *              -DNOENUMS               (default: Not defined)
+ *                      Define if the C compiler does not support
+ *                      enumeration types.
+ ***************************************************************************
+ *
+ *  Compilation model and measurement (IMPORTANT):
+ *
+ *  This C version of Dhrystone consists of three files:
+ *  - dhry.h (this file, containing global definitions and comments)
+ *  - dhry_1.c (containing the code corresponding to Ada package Pack_1)
+ *  - dhry_2.c (containing the code corresponding to Ada package Pack_2)
+ *
+ *  The following "ground rules" apply for measurements:
+ *  - Separate compilation
+ *  - No procedure merging
+ *  - Otherwise, compiler optimizations are allowed but should be indicated
+ *  - Default results are those without register declarations
+ *  See the companion paper "Rationale for Dhrystone Version 2" for a more
+ *  detailed discussion of these ground rules.
+ *
+ *  For 16-Bit processors (e.g. 80186, 80286), times for all compilation
+ *  models ("small", "medium", "large" etc.) should be given if possible,
+ *  together with a definition of these models for the compiler system used.
+ *
+ **************************************************************************
+ *                Examples of Pentium Results
+ *
+ * Dhrystone Benchmark  Version 2.1 (Language: C)
+ *
+ * Month run            4/1996
+ * PC model             Escom
+ * CPU                  Pentium
+ * Clock MHz            100
+ * Cache                256K
+ * Options              Neptune chipset
+ * OS/DOS               Windows 95
+ * Compiler             Watcom C/ C++ 10.5 Win386
+ * OptLevel             -otexan -zp8 -fp5 -5r
+ * Run by               Roy Longbottom
+ * From                 UK
+ * Mail                 101323.2241@compuserve.com
+ *
+ * Final values         (* implementation-dependent):
+ *
+ * Int_Glob:      O.K.  5
+ * Bool_Glob:     O.K.  1
+ * Ch_1_Glob:     O.K.  A
+ * Ch_2_Glob:     O.K.  B
+ * Arr_1_Glob[8]: O.K.  7
+ * Arr_2_Glob8/7: O.K.     1600010
+ * Ptr_Glob->
+ *   Ptr_Comp:       *  98008
+ *   Discr:       O.K.  0
+ *   Enum_Comp:   O.K.  2
+ *   Int_Comp:    O.K.  17
+ *   Str_Comp:    O.K.  DHRYSTONE PROGRAM, SOME STRING
+ * Next_Ptr_Glob->
+ *   Ptr_Comp:       *  98008 same as above
+ *   Discr:       O.K.  0
+ *   Enum_Comp:   O.K.  1
+ *   Int_Comp:    O.K.  18
+ *   Str_Comp:    O.K.  DHRYSTONE PROGRAM, SOME STRING
+ * Int_1_Loc:     O.K.  5
+ * Int_2_Loc:     O.K.  13
+ * Int_3_Loc:     O.K.  7
+ * Enum_Loc:      O.K.  1
+ * Str_1_Loc:     O.K.  DHRYSTONE PROGRAM, 1'ST STRING
+ * Str_2_Loc:     O.K.  DHRYSTONE PROGRAM, 2'ND STRING
+ *
+ * Register option      Selected.
+ *
+ * Microseconds 1 loop:          4.53
+ * Dhrystones / second:      220690
+ * VAX MIPS rating:            125.61
+ *
+ *
+ * Dhrystone Benchmark  Version 2.1 (Language: C)
+ *
+ * Month run            4/1996
+ * PC model             Escom
+ * CPU                  Pentium
+ * Clock MHz            100
+ * Cache                256K
+ * Options              Neptune chipset
+ * OS/DOS               Windows 95
+ * Compiler             Watcom C/ C++ 10.5 Win386
+ * OptLevel                 No optimisation
+ * Run by               Roy Longbottom
+ * From                 UK
+ * Mail                 101323.2241@compuserve.com
+ *
+ * Final values         (* implementation-dependent):
+ *
+ * Int_Glob:      O.K.  5
+ * Bool_Glob:     O.K.  1
+ * Ch_1_Glob:     O.K.  A
+ * Ch_2_Glob:     O.K.  B
+ * Arr_1_Glob[8]: O.K.  7
+ * Arr_2_Glob8/7: O.K.      320010
+ * Ptr_Glob->
+ *   Ptr_Comp:       *  98004
+ *   Discr:       O.K.  0
+ *   Enum_Comp:   O.K.  2
+ *   Int_Comp:    O.K.  17
+ *   Str_Comp:    O.K.  DHRYSTONE PROGRAM, SOME STRING
+ * Next_Ptr_Glob->
+ *   Ptr_Comp:       *  98004 same as above
+ *   Discr:       O.K.  0
+ *   Enum_Comp:   O.K.  1
+ *   Int_Comp:    O.K.  18
+ *   Str_Comp:    O.K.  DHRYSTONE PROGRAM, SOME STRING
+ * Int_1_Loc:     O.K.  5
+ * Int_2_Loc:     O.K.  13
+ * Int_3_Loc:     O.K.  7
+ * Enum_Loc:      O.K.  1
+ * Str_1_Loc:     O.K.  DHRYSTONE PROGRAM, 1'ST STRING
+ * Str_2_Loc:     O.K.  DHRYSTONE PROGRAM, 2'ND STRING
+ *
+ * Register option      Not selected.
+ *
+ * Microseconds 1 loop:         20.06
+ * Dhrystones / second:       49844
+ * VAX MIPS rating:             28.37
+ *
+ **************************************************************************
+ */
+
+/* Compiler and system dependent definitions: */
+
+#ifndef TIME
+#define TIMES
+#endif
+/* Use times(2) time function unless    */
+/* explicitly defined otherwise         */
+
+#ifdef TIMES
+/* #include <sys/types.h>
+   #include <sys/times.h> */
+/* for "times" */
+#endif
+
+#define Mic_secs_Per_Second 1000000.0
+/* Berkeley UNIX C returns process times in seconds/HZ */
+
+#ifdef NOSTRUCTASSIGN
+#define structassign(d, s) memcpy(&(d), &(s), sizeof(d))
+#else
+#define structassign(d, s) d = s
+#endif
+
+#ifdef NOENUM
+#define Ident_1 0
+#define Ident_2 1
+#define Ident_3 2
+#define Ident_4 3
+#define Ident_5 4
+typedef int Enumeration;
+#else
+typedef enum { Ident_1, Ident_2, Ident_3, Ident_4, Ident_5 } Enumeration;
+#endif
+/* for boolean and enumeration types in Ada, Pascal */
+
+/* General definitions: */
+
+#include <stdio.h>
+#include <string.h>
+
+/* for strcpy, strcmp */
+
+#define Null 0
+/* Value of a Null pointer */
+#define true 1
+#define false 0
+
+typedef int One_Thirty;
+typedef int One_Fifty;
+typedef char Capital_Letter;
+typedef int Boolean;
+typedef char Str_30[31];
+typedef int Arr_1_Dim[50];
+typedef int Arr_2_Dim[50][50];
+
+typedef struct record {
+    struct record *Ptr_Comp;
+    Enumeration Discr;
+    union {
+        struct {
+            Enumeration Enum_Comp;
+            int Int_Comp;
+            char Str_Comp[31];
+        } var_1;
+        struct {
+            Enumeration E_Comp_2;
+            char Str_2_Comp[31];
+        } var_2;
+        struct {
+            char Ch_1_Comp;
+            char Ch_2_Comp;
+        } var_3;
+    } variant;
+} Rec_Type, *Rec_Pointer;

+ 19 - 0
tests/benchmarks/dhrystone/run.sh

@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+
+readonly IWASM_CMD="../../../product-mini/platforms/${PLATFORM}/build/iwasm"
+
+echo "============> run dhrystone native"
+./dhrystone_native
+
+echo "============> run dhrystone.aot"
+${IWASM_CMD} dhrystone.aot
+
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "============> run dhrystone_segue.aot"
+    ${IWASM_CMD} dhrystone_segue.aot
+fi

+ 485 - 0
tests/benchmarks/dhrystone/src/dhry_1.c

@@ -0,0 +1,485 @@
+/*
+ *************************************************************************
+ *
+ *                   "DHRYSTONE" Benchmark Program
+ *                   -----------------------------
+ *
+ *  Version:    C, Version 2.1
+ *
+ *  File:       dhry_1.c (part 2 of 3)
+ *
+ *  Date:       May 25, 1988
+ *
+ *  Author:     Reinhold P. Weicker
+ *
+ *************************************************************************
+ */
+
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "dhry.h"
+
+/* Global Variables: */
+
+Rec_Pointer Ptr_Glob, Next_Ptr_Glob;
+int Int_Glob;
+Boolean Bool_Glob;
+char Ch_1_Glob, Ch_2_Glob;
+int Arr_1_Glob[50];
+int Arr_2_Glob[50][50];
+
+Enumeration
+Func_1(Capital_Letter Ch_1_Par_Val, Capital_Letter Ch_2_Par_Val);
+/*
+forward declaration necessary since Enumeration may not simply be int
+*/
+
+#ifndef ROPT
+#define REG
+/* REG becomes defined as empty */
+/* i.e. no register variables   */
+#else
+#define REG register
+#endif
+
+void
+Proc_1(REG Rec_Pointer Ptr_Val_Par);
+void
+Proc_2(One_Fifty *Int_Par_Ref);
+void
+Proc_3(Rec_Pointer *Ptr_Ref_Par);
+void
+Proc_4();
+void
+Proc_5();
+void
+Proc_6(Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par);
+void
+Proc_7(One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val,
+       One_Fifty *Int_Par_Ref);
+void
+Proc_8(Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref, int Int_1_Par_Val,
+       int Int_2_Par_Val);
+
+Boolean
+Func_2(Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref);
+
+/* variables for time measurement: */
+
+#define Too_Small_Time 2
+/* Measurements should last at least 2 seconds */
+
+#define BILLION 1000000000L
+#define MILLION 1000000
+struct timespec Begin_Time, End_Time;
+double User_Time;
+
+double Microseconds, Dhrystones_Per_Second, Vax_Mips;
+
+/* end of variables for time measurement */
+
+/*
+ * Benchmark driver, corresponds to procedures Main and Proc_0 in the Ada
+ * version.
+ *
+ * Allocates the two global records, then runs up to 10 timed rounds. Each
+ * round doubles Number_Of_Runs (the first round executes 10000 iterations);
+ * a round shorter than 0.1 s additionally multiplies the count by 5, and a
+ * round longer than 5 s ends the calibration. Afterwards the final values
+ * of the globals and locals are printed together with the timing results
+ * of the last round.
+ *
+ * argc and argv are not used.
+ *
+ * Returns 0 on success and 1 when the records cannot be allocated.
+ */
+int
+main(int argc, char *argv[])
+/*****/
+
+/* main program, corresponds to procedures        */
+/* Main and Proc_0 in the Ada version             */
+{
+    One_Fifty Int_1_Loc;
+    REG One_Fifty Int_2_Loc;
+    One_Fifty Int_3_Loc;
+    REG char Ch_Index;
+    Enumeration Enum_Loc;
+    Str_30 Str_1_Loc;
+    Str_30 Str_2_Loc;
+    REG int Run_Index;
+    REG int Number_Of_Runs;
+    int count = 10;
+
+    /***********************************************************************
+     *         Change for compiler and optimisation used                   *
+     ***********************************************************************/
+
+    Next_Ptr_Glob = (Rec_Pointer)malloc(sizeof(Rec_Type));
+    Ptr_Glob = (Rec_Pointer)malloc(sizeof(Rec_Type));
+    if (Next_Ptr_Glob == NULL || Ptr_Glob == NULL) {
+        /* Both records are dereferenced below, so give up early. */
+        printf("Failed to allocate the Dhrystone records\n");
+        free(Next_Ptr_Glob);
+        free(Ptr_Glob);
+        return 1;
+    }
+
+    Ptr_Glob->Ptr_Comp = Next_Ptr_Glob;
+    Ptr_Glob->Discr = Ident_1;
+    Ptr_Glob->variant.var_1.Enum_Comp = Ident_3;
+    Ptr_Glob->variant.var_1.Int_Comp = 40;
+    strcpy(Ptr_Glob->variant.var_1.Str_Comp, "DHRYSTONE PROGRAM, SOME STRING");
+    strcpy(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING");
+
+    Arr_2_Glob[8][7] = 10;
+    /* Was missing in published program. Without this statement,   */
+    /* Arr_2_Glob [8][7] would have an undefined value.            */
+    /* Warning: With 16-Bit processors and Number_Of_Runs > 32000, */
+    /* overflow may occur for this array element.                  */
+
+    printf("\n");
+    printf("Dhrystone Benchmark, Version 2.1 (Language: C or C++)\n");
+    printf("\n");
+
+    Number_Of_Runs = 5000;
+
+    do {
+
+        Number_Of_Runs = Number_Of_Runs * 2;
+        count = count - 1;
+        Arr_2_Glob[8][7] = 10;
+
+        /***************/
+        /* Start timer */
+        /***************/
+
+        clock_gettime(CLOCK_MONOTONIC, &Begin_Time);
+
+        for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index) {
+
+            Proc_5();
+            Proc_4();
+            /* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */
+            Int_1_Loc = 2;
+            Int_2_Loc = 3;
+            strcpy(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
+            Enum_Loc = Ident_2;
+            Bool_Glob = !Func_2(Str_1_Loc, Str_2_Loc);
+            /* Bool_Glob == 1 */
+            while (Int_1_Loc < Int_2_Loc) /* loop body executed once */
+            {
+                Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc;
+                /* Int_3_Loc == 7 */
+                Proc_7(Int_1_Loc, Int_2_Loc, &Int_3_Loc);
+                /* Int_3_Loc == 7 */
+                Int_1_Loc += 1;
+            } /* while */
+              /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+            Proc_8(Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc);
+            /* Int_Glob == 5 */
+            Proc_1(Ptr_Glob);
+            for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index)
+            /* loop body executed twice */
+            {
+                if (Enum_Loc == Func_1(Ch_Index, 'C'))
+                /* then, not executed */
+                {
+                    Proc_6(Ident_1, &Enum_Loc);
+                    strcpy(Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING");
+                    Int_2_Loc = Run_Index;
+                    Int_Glob = Run_Index;
+                }
+            }
+            /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+            Int_2_Loc = Int_2_Loc * Int_1_Loc;
+            Int_1_Loc = Int_2_Loc / Int_3_Loc;
+            Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc;
+            /* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */
+            Proc_2(&Int_1_Loc);
+            /* Int_1_Loc == 5 */
+
+        } /* loop "for Run_Index" */
+
+        /**************/
+        /* Stop timer */
+        /**************/
+
+        clock_gettime(CLOCK_MONOTONIC, &End_Time);
+
+        /* Elapsed time in microseconds first, then converted to seconds. */
+        User_Time = (End_Time.tv_sec - Begin_Time.tv_sec) * MILLION
+                    + (End_Time.tv_nsec - Begin_Time.tv_nsec) / 1000;
+        User_Time = User_Time / MILLION; /* convert to seconds */
+
+        printf("%ld runs %lf seconds \n", (long)Number_Of_Runs, User_Time);
+        if (User_Time > 5.0) {
+            count = 0;
+        }
+        else {
+            if (User_Time < 0.1) {
+                Number_Of_Runs = Number_Of_Runs * 5;
+            }
+        }
+    } /* calibrate/run do while */
+    while (count > 0);
+
+    printf("\n");
+    printf("Final values (* implementation-dependent):\n");
+    printf("\n");
+    printf("Int_Glob:      ");
+    if (Int_Glob == 5)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d  ", Int_Glob);
+
+    printf("Bool_Glob:     ");
+    if (Bool_Glob == 1)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d\n", Bool_Glob);
+
+    printf("Ch_1_Glob:     ");
+    if (Ch_1_Glob == 'A')
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%c  ", Ch_1_Glob);
+
+    printf("Ch_2_Glob:     ");
+    if (Ch_2_Glob == 'B')
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%c\n", Ch_2_Glob);
+
+    printf("Arr_1_Glob[8]: ");
+    if (Arr_1_Glob[8] == 7)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d  ", Arr_1_Glob[8]);
+
+    printf("Arr_2_Glob8/7: ");
+    if (Arr_2_Glob[8][7] == Number_Of_Runs + 10)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%10d\n", Arr_2_Glob[8][7]);
+
+    printf("Ptr_Glob->            ");
+    /* %p expects a pointer to void, hence the explicit cast. */
+    printf("  Ptr_Comp:       *    %p\n", (void *)Ptr_Glob->Ptr_Comp);
+
+    printf("  Discr:       ");
+    if (Ptr_Glob->Discr == 0)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d  ", Ptr_Glob->Discr);
+
+    printf("Enum_Comp:     ");
+    if (Ptr_Glob->variant.var_1.Enum_Comp == 2)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d\n", Ptr_Glob->variant.var_1.Enum_Comp);
+
+    printf("  Int_Comp:    ");
+    if (Ptr_Glob->variant.var_1.Int_Comp == 17)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d ", Ptr_Glob->variant.var_1.Int_Comp);
+
+    printf("Str_Comp:      ");
+    if (strcmp(Ptr_Glob->variant.var_1.Str_Comp,
+               "DHRYSTONE PROGRAM, SOME STRING")
+        == 0)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%s\n", Ptr_Glob->variant.var_1.Str_Comp);
+
+    printf("Next_Ptr_Glob->       ");
+    printf("  Ptr_Comp:       *    %p", (void *)Next_Ptr_Glob->Ptr_Comp);
+    printf(" same as above\n");
+
+    printf("  Discr:       ");
+    if (Next_Ptr_Glob->Discr == 0)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d  ", Next_Ptr_Glob->Discr);
+
+    printf("Enum_Comp:     ");
+    if (Next_Ptr_Glob->variant.var_1.Enum_Comp == 1)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
+
+    printf("  Int_Comp:    ");
+    if (Next_Ptr_Glob->variant.var_1.Int_Comp == 18)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d ", Next_Ptr_Glob->variant.var_1.Int_Comp);
+
+    printf("Str_Comp:      ");
+    if (strcmp(Next_Ptr_Glob->variant.var_1.Str_Comp,
+               "DHRYSTONE PROGRAM, SOME STRING")
+        == 0)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%s\n", Next_Ptr_Glob->variant.var_1.Str_Comp);
+
+    printf("Int_1_Loc:     ");
+    if (Int_1_Loc == 5)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d  ", Int_1_Loc);
+
+    printf("Int_2_Loc:     ");
+    if (Int_2_Loc == 13)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d\n", Int_2_Loc);
+
+    printf("Int_3_Loc:     ");
+    if (Int_3_Loc == 7)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d  ", Int_3_Loc);
+
+    printf("Enum_Loc:      ");
+    if (Enum_Loc == 1)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d\n", Enum_Loc);
+
+    printf("Str_1_Loc:                             ");
+    if (strcmp(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING") == 0)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%s\n", Str_1_Loc);
+
+    printf("Str_2_Loc:                             ");
+    if (strcmp(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING") == 0)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%s\n", Str_2_Loc);
+
+    printf("\n");
+
+    if (User_Time < Too_Small_Time) {
+        printf("Measured time too small to obtain meaningful results\n");
+        printf("Please increase number of runs\n");
+        printf("\n");
+    }
+    else {
+        Microseconds = User_Time * Mic_secs_Per_Second / (double)Number_Of_Runs;
+        Dhrystones_Per_Second = (double)Number_Of_Runs / User_Time;
+        Vax_Mips = Dhrystones_Per_Second / 1757.0;
+
+        printf("Microseconds for one run through Dhrystone: ");
+        printf("%lf \n", Microseconds);
+        printf("Dhrystones per Second:                      ");
+        printf("%lf \n", Dhrystones_Per_Second);
+        printf("VAX  MIPS rating =                          ");
+        printf("%lf \n", Vax_Mips);
+        printf("\n");
+    }
+
+    free(Next_Ptr_Glob);
+    free(Ptr_Glob);
+    /* A completed benchmark run is a success. */
+    return 0;
+}
+
+/*
+ * Proc_1: record manipulation through pointers.
+ *
+ * Copies *Ptr_Glob into the record that Ptr_Val_Par->Ptr_Comp points to,
+ * sets Int_Comp to 5 in both records and lets Proc_3 update the copy's
+ * Ptr_Comp. When the copy's Discr is Ident_1, its Int_Comp, Enum_Comp and
+ * Ptr_Comp are recomputed via Proc_6 and Proc_7 (6 + 2 + 10 == 18 ends up
+ * in Int_Comp); otherwise the copy is assigned back into *Ptr_Val_Par.
+ */
+void
+Proc_1(REG Rec_Pointer Ptr_Val_Par)
+/******************/
+
+/* executed once */
+{
+    REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp;
+    /* == Next_Ptr_Glob (main passes Ptr_Glob, whose Ptr_Comp is that) */
+    /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp,    */
+    /* corresponds to "rename" in Ada, "with" in Pascal           */
+
+    structassign(*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob);
+    Ptr_Val_Par->variant.var_1.Int_Comp = 5;
+    Next_Record->variant.var_1.Int_Comp = Ptr_Val_Par->variant.var_1.Int_Comp;
+    Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp;
+    Proc_3(&Next_Record->Ptr_Comp);
+    /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp
+                        == Ptr_Glob->Ptr_Comp */
+    if (Next_Record->Discr == Ident_1)
+    /* then, executed */
+    {
+        Next_Record->variant.var_1.Int_Comp = 6;
+        Proc_6(Ptr_Val_Par->variant.var_1.Enum_Comp,
+               &Next_Record->variant.var_1.Enum_Comp);
+        Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp;
+        /* Reads and overwrites the same Int_Comp field. */
+        Proc_7(Next_Record->variant.var_1.Int_Comp, 10,
+               &Next_Record->variant.var_1.Int_Comp);
+    }
+    else { /* not executed */
+        structassign(*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp);
+    }
+} /* Proc_1 */
+
+/*
+ * Proc_2: updates *Int_Par_Ref from the globals.
+ *
+ * While Ch_1_Glob is 'A', stores (*Int_Par_Ref + 10 - 1) - Int_Glob through
+ * Int_Par_Ref and leaves the loop after one pass. The loop only ends once
+ * the if branch has run, i.e. it keeps spinning while Ch_1_Glob is not 'A'.
+ */
+void
+Proc_2(One_Fifty *Int_Par_Ref)
+/******************/
+/* executed once */
+/* *Int_Par_Ref == 1, becomes 5 (1 + 10 - 1 - Int_Glob, Int_Glob == 5) */
+
+{
+    One_Fifty Int_Loc;
+    /* Start from a value other than Ident_1 so that the loop test below */
+    /* never reads an indeterminate value when the if branch is skipped. */
+    Enumeration Enum_Loc = Ident_2;
+
+    Int_Loc = *Int_Par_Ref + 10;
+    do /* executed once */
+        if (Ch_1_Glob == 'A')
+        /* then, executed */
+        {
+            Int_Loc -= 1;
+            *Int_Par_Ref = Int_Loc - Int_Glob;
+            Enum_Loc = Ident_1;
+        }                        /* if */
+    while (Enum_Loc != Ident_1); /* true */
+} /* Proc_2 */
+
+/*
+ * Proc_3: when Ptr_Glob is not null, stores Ptr_Glob->Ptr_Comp through
+ * Ptr_Ref_Par. It then always calls Proc_7 to write 10 + 2 + Int_Glob into
+ * Ptr_Glob->variant.var_1.Int_Comp (that call dereferences Ptr_Glob
+ * regardless of the null test).
+ */
+void
+Proc_3(Rec_Pointer *Ptr_Ref_Par)
+/******************/
+/* executed once */
+/* *Ptr_Ref_Par becomes Ptr_Glob->Ptr_Comp */
+
+{
+    if (Ptr_Glob != Null)
+        /* then, executed */
+        *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp;
+    Proc_7(10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp);
+} /* Proc_3 */
+
+/*
+ * Proc_4: ORs the result of (Ch_1_Glob == 'A') into Bool_Glob and sets
+ * Ch_2_Glob to 'B'.
+ */
+void
+Proc_4() /* without parameters */
+/*******/
+/* executed once */
+{
+    Boolean Bool_Loc;
+
+    Bool_Loc = Ch_1_Glob == 'A';
+    /* Bitwise OR of two 0/1 values, as in the original program. */
+    Bool_Glob = Bool_Loc | Bool_Glob;
+    Ch_2_Glob = 'B';
+} /* Proc_4 */
+
+/*
+ * Proc_5: resets the globals at the start of a run: Ch_1_Glob becomes 'A'
+ * and Bool_Glob becomes false.
+ */
+void
+Proc_5() /* without parameters */
+/*******/
+/* executed once */
+{
+    Ch_1_Glob = 'A';
+    Bool_Glob = false;
+} /* Proc_5 */
+
+/* Procedure for the assignment of structures,          */
+/* if the C compiler doesn't support this feature       */
+/* Only compiled when NOSTRUCTASSIGN is defined. Copies l bytes from s to */
+/* d one byte at a time. This is an old-style (K&R) definition without a  */
+/* declared return type.                                                  */
+/* NOTE(review): dhry.h includes <string.h>, which already declares       */
+/* memcpy; this definition presumably conflicts with that declaration on  */
+/* current compilers - confirm before enabling NOSTRUCTASSIGN.            */
+#ifdef NOSTRUCTASSIGN
+memcpy(d, s, l) register char *d;
+register char *s;
+register int l;
+{
+    while (l--)
+        *d++ = *s++;
+}
+#endif

+ 187 - 0
tests/benchmarks/dhrystone/src/dhry_2.c

@@ -0,0 +1,187 @@
+/*
+ *************************************************************************
+ *
+ *                   "DHRYSTONE" Benchmark Program
+ *                   -----------------------------
+ *
+ *  Version:    C, Version 2.1
+ *
+ *  File:       dhry_2.c (part 3 of 3)
+ *
+ *  Date:       May 25, 1988
+ *
+ *  Author:     Reinhold P. Weicker
+ *
+ *************************************************************************
+ */
+
+#include "dhry.h"
+
+/* Use the same ROPT switch as dhry_1.c, so that one -DROPT turns the */
+/* register variables on in both translation units.                   */
+#ifndef ROPT
+#define REG
+/* REG becomes defined as empty */
+/* i.e. no register variables   */
+#else
+#define REG register
+#endif
+
+extern int Int_Glob;
+extern char Ch_1_Glob;
+
+Boolean
+Func_3(Enumeration Enum_Par_Val);
+
+/*
+ * Proc_6: maps Enum_Val_Par to a value stored through Enum_Ref_Par.
+ *
+ * The output is first set to Enum_Val_Par, or to Ident_4 when Func_3
+ * rejects it; the switch then overrides that for every value except
+ * Ident_4. Ident_2 additionally depends on Int_Glob.
+ */
+void
+Proc_6(Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par)
+/*********************************/
+/* executed once */
+/* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */
+
+{
+    *Enum_Ref_Par = Enum_Val_Par;
+    if (!Func_3(Enum_Val_Par))
+        /* then, not executed */
+        *Enum_Ref_Par = Ident_4;
+    switch (Enum_Val_Par) {
+        case Ident_1:
+            *Enum_Ref_Par = Ident_1;
+            break;
+        case Ident_2:
+            if (Int_Glob > 100)
+                /* then */
+                *Enum_Ref_Par = Ident_1;
+            else
+                *Enum_Ref_Par = Ident_4;
+            break;
+        case Ident_3: /* executed */
+            *Enum_Ref_Par = Ident_2;
+            break;
+        case Ident_4:
+            /* keeps the value assigned before the switch */
+            break;
+        case Ident_5:
+            *Enum_Ref_Par = Ident_3;
+            break;
+    } /* switch */
+} /* Proc_6 */
+
+/*
+ * Proc_7: stores Int_1_Par_Val + 2 + Int_2_Par_Val through Int_Par_Ref.
+ */
+void
+Proc_7(One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val, One_Fifty *Int_Par_Ref)
+/**********************************************/
+/* executed three times                                      */
+/* first call:      Int_1_Par_Val == 2, Int_2_Par_Val == 3,  */
+/*                  Int_Par_Ref becomes 7                    */
+/* second call:     Int_1_Par_Val == 10, Int_2_Par_Val == 5, */
+/*                  Int_Par_Ref becomes 17                   */
+/* third call:      Int_1_Par_Val == 6, Int_2_Par_Val == 10, */
+/*                  Int_Par_Ref becomes 18                   */
+
+{
+    One_Fifty Int_Loc;
+
+    Int_Loc = Int_1_Par_Val + 2;
+    *Int_Par_Ref = Int_2_Par_Val + Int_Loc;
+} /* Proc_7 */
+
+/*
+ * Proc_8: array accesses with computed indices.
+ *
+ * With Int_Loc = Int_1_Par_Val + 5 it writes elements Int_Loc, Int_Loc + 1
+ * and Int_Loc + 30 of the one-dimensional array, writes two elements of
+ * row Int_Loc and one element of row Int_Loc + 20 of the two-dimensional
+ * array, increments element [Int_Loc][Int_Loc - 1] and sets Int_Glob to 5.
+ * No bounds checks are made; the arrays declared in dhry.h have 50
+ * elements per dimension.
+ */
+void
+Proc_8(Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref, int Int_1_Par_Val,
+       int Int_2_Par_Val)
+/*********************************************************************/
+/* executed once      */
+/* Int_1_Par_Val == 3 */
+/* Int_2_Par_Val == 7 */
+
+{
+    REG One_Fifty Int_Index;
+    REG One_Fifty Int_Loc;
+
+    Int_Loc = Int_1_Par_Val + 5;
+    Arr_1_Par_Ref[Int_Loc] = Int_2_Par_Val;
+    Arr_1_Par_Ref[Int_Loc + 1] = Arr_1_Par_Ref[Int_Loc];
+    Arr_1_Par_Ref[Int_Loc + 30] = Int_Loc;
+    for (Int_Index = Int_Loc; Int_Index <= Int_Loc + 1; ++Int_Index)
+        Arr_2_Par_Ref[Int_Loc][Int_Index] = Int_Loc;
+    /* This element grows by one per call; main compares it against */
+    /* Number_Of_Runs + 10.                                         */
+    Arr_2_Par_Ref[Int_Loc][Int_Loc - 1] += 1;
+    Arr_2_Par_Ref[Int_Loc + 20][Int_Loc] = Arr_1_Par_Ref[Int_Loc];
+    Int_Glob = 5;
+} /* Proc_8 */
+
+/*
+ * Func_1: compares two characters.
+ *
+ * Returns Ident_1 when they differ. When they are equal it stores the
+ * character in Ch_1_Glob and returns Ident_2.
+ */
+Enumeration
+Func_1(Capital_Letter Ch_1_Par_Val, Capital_Letter Ch_2_Par_Val)
+/*************************************************/
+/* executed three times                                         */
+/* first call:      Ch_1_Par_Val == 'R', Ch_2_Par_Val == 'Y'    */
+/*                  (Func_2 passes Str_1_Par_Ref[2] and         */
+/*                  Str_2_Par_Ref[3], zero-based in C)          */
+/* second call:     Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C'    */
+/* third call:      Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C'    */
+
+{
+    Capital_Letter Ch_1_Loc;
+    Capital_Letter Ch_2_Loc;
+
+    Ch_1_Loc = Ch_1_Par_Val;
+    Ch_2_Loc = Ch_1_Loc;
+    if (Ch_2_Loc != Ch_2_Par_Val)
+        /* then, executed */
+        return (Ident_1);
+    else /* not executed */
+    {
+        Ch_1_Glob = Ch_1_Loc;
+        return (Ident_2);
+    }
+} /* Func_1 */
+
+/*
+ * Func_2: string comparison.
+ *
+ * Loops until Func_1 reports that Str_1_Par_Ref[2] and Str_2_Par_Ref[3]
+ * differ, which sets Ch_Loc to 'A'. It then returns true when Ch_Loc is
+ * 'R', or when strcmp orders Str_1_Par_Ref after Str_2_Par_Ref (in that
+ * case Int_Glob is also updated); otherwise it returns false.
+ */
+Boolean
+Func_2(Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref)
+/*************************************************/
+/* executed once */
+/* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */
+/* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */
+
+{
+    REG One_Thirty Int_Loc;
+    Capital_Letter Ch_Loc;
+
+    Int_Loc = 2;
+    /* Ch_Loc is only assigned inside this loop; the loop cannot end */
+    /* before that assignment because Int_Loc changes in the same    */
+    /* branch.                                                       */
+    while (Int_Loc <= 2) /* loop body executed once */
+        if (Func_1(Str_1_Par_Ref[Int_Loc], Str_2_Par_Ref[Int_Loc + 1])
+            == Ident_1)
+        /* then, executed */
+        {
+            Ch_Loc = 'A';
+            Int_Loc += 1;
+        } /* if, while */
+    if (Ch_Loc >= 'W' && Ch_Loc < 'Z')
+        /* then, not executed */
+        Int_Loc = 7;
+    if (Ch_Loc == 'R')
+        /* then, not executed */
+        return (true);
+    else /* executed */
+    {
+        if (strcmp(Str_1_Par_Ref, Str_2_Par_Ref) > 0)
+        /* then, not executed */
+        {
+            Int_Loc += 7;
+            Int_Glob = Int_Loc;
+            return (true);
+        }
+        else /* executed */
+            return (false);
+    } /* if Ch_Loc */
+} /* Func_2 */
+
+/*
+ * Func_3: returns true when Enum_Par_Val is Ident_3, false otherwise.
+ */
+Boolean
+Func_3(Enumeration Enum_Par_Val)
+/***************************/
+/* executed once        */
+/* Enum_Par_Val == Ident_3 */
+
+{
+    Enumeration Enum_Loc;
+
+    Enum_Loc = Enum_Par_Val;
+    if (Enum_Loc == Ident_3)
+        /* then, executed */
+        return (true);
+    else /* not executed */
+        return (false);
+} /* Func_3 */

+ 50 - 0
tests/benchmarks/dhrystone/test_pgo.sh

@@ -0,0 +1,50 @@
+#!/bin/sh
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Builds plain, PGO-instrumented and PGO-optimized AOT files from
+# dhrystone.wasm, then runs the native binary and the AOT files for
+# comparison.
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+
+IWASM="../../../product-mini/platforms/${PLATFORM}/build/iwasm"
+WAMRC="../../../wamr-compiler/build/wamrc"
+
+if [ ! -e "dhrystone.wasm" ]; then
+    echo "dhrystone.wasm doesn't exist, please run build.sh first"
+    # A bare "exit" would report the status of the echo above, i.e. success.
+    exit 1
+fi
+
+echo ""
+echo "Compile dhrystone.wasm to dhrystone.aot .."
+${WAMRC} -o dhrystone.aot dhrystone.wasm
+
+echo ""
+echo "Compile dhrystone.wasm to dhrystone_pgo.aot .."
+${WAMRC} --enable-llvm-pgo -o dhrystone_pgo.aot dhrystone.wasm
+
+echo ""
+echo "Run dhrystone_pgo.aot to generate the raw profile data .."
+${IWASM} --gen-prof-file=dhrystone.profraw dhrystone_pgo.aot
+
+echo ""
+echo "Merge the raw profile data to dhrystone.profdata .."
+rm -f dhrystone.profdata && llvm-profdata merge -output=dhrystone.profdata dhrystone.profraw
+
+echo ""
+echo "Compile dhrystone.wasm to dhrystone_opt.aot with the profile data .."
+${WAMRC} --use-prof-file=dhrystone.profdata -o dhrystone_opt.aot dhrystone.wasm
+
+echo ""
+echo "Run the dhrystone native"
+./dhrystone_native
+
+echo ""
+echo "Run the original aot file dhrystone.aot"
+${IWASM} dhrystone.aot
+
+echo ""
+echo "Run the PGO optimized aot file dhrystone_opt.aot"
+${IWASM} dhrystone_opt.aot
+
+# Show the profile data:
+# llvm-profdata show --all-functions --detailed-summary --binary-ids --counts \
+# --hot-func-list --memop-sizes --show-prof-sym-list dhrystone.profraw

+ 2 - 0
tests/benchmarks/jetstream/README.md

@@ -27,3 +27,5 @@ And then run `./build.sh` to build the source code, the folder `out` will be cre
 # Running
 
 Run `./run_aot.sh` to test the benchmark, the native mode and iwasm aot mode will be tested for each workload, and the file `report.txt` will be generated.
+
+Run `./test_pgo.sh` to test the benchmark with AOT static PGO (Profile-Guided Optimization) enabled. Please refer to [this section](../README.md#install-llvm-profdata) to install the `llvm-profdata` tool, and build `iwasm` with `cmake -DWAMR_BUILD_STATIC_PGO=1`.

+ 113 - 13
tests/benchmarks/jetstream/build.sh

@@ -3,27 +3,45 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+source /opt/emsdk/emsdk_env.sh
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+
 OUT_DIR=$PWD/out
 WAMRC_CMD=$PWD/../../../wamr-compiler/build/wamrc
 
 mkdir -p jetstream
+mkdir -p tsf-src
 mkdir -p ${OUT_DIR}
 
+if [[ $1 != "--no-simd" ]];then
+    NATIVE_SIMD_FLAGS="-msse2 -msse3 -msse4"
+    WASM_SIMD_FLAGS="-msimd128 -msse2 -msse3 -msse4"
+else
+    NATIVE_SIMD_FLAGS=""
+    WASM_SIMD_FLAGS=""
+fi
+
 cd jetstream
 
 echo "Download source files .."
-wget https://browserbench.org/JetStream/wasm/gcc-loops.cpp
-wget https://browserbench.org/JetStream/wasm/quicksort.c
-wget https://browserbench.org/JetStream/wasm/HashSet.cpp
-wget https://browserbench.org/JetStream/simple/float-mm.c
+# Stop with a failing status as soon as any download fails; checking $?
+# once afterwards would only cover the last wget.
+wget -N https://browserbench.org/JetStream/wasm/gcc-loops.cpp || exit 1
+wget -N https://browserbench.org/JetStream/wasm/quicksort.c || exit 1
+wget -N https://browserbench.org/JetStream/wasm/HashSet.cpp || exit 1
+wget -N https://browserbench.org/JetStream/simple/float-mm.c || exit 1
 
-patch -p1 < ../jetstream.patch
+echo "Patch source files .."
+patch -p1 -N < ../jetstream.patch
 
 echo "Build gcc-loops with g++ .."
-g++ -O3 -msse2 -msse3 -msse4 -o ${OUT_DIR}/gcc-loops_native gcc-loops.cpp
+g++ -O3 ${NATIVE_SIMD_FLAGS} -o ${OUT_DIR}/gcc-loops_native gcc-loops.cpp
 
 echo "Build gcc-loops with em++ .."
-em++ -O3 -s STANDALONE_WASM=1 -msimd128 \
+em++ -O3 -s STANDALONE_WASM=1 ${WASM_SIMD_FLAGS} \
          -s INITIAL_MEMORY=1048576 \
          -s TOTAL_STACK=32768 \
          -s "EXPORTED_FUNCTIONS=['_main']" \
@@ -33,11 +51,16 @@ em++ -O3 -s STANDALONE_WASM=1 -msimd128 \
 echo "Compile gcc-loops.wasm to gcc-loops.aot"
 ${WAMRC_CMD} -o ${OUT_DIR}/gcc-loops.aot ${OUT_DIR}/gcc-loops.wasm
 
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Compile gcc-loops.wasm to gcc-loops_segue.aot"
+    ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/gcc-loops_segue.aot ${OUT_DIR}/gcc-loops.wasm
+fi
+
 echo "Build quicksort with gcc .."
-gcc -O3 -msse2 -msse3 -msse4 -o ${OUT_DIR}/quicksort_native quicksort.c
+gcc -O3 ${NATIVE_SIMD_FLAGS} -o ${OUT_DIR}/quicksort_native quicksort.c
 
 echo "Build quicksort with emcc .."
-emcc -O3 -s STANDALONE_WASM=1 -msimd128 \
+emcc -O3 -s STANDALONE_WASM=1 ${WASM_SIMD_FLAGS} \
          -s INITIAL_MEMORY=1048576 \
          -s TOTAL_STACK=32768 \
          -s "EXPORTED_FUNCTIONS=['_main']" \
@@ -46,12 +69,17 @@ emcc -O3 -s STANDALONE_WASM=1 -msimd128 \
 echo "Compile quicksort.wasm to quicksort.aot"
 ${WAMRC_CMD} -o ${OUT_DIR}/quicksort.aot ${OUT_DIR}/quicksort.wasm
 
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Compile quicksort.wasm to quicksort_segue.aot"
+    ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/quicksort_segue.aot ${OUT_DIR}/quicksort.wasm
+fi
+
 echo "Build HashSet with g++ .."
-g++ -O3 -msse2 -msse3 -msse4 -o ${OUT_DIR}/HashSet_native HashSet.cpp \
+g++ -O3 ${NATIVE_SIMD_FLAGS} -o ${OUT_DIR}/HashSet_native HashSet.cpp \
         -lstdc++
 
 echo "Build HashSet with em++ .."
-em++ -O3 -s STANDALONE_WASM=1 -msimd128 \
+em++ -O3 -s STANDALONE_WASM=1 ${WASM_SIMD_FLAGS} \
          -s INITIAL_MEMORY=1048576 \
          -s TOTAL_STACK=32768 \
          -s "EXPORTED_FUNCTIONS=['_main']" \
@@ -60,11 +88,16 @@ em++ -O3 -s STANDALONE_WASM=1 -msimd128 \
 echo "Compile HashSet.wasm to HashSet.aot"
 ${WAMRC_CMD} -o ${OUT_DIR}/HashSet.aot ${OUT_DIR}/HashSet.wasm
 
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Compile HashSet.wasm to HashSet_segue.aot"
+    ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/HashSet_segue.aot ${OUT_DIR}/HashSet.wasm
+fi
+
 echo "Build float-mm with gcc .."
-gcc -O3 -msse2 -msse3 -msse4 -o ${OUT_DIR}/float-mm_native float-mm.c
+gcc -O3 ${NATIVE_SIMD_FLAGS} -o ${OUT_DIR}/float-mm_native float-mm.c
 
 echo "Build float-mm with emcc .."
-emcc -O3 -s STANDALONE_WASM=1 -msimd128 \
+emcc -O3 -s STANDALONE_WASM=1 ${WASM_SIMD_FLAGS} \
          -s INITIAL_MEMORY=1048576 \
          -s TOTAL_STACK=32768 \
          -s "EXPORTED_FUNCTIONS=['_main']" \
@@ -72,3 +105,70 @@ emcc -O3 -s STANDALONE_WASM=1 -msimd128 \
 
 echo "Compile float-mm.wasm to float-mm.aot"
 ${WAMRC_CMD} -o ${OUT_DIR}/float-mm.aot ${OUT_DIR}/float-mm.wasm
+
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Compile float-mm.wasm to float-mm_segue.aot"
+    ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/float-mm_segue.aot ${OUT_DIR}/float-mm.wasm
+fi
+
+cd ../tsf-src
+
+tsf_srcs="tsf_asprintf.c tsf_buffer.c tsf_error.c tsf_reflect.c tsf_st.c \
+          tsf_type.c tsf_io.c tsf_native.c tsf_generator.c tsf_st_typetable.c \
+          tsf_parser.c tsf_buf_writer.c tsf_buf_reader.c tsf_primitive.c \
+          tsf_type_table.c tsf_copier.c tsf_destructor.c tsf_gpc_code_gen.c \
+          gpc_code_gen_util.c gpc_threaded.c gpc_intable.c gpc_instruction.c \
+          gpc_program.c gpc_proto.c gpc_stack_height.c tsf_serial_in_man.c \
+          tsf_serial_out_man.c tsf_type_in_map.c tsf_type_out_map.c \
+          tsf_stream_file_input.c tsf_stream_file_output.c tsf_sort.c \
+          tsf_version.c tsf_named_type.c tsf_io_utils.c tsf_zip_attr.c \
+          tsf_zip_reader.c tsf_zip_writer.c tsf_zip_abstract.c tsf_limits.c \
+          tsf_ra_type_man.c tsf_adaptive_reader.c tsf_sha1.c tsf_sha1_writer.c \
+          tsf_fsdb.c tsf_fsdb_protocol.c tsf_define_helpers.c tsf_ir.c \
+          tsf_ir_different.c tsf_ir_speed.c"
+
+tsf_files="${tsf_srcs} config.h gpc_worklist.h \
+           tsf_config_stub.h tsf.h tsf_internal.h tsf_region.h tsf_types.h \
+           gpc.h tsf_atomics.h tsf_define_helpers.h tsf_indent.h tsf_inttypes.h \
+           tsf_serial_protocol.h tsf_util.h gpc_int_common.h tsf_build_defines.h \
+           tsf_format.h tsf_internal_config.h tsf_ir_different.h tsf_sha1.h \
+           tsf_zip_abstract.h gpc_internal.h tsf_config.h tsf_fsdb_protocol.h \
+           tsf_internal_config_stub.h tsf_ir.h tsf_st.h \
+           gpc_instruction_dispatch.gen gpc_instruction_stack_effects.gen \
+           gpc_instruction_to_string.gen gpc_instruction_size.gen \
+           gpc_instruction_static_size.gen gpc_interpreter.gen"
+
+echo "Download tsf source files .."
+for t in ${tsf_files}
+do
+    # Stop with a failing status on the first failed download. A bare
+    # "exit" after the [[ ]] test would return that test's status, i.e. 0.
+    wget -N "https://browserbench.org/JetStream/wasm/TSF/${t}" || exit 1
+done
+
+patch -p1 -N < ../tsf.patch
+
+echo "Build tsf with gcc .."
+gcc \
+    -o ${OUT_DIR}/tsf_native -O3 ${NATIVE_SIMD_FLAGS} \
+    -I. -DTSF_BUILD_SYSTEM=1 \
+    ${tsf_srcs} -lm
+
+echo "Build tsf standalone with wasi-sdk .."
+/opt/wasi-sdk/bin/clang -O3 ${WASM_SIMD_FLAGS} -z stack-size=1048576 \
+    -Wl,--initial-memory=52428800 \
+    -Wl,--export=main \
+    -Wl,--export=__heap_base,--export=__data_end \
+    -I. -DTSF_BUILD_SYSTEM=1 \
+    -Wl,--allow-undefined \
+    -o ${OUT_DIR}/tsf.wasm \
+    ${tsf_srcs}
+
+echo "Compile tsf.wasm to tsf.aot"
+${WAMRC_CMD} -o ${OUT_DIR}/tsf.aot ${OUT_DIR}/tsf.wasm
+
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Compile tsf.wasm to tsf_segue.aot"
+    ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/tsf_segue.aot ${OUT_DIR}/tsf.wasm
+fi

+ 9 - 6
tests/benchmarks/jetstream/jetstream.patch

@@ -1,15 +1,18 @@
 diff -urN jetstream-org/HashSet.cpp jetstream/HashSet.cpp
---- jetstream-org/HashSet.cpp	2020-10-30 04:12:42.000000000 +0800
-+++ jetstream/HashSet.cpp	2022-01-24 17:11:08.619831711 +0800
-@@ -24,6 +24,7 @@
+--- jetstream-org/HashSet.cpp   2020-10-30 04:12:42.000000000 +0800
++++ jetstream/HashSet.cpp   2022-01-24 17:11:08.619831711 +0800
+@@ -22,8 +22,10 @@
+
+ #include <algorithm>
  #include <memory>
++#include <limits>
  #include <stdio.h>
  #include <stdlib.h>
 +#include <string.h>
  #include <sys/time.h>
- 
+
  // Compile with: xcrun clang++ -o HashSet HashSet.cpp -O2 -W -framework Foundation -licucore -std=c++11 -fvisibility=hidden -DNDEBUG=1
-@@ -76,7 +77,7 @@
+@@ -76,7 +78,7 @@
  inline ToType bitwise_cast(FromType from)
  {
      typename std::remove_const<ToType>::type to { };
@@ -17,4 +20,4 @@ diff -urN jetstream-org/HashSet.cpp jetstream/HashSet.cpp
 +    memcpy(&to, &from, sizeof(to));
      return to;
  }
- 
+

+ 15 - 3
tests/benchmarks/jetstream/run_aot.sh

@@ -3,6 +3,8 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+PLATFORM=$(uname -s | tr A-Z a-z)
+
 CUR_DIR=$PWD
 OUT_DIR=$CUR_DIR/out
 REPORT=$CUR_DIR/report.txt
@@ -13,7 +15,7 @@ IWASM_CMD=$CUR_DIR/../../../product-mini/platforms/${PLATFORM}/build/iwasm
 
 BENCH_NAME_MAX_LEN=20
 
-JETSTREAM_CASES="gcc-loops quicksort HashSet float-mm"
+JETSTREAM_CASES="gcc-loops HashSet tsf float-mm quicksort"
 
 rm -f $REPORT
 touch $REPORT
@@ -34,7 +36,11 @@ echo "Start to run cases, the result is written to report.txt"
 
 #run benchmarks
 cd $OUT_DIR
-echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo -en "\t\t\t\t\t  native\tiwasm-aot\tiwasm-aot-segue\n" >> $REPORT
+else
+    echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+fi
 
 for t in $JETSTREAM_CASES
 do
@@ -46,7 +52,13 @@ do
 
     echo "run $t with iwasm aot .."
     echo -en "\t" >> $REPORT
-    $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD --dir=. ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    if [[ ${PLATFORM} == "linux" ]]; then
+        echo "run $t with iwasm aot segue .."
+        echo -en "\t" >> $REPORT
+        $TIME -f "real-%e-time" $IWASM_CMD --dir=. ${t}_segue.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+    fi
 
     echo -en "\n" >> $REPORT
 done

+ 87 - 0
tests/benchmarks/jetstream/test_pgo.sh

@@ -0,0 +1,87 @@
+#!/bin/bash
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+CUR_DIR=$PWD
+OUT_DIR=$CUR_DIR/out
+REPORT=$CUR_DIR/report.txt
+TIME=/usr/bin/time
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+IWASM_CMD=$CUR_DIR/../../../product-mini/platforms/${PLATFORM}/build/iwasm
+WAMRC_CMD=$CUR_DIR/../../../wamr-compiler/build/wamrc
+
+BENCH_NAME_MAX_LEN=20
+
+JETSTREAM_CASES="gcc-loops HashSet tsf float-mm quicksort"
+
+rm -f $REPORT
+touch $REPORT
+
+function print_bench_name()
+{
+    name=$1
+    echo -en "$name" >> $REPORT
+    name_len=${#name}
+    if [ $name_len -lt $BENCH_NAME_MAX_LEN ]
+    then
+        spaces=$(( $BENCH_NAME_MAX_LEN - $name_len ))
+        for i in $(eval echo "{1..$spaces}"); do echo -n " " >> $REPORT; done
+    fi
+}
+
+pushd $OUT_DIR > /dev/null 2>&1
+for t in $JETSTREAM_CASES
+do
+    if [ ! -e "${t}.wasm" ]; then
+        echo "${t}.wasm doesn't exist, please run build.sh first"
+        exit
+    fi
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}.aot .."
+    ${WAMRC_CMD} -o ${t}.aot ${t}.wasm
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_pgo.aot .."
+    ${WAMRC_CMD} --enable-llvm-pgo -o ${t}_pgo.aot ${t}.wasm
+
+    echo ""
+    echo "Run ${t}_pgo.aot to generate the raw profile data .."
+    ${IWASM_CMD} --gen-prof-file=${t}.profraw --dir=. ${t}_pgo.aot
+
+    echo ""
+    echo "Merge the raw profile data to ${t}.profdata .."
+    rm -f ${t}.profdata && llvm-profdata merge -output=${t}.profdata ${t}.profraw
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_opt.aot with the profile data .."
+    ${WAMRC_CMD} --use-prof-file=${t}.profdata -o ${t}_opt.aot ${t}.wasm
+done
+popd > /dev/null 2>&1
+
+echo "Start to run cases, the result is written to report.txt"
+
+#run benchmarks
+cd $OUT_DIR
+echo -en "\t\t\t\t\t  native\tiwasm-aot\tiwasm-aot-pgo\n" >> $REPORT
+
+for t in $JETSTREAM_CASES
+do
+    print_bench_name $t
+
+    echo "run $t with native .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" ./${t}_native 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo "run $t with iwasm aot .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD --dir=. ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo "run $t with iwasm aot opt .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD --dir=. ${t}_opt.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo -en "\n" >> $REPORT
+done

+ 24 - 0
tests/benchmarks/jetstream/tsf.patch

@@ -0,0 +1,24 @@
+diff -urN tsf-src-org/tsf_internal.h tsf-src/tsf_internal.h
+--- tsf-src-org/tsf_internal.h  2023-03-31 10:49:45.000000000 +0800
++++ tsf-src/tsf_internal.h  2023-05-11 08:18:35.000000000 +0800
+@@ -429,6 +429,7 @@
+ #endif
+             tsf_fsdb_connection_t *connection;
+ #endif
++            uint32_t __padding;
+         } remote;
+     } u;
+     tsf_limits_t *limits;
+diff -urN tsf-src-org/tsf_ir_speed.c tsf-src/tsf_ir_speed.c
+--- tsf-src-org/tsf_ir_speed.c  2023-03-31 10:49:45.000000000 +0800
++++ tsf-src/tsf_ir_speed.c  2023-05-11 08:18:35.000000000 +0800
+@@ -63,6 +63,9 @@
+         Program_t *program;
+         unsigned elementIndex;
+
++        if (!(programIndex % 100))
++            printf("##programIndex: %u\n", programIndex);
++
+         CS(program = tsf_region_create(sizeof(Program_t)));
+
+         program->globals.len = numDecls + numDefns;

+ 10 - 1
tests/benchmarks/libsodium/build.sh

@@ -16,6 +16,8 @@ libsodium_CASES="aead_aes256gcm2 aead_aes256gcm aead_chacha20poly13052 aead_chac
                  sodium_utils3 sodium_utils sodium_version stream2 stream3 stream4 stream verify1 \
                  xchacha20"
 
+PLATFORM=$(uname -s | tr A-Z a-z)
+
 readonly WAMRC_CMD=$PWD/../../../wamr-compiler/build/wamrc
 readonly OUT_DIR=$PWD/libsodium/zig-out/bin
 
@@ -34,9 +36,16 @@ zig build -Drelease-fast -Denable_benchmarks=true -Dtarget=wasm32-wasi
 for case in ${libsodium_CASES}
 do
     ${WAMRC_CMD} -o ${OUT_DIR}/${case}.aot ${OUT_DIR}/${case}.wasm
-
     if [ "$?" != 0 ]; then
         echo -e "Error while compiling ${case}.wasm to ${case}.aot"
         exit
     fi
+
+    if [[ ${PLATFORM} == "linux" ]]; then
+        ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/${case}_segue.aot ${OUT_DIR}/${case}.wasm
+        if [ "$?" != 0 ]; then
+            echo -e "Error while compiling ${case}.wasm to ${case}_segue.aot"
+            exit
+        fi
+    fi
 done

+ 39 - 6
tests/benchmarks/libsodium/test_aot.sh → tests/benchmarks/libsodium/run_aot.sh

@@ -13,12 +13,14 @@ libsodium_CASES="aead_aes256gcm2 aead_aes256gcm aead_chacha20poly13052 aead_chac
                  scalarmult6 scalarmult7 scalarmult8 scalarmult_ed25519 scalarmult_ristretto255 \
                  scalarmult secretbox2 secretbox7 secretbox8 secretbox_easy2 secretbox_easy \
                  secretbox secretstream shorthash sign siphashx24 sodium_core sodium_utils2 \
-                 sodium_utils3 sodium_utils sodium_version stream2 stream3 stream4 stream verify1 \
-                 xchacha20"
+                 sodium_utils stream2 stream3 stream4 stream verify1 xchacha20"
+
+PLATFORM=$(uname -s | tr A-Z a-z)
 
 readonly OUT_DIR=$PWD/libsodium/zig-out/bin
 readonly REPORT=$PWD/report.txt
-readonly IWASM_CMD=$PWD/../../../product-mini/platforms/linux/build/iwasm
+readonly IWASM_CMD=$PWD/../../../product-mini/platforms/${PLATFORM}/build/iwasm
+readonly TIME=/usr/bin/time
 
 BENCH_NAME_MAX_LEN=20
 
@@ -40,7 +42,11 @@ function print_bench_name()
 # run benchmarks
 cd $OUT_DIR
 
-echo -en "\t\t\t\t\t\tnative\tiwasm-aot\n" >> $REPORT
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo -en "\t\t\t\t\t\tnative\tiwasm-aot\tiwasm-aot-segue\n" >> $REPORT
+else
+    echo -en "\t\t\t\t\t\tnative\tiwasm-aot\n" >> $REPORT
+fi
 
 for t in $libsodium_CASES
 do
@@ -48,11 +54,38 @@ do
 
     echo "run $t with native..."
     echo -en "\t" >> $REPORT
-    ./${t} | awk -F '-' 'BEGIN{FIELDWIDTHS="10"}{ORS=""; print $1 / 1000000.0}' >> $REPORT
+    if [[ $t != "sodium_utils2" ]]; then
+        ./${t} | awk '{printf "%-10.2f", $0/1000000.0}' >> $REPORT
+    else
+        # sodium_utils2 doesn't print the result,
+        # use time command to get result instead
+        $TIME -f "real-%e-time" ./${t} 2>&1 | grep "real-.*-time" |
+            awk -F '-' '{printf "%-10.2f", $2}' >> $REPORT
+    fi
 
     echo "run $t with iwasm aot..."
     echo -en "\t  \t" >> $REPORT
-    $IWASM_CMD ${t}.aot | awk -F '-' 'BEGIN{FIELDWIDTHS="10"}{ORS=""; print $1 / 1000000.0}' >> $REPORT
+    if [[ $t != "sodium_utils2" ]]; then
+        $IWASM_CMD ${t}.aot | awk '{printf "%-10.2f", $0/1000000.0}' >> $REPORT
+    else
+        # sodium_utils2 doesn't print the result,
+        # use time command to get result instead
+        $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" |
+            awk -F '-' '{printf "%-10.2f", $2}' >> $REPORT
+    fi
+
+    if [[ ${PLATFORM} == "linux" ]]; then
+        echo "run $t with iwasm aot segue..."
+        echo -en "\t  \t" >> $REPORT
+        if [[ $t != "sodium_utils2" ]]; then
+            $IWASM_CMD ${t}_segue.aot | awk '{printf "%.2f", $0/1000000.0}' >> $REPORT
+        else
+            # sodium_utils2 doesn't print the result,
+            # use time command to get result instead
+            $TIME -f "real-%e-time" $IWASM_CMD ${t}_segue.aot 2>&1 | grep "real-.*-time" |
+                awk -F '-' '{printf "%.2f", $2}' >> $REPORT
+        fi
+    fi
 
     echo -en "\n" >> $REPORT
 done

+ 116 - 0
tests/benchmarks/libsodium/test_pgo.sh

@@ -0,0 +1,116 @@
+#!/bin/bash
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+libsodium_CASES="aead_aes256gcm2 aead_aes256gcm aead_chacha20poly13052 aead_chacha20poly1305 \
+                 aead_xchacha20poly1305 auth2 auth3 auth5 auth6 auth7 auth box2 box7 box8 \
+                 box_easy2 box_easy box_seal box_seed box chacha20 codecs core1 core2 core3 \
+                 core4 core5 core6 core_ed25519 core_ristretto255 ed25519_convert generichash2 \
+                 generichash3 generichash hash3 hash kdf keygen kx metamorphic misuse \
+                 onetimeauth2 onetimeauth7 onetimeauth pwhash_argon2id pwhash_argon2i \
+                 pwhash_scrypt_ll pwhash_scrypt randombytes scalarmult2 scalarmult5 \
+                 scalarmult6 scalarmult7 scalarmult8 scalarmult_ed25519 scalarmult_ristretto255 \
+                 scalarmult secretbox2 secretbox7 secretbox8 secretbox_easy2 secretbox_easy \
+                 secretbox secretstream shorthash sign siphashx24 sodium_core sodium_utils2 \
+                 sodium_utils stream2 stream3 stream4 stream verify1 xchacha20"
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+
+readonly OUT_DIR=$PWD/libsodium/zig-out/bin
+readonly REPORT=$PWD/report.txt
+readonly IWASM_CMD=$PWD/../../../product-mini/platforms/${PLATFORM}/build/iwasm
+readonly WAMRC_CMD=$PWD/../../../wamr-compiler/build/wamrc
+readonly TIME=/usr/bin/time
+
+BENCH_NAME_MAX_LEN=20
+
+rm -f $REPORT
+touch $REPORT
+
+function print_bench_name()
+{
+    name=$1
+    echo -en "$name" >> $REPORT
+    name_len=${#name}
+    if [ $name_len -lt $BENCH_NAME_MAX_LEN ]
+    then
+        spaces=$(( $BENCH_NAME_MAX_LEN - $name_len ))
+        for i in $(eval echo "{1..$spaces}"); do echo -n " " >> $REPORT; done
+    fi
+}
+
+pushd $OUT_DIR > /dev/null 2>&1
+for t in $libsodium_CASES
+do
+    if [ ! -e "${t}.wasm" ]; then
+        echo "${t}.wasm doesn't exist, please run build.sh first"
+        exit
+    fi
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}.aot .."
+    ${WAMRC_CMD} -o ${t}.aot ${t}.wasm
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_pgo.aot .."
+    ${WAMRC_CMD} --enable-llvm-pgo -o ${t}_pgo.aot ${t}.wasm
+
+    echo ""
+    echo "Run ${t}_pgo.aot to generate the raw profile data .."
+    ${IWASM_CMD} --gen-prof-file=${t}.profraw --dir=. ${t}_pgo.aot
+
+    echo ""
+    echo "Merge the raw profile data to ${t}.profdata .."
+    rm -f ${t}.profdata && llvm-profdata merge -output=${t}.profdata ${t}.profraw
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_opt.aot with the profile data .."
+    ${WAMRC_CMD} --use-prof-file=${t}.profdata -o ${t}_opt.aot ${t}.wasm
+done
+
+# run benchmarks
+cd $OUT_DIR
+
+echo -en "\t\t\t\t\t\tnative\tiwasm-aot\tiwasm-aot-pgo\n" >> $REPORT
+
+for t in $libsodium_CASES
+do
+    print_bench_name $t
+
+    echo "run $t with native..."
+    echo -en "\t" >> $REPORT
+    if [[ $t != "sodium_utils2" ]]; then
+        ./${t} | awk '{printf "%-10.2f", $0/1000000.0}' >> $REPORT
+    else
+        # sodium_utils2 doesn't print the result,
+        # use time command to get result instead
+        $TIME -f "real-%e-time" ./${t} 2>&1 | grep "real-.*-time" |
+            awk -F '-' '{printf "%-10.2f", $2}' >> $REPORT
+    fi
+
+    echo "run $t with iwasm aot..."
+    echo -en "\t  \t" >> $REPORT
+    if [[ $t != "sodium_utils2" ]]; then
+        $IWASM_CMD ${t}.aot | awk '{printf "%-10.2f", $0/1000000.0}' >> $REPORT
+    else
+        # sodium_utils2 doesn't print the result,
+        # use time command to get result instead
+        $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" |
+            awk -F '-' '{printf "%-10.2f", $2}' >> $REPORT
+    fi
+
+    echo "run $t with iwasm aot opt..."
+    echo -en "\t  \t" >> $REPORT
+    if [[ $t != "sodium_utils2" ]]; then
+        $IWASM_CMD ${t}_opt.aot | awk '{printf "%-10.2f", $0/1000000.0}' >> $REPORT
+    else
+        # sodium_utils2 doesn't print the result,
+        # use time command to get result instead
+        $TIME -f "real-%e-time" $IWASM_CMD ${t}_opt.aot 2>&1 | grep "real-.*-time" |
+            awk -F '-' '{printf "%-10.2f", $2}' >> $REPORT
+    fi
+
+    echo -en "\n" >> $REPORT
+done
+

+ 8 - 0
tests/benchmarks/polybench/build.sh

@@ -3,6 +3,8 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+PLATFORM=$(uname -s | tr A-Z a-z)
+
 OUT_DIR=$PWD/out
 WAMRC_CMD=$PWD/../../../wamr-compiler/build/wamrc
 POLYBENCH_CASES="datamining linear-algebra medley stencils"
@@ -40,6 +42,12 @@ do
         echo "Compile ${file_name%.*}.wasm into ${file_name%.*}.aot"
         ${WAMRC_CMD} -o ${OUT_DIR}/${file_name%.*}.aot \
                 ${OUT_DIR}/${file_name%.*}.wasm
+
+        if [[ ${PLATFORM} == "linux" ]]; then
+            echo "Compile ${file_name%.*}.wasm into ${file_name%.*}_segue.aot"
+            ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/${file_name%.*}_segue.aot \
+                    ${OUT_DIR}/${file_name%.*}.wasm
+        fi
     done
 done
 

+ 11 - 1
tests/benchmarks/polybench/run_aot.sh

@@ -37,7 +37,11 @@ echo "Start to run cases, the result is written to report.txt"
 
 #run benchmarks
 cd $OUT_DIR
-echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo -en "\t\t\t\t\t  native\tiwasm-aot\tiwasm-aot-segue\n" >> $REPORT
+else
+    echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+fi
 
 for t in $POLYBENCH_CASES
 do
@@ -51,5 +55,11 @@ do
     echo -en "\t" >> $REPORT
     $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
 
+    if [[ ${PLATFORM} == "linux" ]]; then
+        echo "run $t with iwasm aot segue .."
+        echo -en "\t" >> $REPORT
+        $TIME -f "real-%e-time" $IWASM_CMD ${t}_segue.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+    fi
+
     echo -en "\n" >> $REPORT
 done

+ 1 - 1
tests/benchmarks/polybench/run_interp.sh

@@ -37,7 +37,7 @@ echo "Start to run cases, the result is written to report.txt"
 
 #run benchmarks
 cd $OUT_DIR
-echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+echo -en "\t\t\t\t\t  native\tiwasm-interp\n" >> $REPORT
 
 for t in $POLYBENCH_CASES
 do

+ 90 - 0
tests/benchmarks/polybench/test_pgo.sh

@@ -0,0 +1,90 @@
+#!/bin/bash
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+CUR_DIR=$PWD
+OUT_DIR=$CUR_DIR/out
+REPORT=$CUR_DIR/report.txt
+TIME=/usr/bin/time
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+IWASM_CMD=$CUR_DIR/../../../product-mini/platforms/${PLATFORM}/build/iwasm
+WAMRC_CMD=$CUR_DIR/../../../wamr-compiler/build/wamrc
+
+BENCH_NAME_MAX_LEN=20
+
+POLYBENCH_CASES="2mm 3mm adi atax bicg cholesky correlation covariance \
+                 deriche doitgen durbin fdtd-2d floyd-warshall gemm gemver \
+                 gesummv gramschmidt heat-3d jacobi-1d jacobi-2d ludcmp lu \
+                 mvt nussinov seidel-2d symm syr2k syrk trisolv trmm"
+
+rm -f $REPORT
+touch $REPORT
+
+function print_bench_name()
+{
+    name=$1
+    echo -en "$name" >> $REPORT
+    name_len=${#name}
+    if [ $name_len -lt $BENCH_NAME_MAX_LEN ]
+    then
+        spaces=$(( $BENCH_NAME_MAX_LEN - $name_len ))
+        for i in $(eval echo "{1..$spaces}"); do echo -n " " >> $REPORT; done
+    fi
+}
+
+pushd $OUT_DIR > /dev/null 2>&1
+for t in $POLYBENCH_CASES
+do
+    if [ ! -e "${t}.wasm" ]; then
+        echo "${t}.wasm doesn't exist, please run build.sh first"
+        exit
+    fi
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}.aot .."
+    ${WAMRC_CMD} -o ${t}.aot ${t}.wasm
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_pgo.aot .."
+    ${WAMRC_CMD} --enable-llvm-pgo -o ${t}_pgo.aot ${t}.wasm
+
+    echo ""
+    echo "Run ${t}_pgo.aot to generate the raw profile data .."
+    ${IWASM_CMD} --gen-prof-file=${t}.profraw --dir=. ${t}_pgo.aot
+
+    echo ""
+    echo "Merge the raw profile data to ${t}.profdata .."
+    rm -f ${t}.profdata && llvm-profdata merge -output=${t}.profdata ${t}.profraw
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_opt.aot with the profile data .."
+    ${WAMRC_CMD} --use-prof-file=${t}.profdata -o ${t}_opt.aot ${t}.wasm
+done
+popd > /dev/null 2>&1
+
+echo "Start to run cases, the result is written to report.txt"
+
+#run benchmarks
+cd $OUT_DIR
+echo -en "\t\t\t\t\t  native\tiwasm-aot\tiwasm-aot-pgo\n" >> $REPORT
+
+for t in $POLYBENCH_CASES
+do
+    print_bench_name $t
+
+    echo "run $t with native .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" ./${t}_native 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo "run $t with iwasm aot .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo "run $t with iwasm aot opt .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD ${t}_opt.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo -en "\n" >> $REPORT
+done

+ 2 - 0
tests/benchmarks/sightglass/README.md

@@ -19,3 +19,5 @@ And then run `./build.sh` to build the source code, the folder `out` will be cre
 Run `./run_aot.sh` to test the benchmark, the native mode and iwasm aot mode will be tested for each workload, and the file `report.txt` will be generated.
 
 Run `./run_interp.sh` to test the benchmark, the native mode and iwasm interpreter mode will be tested for each workload, and the file `report.txt` will be generated.
+
+Run `./test_pgo.sh` to test the benchmark with AOT static PGO (Profile-Guided Optimization) enabled. Please refer to [this section](../README.md#install-llvm-profdata) to install the `llvm-profdata` tool and to build `iwasm` with `cmake -DWAMR_BUILD_STATIC_PGO=1`.

+ 6 - 1
tests/benchmarks/sightglass/build.sh

@@ -3,6 +3,8 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+PLATFORM=$(uname -s | tr A-Z a-z)
+
 OUT_DIR=$PWD/out
 WAMRC_CMD=$PWD/../../../wamr-compiler/build/wamrc
 SHOOTOUT_CASES="base64 fib2 gimli heapsort matrix memmove nestedloop \
@@ -34,9 +36,12 @@ do
         -Wl,--export=app_main -Wl,--export=_start \
         ${bench}.c main/main_${bench}.c main/my_libc.c
 
-
     echo "Compile ${bench}.wasm into ${bench}.aot"
     ${WAMRC_CMD} -o ${OUT_DIR}/${bench}.aot ${OUT_DIR}/${bench}.wasm
+    if [[ ${PLATFORM} == "linux" ]]; then
+        echo "Compile ${bench}.wasm into ${bench}_segue.aot"
+        ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/${bench}_segue.aot ${OUT_DIR}/${bench}.wasm
+    fi
 done
 
 cd ..

+ 11 - 1
tests/benchmarks/sightglass/run_aot.sh

@@ -36,7 +36,11 @@ echo "Start to run cases, the result is written to report.txt"
 
 #run benchmarks
 cd $OUT_DIR
-echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo -en "\t\t\t\t\t  native\tiwasm-aot\tiwasm-aot-segue\n" >> $REPORT
+else
+    echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+fi
 
 for t in $SHOOTOUT_CASES
 do
@@ -50,5 +54,11 @@ do
     echo -en "\t" >> $REPORT
     $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
 
+    if [[ ${PLATFORM} == "linux" ]]; then
+        echo "run $t with iwasm aot segue .."
+        echo -en "\t" >> $REPORT
+        $TIME -f "real-%e-time" $IWASM_CMD ${t}_segue.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+    fi
+
     echo -en "\n" >> $REPORT
 done

+ 2 - 2
tests/benchmarks/sightglass/run_interp.sh

@@ -46,9 +46,9 @@ do
     echo -en "\t" >> $REPORT
     $TIME -f "real-%e-time" ./${t}_native 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
 
-    echo "run $t with iwasm aot .."
+    echo "run $t with iwasm interp .."
     echo -en "\t" >> $REPORT
-    $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD ${t}.wasm 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
 
     echo -en "\n" >> $REPORT
 done

+ 89 - 0
tests/benchmarks/sightglass/test_pgo.sh

@@ -0,0 +1,89 @@
+#!/bin/bash
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+CUR_DIR=$PWD
+OUT_DIR=$CUR_DIR/out
+REPORT=$CUR_DIR/report.txt
+TIME=/usr/bin/time
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+IWASM_CMD=$CUR_DIR/../../../product-mini/platforms/${PLATFORM}/build/iwasm
+WAMRC_CMD=$CUR_DIR/../../../wamr-compiler/build/wamrc
+
+BENCH_NAME_MAX_LEN=20
+
+SHOOTOUT_CASES="base64 fib2 gimli heapsort matrix memmove nestedloop \
+                nestedloop2 nestedloop3 random seqhash sieve strchr \
+                switch2"
+
+rm -f $REPORT
+touch $REPORT
+
+function print_bench_name()
+{
+    name=$1
+    echo -en "$name" >> $REPORT
+    name_len=${#name}
+    if [ $name_len -lt $BENCH_NAME_MAX_LEN ]
+    then
+        spaces=$(( $BENCH_NAME_MAX_LEN - $name_len ))
+        for i in $(eval echo "{1..$spaces}"); do echo -n " " >> $REPORT; done
+    fi
+}
+
+pushd $OUT_DIR > /dev/null 2>&1
+for t in $SHOOTOUT_CASES
+do
+    if [ ! -e "${t}.wasm" ]; then
+        echo "${t}.wasm doesn't exist, please run build.sh first"
+        exit
+    fi
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}.aot .."
+    ${WAMRC_CMD} -o ${t}.aot ${t}.wasm
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_pgo.aot .."
+    ${WAMRC_CMD} --enable-llvm-pgo -o ${t}_pgo.aot ${t}.wasm
+
+    echo ""
+    echo "Run ${t}_pgo.aot to generate the raw profile data .."
+    ${IWASM_CMD} --gen-prof-file=${t}.profraw --dir=. ${t}_pgo.aot
+
+    echo ""
+    echo "Merge the raw profile data to ${t}.profdata .."
+    rm -f ${t}.profdata && llvm-profdata merge -output=${t}.profdata ${t}.profraw
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_opt.aot with the profile data .."
+    ${WAMRC_CMD} --use-prof-file=${t}.profdata -o ${t}_opt.aot ${t}.wasm
+done
+popd > /dev/null 2>&1
+
+echo "Start to run cases, the result is written to report.txt"
+
+#run benchmarks
+cd $OUT_DIR
+echo -en "\t\t\t\t\t  native\tiwasm-aot\tiwasm-aot-pgo\n" >> $REPORT
+
+for t in $SHOOTOUT_CASES
+do
+    print_bench_name $t
+
+    echo "run $t with native .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" ./${t}_native 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo "run $t with iwasm aot .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo "run $t with iwasm aot opt .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD ${t}_opt.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo -en "\n" >> $REPORT
+done

+ 20 - 3
tests/wamr-test-suites/spec-test-script/thread_proposal_ignore_cases.patch

@@ -1,5 +1,22 @@
+diff --git a/test/core/atomic_wait_notify.wast b/test/core/atomic_wait_notify.wast
+index 2e312c3..4f35ac5 100644
+--- a/test/core/atomic_wait_notify.wast
++++ b/test/core/atomic_wait_notify.wast
+@@ -70,6 +70,7 @@
+   (memory (export "shared") 1 1 shared)
+ )
+ 
++(;
+ (thread $T1 (shared (module $Mem))
+   (register "mem" $Mem)
+   (module
+@@ -106,3 +107,4 @@
+ 
+ (wait $T1)
+ (wait $T2)
++;)
 diff --git a/test/core/binary.wast b/test/core/binary.wast
-index b9fa438c..a5711dd3 100644
+index b9fa438..a5711dd 100644
 --- a/test/core/binary.wast
 +++ b/test/core/binary.wast
 @@ -45,7 +45,7 @@
@@ -163,7 +180,7 @@ index b9fa438c..a5711dd3 100644
  ;; 1 elem segment declared, 2 given
  (assert_malformed
 diff --git a/test/core/elem.wast b/test/core/elem.wast
-index 1ea2b061..8eded377 100644
+index 1ea2b06..8eded37 100644
 --- a/test/core/elem.wast
 +++ b/test/core/elem.wast
 @@ -12,10 +12,10 @@
@@ -195,7 +212,7 @@ index 1ea2b061..8eded377 100644
  (assert_return (invoke $module1 "call-9") (i32.const 70))
 +;)
 diff --git a/test/core/thread.wast b/test/core/thread.wast
-index c3456a61..83fc2815 100644
+index c3456a6..83fc281 100644
 --- a/test/core/thread.wast
 +++ b/test/core/thread.wast
 @@ -2,6 +2,7 @@

+ 19 - 2
tests/wamr-test-suites/test_wamr.sh

@@ -430,7 +430,7 @@ function spec_test()
 
     # sgx only enable in interp mode and aot mode
     if [[ ${SGX_OPT} == "--sgx" ]];then
-        if [[ $1 == 'classic-interp' || $1 == 'fast-interp' || $1 == 'aot' ]]; then
+        if [[ $1 == 'classic-interp' || $1 == 'fast-interp' || $1 == 'aot' || $1 == 'fast-jit' ]]; then
           ARGS_FOR_SPEC_TEST+="-x "
         fi
     fi
@@ -628,7 +628,7 @@ function standalone_test()
 
 function build_iwasm_with_cfg()
 {
-    echo "Build iwasm with compile flags with " $* " for spec test" \
+    echo "Build iwasm with compile flags " $* " for spec test" \
         | tee -a ${REPORT_DIR}/spec_test_report.txt
 
     if [[ ${SGX_OPT} == "--sgx" ]];then
@@ -754,6 +754,23 @@ function trigger()
         EXTRA_COMPILE_FLAGS+=" -DWAMR_BUILD_LIB_WASI_THREADS=1"
     fi
 
+    echo "SANITIZER IS" $WAMR_BUILD_SANITIZER
+
+    if [[ "$WAMR_BUILD_SANITIZER" == "ubsan" ]]; then
+        echo "Setting run with ubsan"
+        EXTRA_COMPILE_FLAGS+=" -DWAMR_BUILD_SANITIZER=ubsan"
+    fi
+
+    if [[ "$WAMR_BUILD_SANITIZER" == "asan" ]]; then
+        echo "Setting run with asan"
+        EXTRA_COMPILE_FLAGS+=" -DWAMR_BUILD_SANITIZER=asan"
+    fi
+
+    if [[ "$WAMR_BUILD_SANITIZER" == "tsan" ]]; then
+        echo "Setting run with tsan"
+        EXTRA_COMPILE_FLAGS+=" -DWAMR_BUILD_SANITIZER=tsan"
+    fi
+
     for t in "${TYPE[@]}"; do
         case $t in
             "classic-interp")

+ 77 - 1
wamr-compiler/main.c

@@ -65,6 +65,14 @@ print_help()
     printf("  --enable-indirect-mode    Enalbe call function through symbol table but not direct call\n");
     printf("  --disable-llvm-intrinsics Disable the LLVM built-in intrinsics\n");
     printf("  --disable-llvm-lto        Disable the LLVM link time optimization\n");
+    printf("  --enable-llvm-pgo         Enable LLVM PGO (Profile-Guided Optimization)\n");
+    printf("  --use-prof-file=<file>    Use profile file collected by LLVM PGO (Profile-Guided Optimization)\n");
+    printf("  --enable-segue[=<flags>]  Enable using segment register GS as the base address of linear memory,\n");
+    printf("                            only available on linux/linux-sgx x86-64, which may improve performance,\n");
+    printf("                            flags can be: i32.load, i64.load, f32.load, f64.load, v128.load,\n");
+    printf("                                          i32.store, i64.store, f32.store, f64.store, v128.store\n");
+    printf("                            Use comma to separate, e.g. --enable-segue=i32.load,i64.store\n");
+    printf("                            and --enable-segue means all flags are added.\n");
     printf("  --emit-custom-sections=<section names>\n");
     printf("                            Emit the specified custom sections to AoT file, using comma to separate\n");
     printf("                            multiple names, e.g.\n");
@@ -84,7 +92,7 @@ print_help()
     } while (0)
 
 /**
- * Split a strings into an array of strings
+ * Split a string into an array of strings
  * Returns NULL on failure
  * Memory must be freed by caller
  * Based on: http://stackoverflow.com/a/11198630/471795
@@ -126,6 +134,57 @@ split_string(char *str, int *count, const char *delimer)
     return res;
 }
 
+static uint32
+resolve_segue_flags(char *str_flags)
+{
+    uint32 segue_flags = 0;
+    int32 flag_count, i;
+    char **flag_list;
+
+    flag_list = split_string(str_flags, &flag_count, ",");
+    if (flag_list) {
+        for (i = 0; i < flag_count; i++) {
+            if (!strcmp(flag_list[i], "i32.load")) {
+                segue_flags |= 1 << 0;
+            }
+            else if (!strcmp(flag_list[i], "i64.load")) {
+                segue_flags |= 1 << 1;
+            }
+            else if (!strcmp(flag_list[i], "f32.load")) {
+                segue_flags |= 1 << 2;
+            }
+            else if (!strcmp(flag_list[i], "f64.load")) {
+                segue_flags |= 1 << 3;
+            }
+            else if (!strcmp(flag_list[i], "v128.load")) {
+                segue_flags |= 1 << 4;
+            }
+            else if (!strcmp(flag_list[i], "i32.store")) {
+                segue_flags |= 1 << 8;
+            }
+            else if (!strcmp(flag_list[i], "i64.store")) {
+                segue_flags |= 1 << 9;
+            }
+            else if (!strcmp(flag_list[i], "f32.store")) {
+                segue_flags |= 1 << 10;
+            }
+            else if (!strcmp(flag_list[i], "f64.store")) {
+                segue_flags |= 1 << 11;
+            }
+            else if (!strcmp(flag_list[i], "v128.store")) {
+                segue_flags |= 1 << 12;
+            }
+            else {
+                /* invalid flag */
+                segue_flags = (uint32)-1;
+                break;
+            }
+        }
+        free(flag_list);
+    }
+    return segue_flags;
+}
+
 int
 main(int argc, char *argv[])
 {
@@ -272,6 +331,23 @@ main(int argc, char *argv[])
         else if (!strcmp(argv[0], "--disable-llvm-lto")) {
             option.disable_llvm_lto = true;
         }
+        else if (!strcmp(argv[0], "--enable-llvm-pgo")) {
+            option.enable_llvm_pgo = true;
+        }
+        else if (!strncmp(argv[0], "--use-prof-file=", 16)) {
+            if (argv[0][16] == '\0')
+                PRINT_HELP_AND_EXIT();
+            option.use_prof_file = argv[0] + 16;
+        }
+        else if (!strcmp(argv[0], "--enable-segue")) {
+            /* all flags are enabled */
+            option.segue_flags = 0x1F1F;
+        }
+        else if (!strncmp(argv[0], "--enable-segue=", 15)) {
+            option.segue_flags = resolve_segue_flags(argv[0] + 15);
+            if (option.segue_flags == (uint32)-1)
+                PRINT_HELP_AND_EXIT();
+        }
         else if (!strncmp(argv[0], "--emit-custom-sections=", 23)) {
             int len = 0;
             if (option.custom_sections) {