Просмотр исходного кода

Merge pull request #2194 from bytecodealliance/main

Merge branch main into gitbook
Wenyong Huang 2 года назад
Родитель
Commit
6f725bd4ab
60 измененных файлов с 2843 добавлено и 353 удалено
  1. 1 1
      .github/scripts/fetch_and_compare_version.py
  2. 26 10
      .github/workflows/build_wamr_lldb.yml
  3. 5 2
      .github/workflows/build_wamr_vscode_ext.yml
  4. 19 21
      .github/workflows/compilation_on_android_ubuntu.yml
  5. 4 2
      .github/workflows/compilation_on_macos.yml
  6. 2 2
      .github/workflows/compilation_on_nuttx.yml
  7. 4 2
      .github/workflows/compilation_on_sgx.yml
  8. 2 2
      .github/workflows/compilation_on_windows.yml
  9. 1 1
      .github/workflows/create_tag.yml
  10. 1 1
      README.md
  11. 2 2
      assembly-script/wamr_app_lib/request.ts
  12. 7 0
      build-scripts/config_common.cmake
  13. 0 20
      build-scripts/runtime_lib.cmake
  14. 14 1
      core/iwasm/common/wasm_c_api.c
  15. 2 2
      core/iwasm/compilation/aot_emit_control.c
  16. 4 2
      core/iwasm/compilation/aot_llvm.c
  17. 1 0
      core/iwasm/compilation/aot_llvm.h
  18. 108 0
      core/iwasm/compilation/aot_llvm_extra2.cpp
  19. 17 0
      core/iwasm/compilation/aot_llvm_extra2.h
  20. 1455 129
      core/iwasm/fast-jit/cg/x86-64/jit_codegen_x86_64.cpp
  21. 55 0
      core/iwasm/fast-jit/fe/jit_emit_control.c
  22. 31 0
      core/iwasm/fast-jit/fe/jit_emit_function.c
  23. 346 28
      core/iwasm/fast-jit/fe/jit_emit_memory.c
  24. 3 0
      core/iwasm/fast-jit/fe/jit_emit_memory.h
  25. 6 1
      core/iwasm/fast-jit/jit_dump.c
  26. 58 7
      core/iwasm/fast-jit/jit_frontend.c
  27. 11 0
      core/iwasm/fast-jit/jit_frontend.h
  28. 17 1
      core/iwasm/fast-jit/jit_ir.c
  29. 44 0
      core/iwasm/fast-jit/jit_ir.def
  30. 23 15
      core/iwasm/fast-jit/jit_ir.h
  31. 14 0
      core/iwasm/fast-jit/jit_regalloc.c
  32. 1 0
      core/iwasm/include/aot_export.h
  33. 2 39
      core/iwasm/libraries/lib-pthread/lib_pthread_wrapper.c
  34. 3 0
      core/iwasm/libraries/lib-wasi-threads/lib_wasi_threads_wrapper.c
  35. 7 1
      core/iwasm/libraries/lib-wasi-threads/test/build.sh
  36. 94 0
      core/iwasm/libraries/lib-wasi-threads/test/linear_memory_size_update.c
  37. 44 0
      core/iwasm/libraries/lib-wasi-threads/test/trap_after_main_thread_finishes.c
  38. 3 0
      core/iwasm/libraries/lib-wasi-threads/test/trap_after_main_thread_finishes.json
  39. 12 6
      core/iwasm/libraries/libc-wasi/libc_wasi_wrapper.c
  40. 49 0
      core/iwasm/libraries/thread-mgr/thread_manager.c
  41. 5 0
      core/iwasm/libraries/thread-mgr/thread_manager.h
  42. 13 0
      core/iwasm/libraries/wasi-nn/README.md
  43. 41 0
      core/iwasm/libraries/wasi-nn/cmake/Findtensorflow_lite.cmake
  44. 40 3
      core/iwasm/libraries/wasi-nn/src/wasi_nn_tensorflowlite.cpp
  45. 99 0
      core/iwasm/libraries/wasi-nn/test/Dockerfile.vx-delegate
  46. 5 0
      core/iwasm/libraries/wasi-nn/wasi_nn.cmake
  47. 14 8
      core/shared/mem-alloc/ems/ems_alloc.c
  48. 46 4
      core/shared/mem-alloc/ems/ems_gc_internal.h
  49. 30 8
      core/shared/mem-alloc/ems/ems_kfc.c
  50. 1 1
      doc/embed_wamr.md
  51. 2 2
      language-bindings/python/README.md
  52. 10 14
      product-mini/README.md
  53. 5 1
      product-mini/platforms/posix/main.c
  54. 5 1
      product-mini/platforms/windows/main.c
  55. 10 1
      samples/wasm-c-api/CMakeLists.txt
  56. 1 1
      test-tools/wamr-ide/VSCode-Extension/src/utilities/lldbUtilities.ts
  57. 2 1
      tests/benchmarks/polybench/build.sh
  58. 0 8
      tests/wamr-test-suites/test_wamr.sh
  59. 11 2
      wamr-compiler/README.md
  60. 5 0
      wamr-compiler/main.c

+ 1 - 1
.github/scripts/fetch_and_compare_version.py

@@ -12,7 +12,7 @@ import sys
 
 def fetch_version_from_code():
     """
-    search the semantic version definition in build-scripts/config_common.cmake
+    search the semantic version definition in core/version.h
     """
     major, minor, patch = "", "", ""
     with open("core/version.h", encoding="utf-8") as f:

+ 26 - 10
.github/workflows/build_wamr_lldb.yml

@@ -96,14 +96,23 @@ jobs:
           cmake -S ./llvm -B build \
             -G Ninja \
             -DCMAKE_INSTALL_PREFIX=../wamr-lldb \
-            -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang;lldb" \
-            -DLLVM_TARGETS_TO_BUILD=X86 \
-            -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DLLVM_BUILD_BENCHMARKS:BOOL=OFF \
-            -DLLVM_BUILD_DOCS:BOOL=OFF  -DLLVM_BUILD_EXAMPLES:BOOL=OFF  \
-            -DLLVM_BUILD_LLVM_DYLIB:BOOL=OFF  -DLLVM_BUILD_TESTS:BOOL=OFF  \
-            -DLLVM_ENABLE_BINDINGS:BOOL=OFF  -DLLVM_INCLUDE_BENCHMARKS:BOOL=OFF  \
-            -DLLVM_INCLUDE_DOCS:BOOL=OFF  -DLLVM_INCLUDE_EXAMPLES:BOOL=OFF  \
-            -DLLVM_INCLUDE_TESTS:BOOL=OFF -DLLVM_ENABLE_LLD:BOOL=ON
+            -DCMAKE_BUILD_TYPE:STRING="Release" \
+            -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+            -DLLVM_ENABLE_PROJECTS="clang;lldb" \
+            -DLLVM_TARGETS_TO_BUILD:STRING="X86;WebAssembly" \
+            -DLLVM_BUILD_BENCHMARKS:BOOL=OFF \
+            -DLLVM_BUILD_DOCS:BOOL=OFF \
+            -DLLVM_BUILD_EXAMPLES:BOOL=OFF  \
+            -DLLVM_BUILD_LLVM_DYLIB:BOOL=OFF \
+            -DLLVM_BUILD_TESTS:BOOL=OFF  \
+            -DLLVM_INCLUDE_BENCHMARKS:BOOL=OFF  \
+            -DLLVM_INCLUDE_DOCS:BOOL=OFF \
+            -DLLVM_INCLUDE_EXAMPLES:BOOL=OFF \
+            -DLLVM_INCLUDE_TESTS:BOOL=OFF \
+            -DLLVM_ENABLE_BINDINGS:BOOL=OFF \
+            -DLLVM_ENABLE_LIBXML2:BOOL=ON \
+            -DLLDB_ENABLE_PYTHON:BOOL=OFF \
+            -DLLVM_ENABLE_LLD:BOOL=ON
           cmake --build build --target lldb install --parallel $(nproc)
         working-directory: core/deps/llvm-project
 
@@ -118,13 +127,20 @@ jobs:
             -DCMAKE_BUILD_TYPE:STRING="Release" \
             -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
             -DLLVM_ENABLE_PROJECTS="clang;lldb" \
+            -DLLVM_TARGETS_TO_BUILD:STRING="X86;WebAssembly" \
+            -DLLVM_BUILD_BENCHMARKS:BOOL=OFF \
+            -DLLVM_BUILD_DOCS:BOOL=OFF \
+            -DLLVM_BUILD_EXAMPLES:BOOL=OFF  \
+            -DLLVM_BUILD_LLVM_DYLIB:BOOL=OFF \
+            -DLLVM_BUILD_TESTS:BOOL=OFF  \
+            -DLLVM_INCLUDE_BENCHMARKS:BOOL=OFF  \
+            -DLLVM_INCLUDE_DOCS:BOOL=OFF \
+            -DLLVM_INCLUDE_EXAMPLES:BOOL=OFF \
             -DLLVM_INCLUDE_TESTS:BOOL=OFF \
-            -DLLVM_INCLUDE_EXAMPLES:BOOL=OFF  \
             -DLLVM_BUILD_BENCHMARKS:BOOL=OFF \
             -DLLVM_BUILD_DOCS:BOOL=OFF \
             -DLLVM_BUILD_LLVM_DYLIB:BOOL=OFF \
             -DLLVM_ENABLE_BINDINGS:BOOL=OFF \
-            -DLLVM_TARGETS_TO_BUILD:STRING="X86;WebAssembly" \
             -DLLVM_ENABLE_LIBXML2:BOOL=ON \
             -DLLDB_ENABLE_PYTHON:BOOL=OFF \
             -DLLDB_BUILD_FRAMEWORK:BOOL=OFF

+ 5 - 2
.github/workflows/build_wamr_vscode_ext.yml

@@ -32,13 +32,16 @@ jobs:
         working-directory: test-tools/wamr-ide/VSCode-Extension
 
       - name: generate wamr ide vscode extension
-        env:
-          credentials: ${{ secrets.TOKEN }}
         run: |
           npm install -g vsce
           rm -rf node_modules
           npm install
           vsce package
+        working-directory: test-tools/wamr-ide/VSCode-Extension
+
+      - name: publish wamr ide vscode extension to the vsce marketplace
+        if: ${{ github.repository == 'bytecodealliance/wasm-micro-runtime' }}
+        run: |
           vsce publish -p ${{ secrets.TOKEN }}
         working-directory: test-tools/wamr-ide/VSCode-Extension
 

+ 19 - 21
.github/workflows/compilation_on_android_ubuntu.yml

@@ -10,7 +10,8 @@ on:
       - opened
       - synchronize
     paths:
-      - ".github/**"
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_android_ubuntu.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
@@ -26,7 +27,8 @@ on:
       - main
       - "dev/**"
     paths:
-      - ".github/**"
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_android_ubuntu.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
@@ -125,8 +127,8 @@ jobs:
             # Running mode
             $CLASSIC_INTERP_BUILD_OPTIONS,
             $FAST_INTERP_BUILD_OPTIONS,
-            $FAST_JIT_BUILD_OPTIONS
-        ]
+            $FAST_JIT_BUILD_OPTIONS,
+          ]
         make_options_feature: [
             # Features
             "-DWAMR_BUILD_CUSTOM_NAME_SECTION=1",
@@ -338,7 +340,9 @@ jobs:
       ]
     runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
+        sanitizer: ["", "ubsan"]
         make_options: [
             # Running mode
             $AOT_BUILD_OPTIONS,
@@ -363,6 +367,7 @@ jobs:
             llvm_cache_key: ${{ needs.build_llvm_libraries_on_ubuntu_2004.outputs.cache_key }}
           - os: ubuntu-22.04
             llvm_cache_key: ${{ needs.build_llvm_libraries_on_ubuntu_2204.outputs.cache_key }}
+
     steps:
       - name: checkout
         uses: actions/checkout@v3
@@ -395,15 +400,16 @@ jobs:
         if: (!endsWith(matrix.make_options, '_INTERP_BUILD_OPTIONS'))
         run: |
           mkdir build && cd build
-          cmake ..
+          cmake -DSANITIZER="${{matrix.sanitizer}}" ..
           cmake --build . --config Release --parallel 4
         working-directory: wamr-compiler
 
       - name: Build Sample [wasm-c-api]
         run: |
-          cmake -S . -B build ${{ matrix.make_options }}
+          VERBOSE=1
+          cmake -S . -B build ${{ matrix.make_options }} -DSANITIZER="${{matrix.sanitizer}}"
           cmake --build build --config Release --parallel 4
-          ctest --test-dir build
+          ctest --test-dir build --output-on-failure
         working-directory: samples/wasm-c-api
 
   build_samples_others:
@@ -414,11 +420,11 @@ jobs:
         os: [ubuntu-20.04, ubuntu-22.04]
         wasi_sdk_release:
           [
-            "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-20/wasi-sdk-20.0-linux.tar.gz"
+            "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-20/wasi-sdk-20.0-linux.tar.gz",
           ]
         wabt_release:
           [
-            "https://github.com/WebAssembly/wabt/releases/download/1.0.31/wabt-1.0.31-ubuntu.tar.gz"
+            "https://github.com/WebAssembly/wabt/releases/download/1.0.31/wabt-1.0.31-ubuntu.tar.gz",
           ]
     steps:
       - name: checkout
@@ -505,7 +511,7 @@ jobs:
         build_iwasm,
         build_llvm_libraries_on_ubuntu_2004,
         build_llvm_libraries_on_ubuntu_2204,
-        build_wamrc
+        build_wamrc,
       ]
     runs-on: ${{ matrix.os }}
     strategy:
@@ -530,7 +536,7 @@ jobs:
           ]
         wasi_sdk_release:
           [
-            "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-20/wasi-sdk-20.0-linux.tar.gz"
+            "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-20/wasi-sdk-20.0-linux.tar.gz",
           ]
         include:
           - os: ubuntu-20.04
@@ -551,24 +557,16 @@ jobs:
             test_option: $MULTI_MODULES_TEST_OPTIONS
           - running_mode: "jit"
             test_option: $MULTI_MODULES_TEST_OPTIONS
-          # fast-jit doesn't support multi module, simd, and threads
+          # fast-jit doesn't support multi module, simd
           - running_mode: "fast-jit"
             test_option: $MULTI_MODULES_TEST_OPTIONS
           - running_mode: "fast-jit"
             test_option: $SIMD_TEST_OPTIONS
-          - running_mode: "fast-jit"
-            test_option: $THREADS_TEST_OPTIONS
-          - running_mode: "fast-jit"
-            test_option: $WASI_TEST_OPTIONS
-          # multi-tier-jit doesn't support multi module, simd, and threads
+          # multi-tier-jit doesn't support multi module, simd
           - running_mode: "multi-tier-jit"
             test_option: $MULTI_MODULES_TEST_OPTIONS
           - running_mode: "multi-tier-jit"
             test_option: $SIMD_TEST_OPTIONS
-          - running_mode: "multi-tier-jit"
-            test_option: $THREADS_TEST_OPTIONS
-          - running_mode: "multi-tier-jit"
-            test_option: $WASI_TEST_OPTIONS
     steps:
       - name: checkout
         uses: actions/checkout@v3

+ 4 - 2
.github/workflows/compilation_on_macos.yml

@@ -10,7 +10,8 @@ on:
       - opened
       - synchronize
     paths:
-      - ".github/**"
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_macos.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
@@ -26,7 +27,8 @@ on:
       - main
       - "dev/**"
     paths:
-      - ".github/**"
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_macos.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"

+ 2 - 2
.github/workflows/compilation_on_nuttx.yml

@@ -10,7 +10,7 @@ on:
       - opened
       - synchronize
     paths:
-      - ".github/**"
+      - ".github/workflows/compilation_on_nuttx.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
@@ -26,7 +26,7 @@ on:
       - main
       - "dev/**"
     paths:
-      - ".github/**"
+      - ".github/workflows/compilation_on_nuttx.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"

+ 4 - 2
.github/workflows/compilation_on_sgx.yml

@@ -10,7 +10,8 @@ on:
       - opened
       - synchronize
     paths:
-      - ".github/**"
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_sgx.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
@@ -26,7 +27,8 @@ on:
       - main
       - "dev/**"
     paths:
-      - ".github/**"
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_sgx.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"

+ 2 - 2
.github/workflows/compilation_on_windows.yml

@@ -10,7 +10,7 @@ on:
       - opened
       - synchronize
     paths:
-      - ".github/**"
+      - ".github/workflows/compilation_on_windows.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
@@ -26,7 +26,7 @@ on:
       - main
       - "dev/**"
     paths:
-      - ".github/**"
+      - ".github/workflows/compilation_on_windows.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"

+ 1 - 1
.github/workflows/create_tag.yml

@@ -52,7 +52,7 @@ jobs:
           #
           #
           if [[ -z ${new_ver} ]]; then
-            echo "::error::please indicate the right semantic version in build-scripts/config_common.cmake"
+            echo "::error::please indicate the right semantic version in core/version.h"
             echo "new_ver=''" >> "$GITHUB_OUTPUT"
             echo "new_tag=''" >> "$GITHUB_OUTPUT"
             exit 1

+ 1 - 1
README.md

@@ -7,7 +7,7 @@
 
 **[Guide](https://wamr.gitbook.io/)**  **[Website](https://bytecodealliance.github.io/wamr.dev)**  **[Chat](https://bytecodealliance.zulipchat.com/#narrow/stream/290350-wamr)**
 
-[Build WAMR](./doc/build_wamr.md) | [Build AOT Compiler](./wamr-compiler/README.md) | [Embed WAMR](./doc/embed_wamr.md) | [Export Native API](./doc/export_native_api.md) | [Build Wasm Apps](./doc/build_wasm_app.md) | [Samples](./README.md#samples)
+[Build WAMR](./doc/build_wamr.md) | [Build AOT Compiler](./wamr-compiler/README.md) | [Embed WAMR](./doc/embed_wamr.md) | [Export Native API](./doc/export_native_api.md) | [Build Wasm Apps](./doc/build_wasm_app.md) | [Samples](./samples/README.md)
 
 WebAssembly Micro Runtime (WAMR) is a lightweight standalone WebAssembly (Wasm) runtime with small footprint, high performance and highly configurable features for applications cross from embedded, IoT, edge to Trusted Execution Environment (TEE), smart contract, cloud native and so on. It includes a few parts as below:
 - [**VMcore**](./core/iwasm/): A set of runtime libraries for loading and running Wasm modules. It supports several execution modes including interpreter, Ahead-of-Time compilation(AoT) and Just-in-Time compilation (JIT). The WAMR supports two JIT tiers - Fast JIT, LLVM JIT, and dynamic tier-up from Fast JIT to LLVM JIT.

+ 2 - 2
assembly-script/wamr_app_lib/request.ts

@@ -7,7 +7,7 @@ import * as console from './console'
 import * as timer from './timer'
 
 @external("env", "wasm_response_send")
-declare function wasm_response_send(buffer: ArrayBuffer, size: i32): void;
+declare function wasm_response_send(buffer: ArrayBuffer, size: i32): bool;
 
 @external("env", "wasm_register_resource")
 declare function wasm_register_resource(url: ArrayBuffer): void;
@@ -492,4 +492,4 @@ export function on_response(buffer_offset: i32, size: i32): void {
 
         trans.cb(resp);
     }
-}
+}

+ 7 - 0
build-scripts/config_common.cmake

@@ -341,6 +341,13 @@ if (WAMR_BUILD_WASI_NN EQUAL 1)
       message ("     WASI-NN: GPU enabled")
       add_definitions (-DWASI_NN_ENABLE_GPU=1)
   endif ()
+  if (WAMR_BUILD_WASI_NN_ENABLE_EXT EQUAL 1)
+      message ("     WASI-NN: External Delegation enabled")
+      add_definitions (-DWASI_NN_ENABLE_EXTERNAL_DELEGATE=1)
+  endif ()
+  if (DEFINED WASI_NN_EXT_DELEGATE_PATH)
+      add_definitions (-DWASI_NN_EXT_DELEGATE_PATH="${WASI_NN_EXT_DELEGATE_PATH}")
+  endif ()
 endif ()
 if (WAMR_BUILD_ALLOC_WITH_USER_DATA EQUAL 1)
   add_definitions(-DWASM_MEM_ALLOC_WITH_USER_DATA=1)

+ 0 - 20
build-scripts/runtime_lib.cmake

@@ -101,26 +101,6 @@ if (WAMR_BUILD_LIB_PTHREAD_SEMAPHORE EQUAL 1)
 endif ()
 
 if (WAMR_BUILD_WASI_NN EQUAL 1)
-    if (NOT EXISTS "${WAMR_ROOT_DIR}/core/deps/tensorflow-src")
-        execute_process(COMMAND ${WAMR_ROOT_DIR}/core/deps/install_tensorflow.sh
-                        RESULT_VARIABLE TENSORFLOW_RESULT
-        )
-    else ()
-        message("Tensorflow is already downloaded.")
-    endif()
-    set(TENSORFLOW_SOURCE_DIR "${WAMR_ROOT_DIR}/core/deps/tensorflow-src")
-
-    if (WASI_NN_ENABLE_GPU EQUAL 1)
-        # Tensorflow specific:
-        # * https://www.tensorflow.org/lite/guide/build_cmake#available_options_to_build_tensorflow_lite
-        set (TFLITE_ENABLE_GPU ON)
-    endif ()
-
-    include_directories (${CMAKE_CURRENT_BINARY_DIR}/flatbuffers/include)
-    include_directories (${TENSORFLOW_SOURCE_DIR})
-    add_subdirectory(
-        "${TENSORFLOW_SOURCE_DIR}/tensorflow/lite"
-        "${CMAKE_CURRENT_BINARY_DIR}/tensorflow-lite" EXCLUDE_FROM_ALL)
     include (${IWASM_DIR}/libraries/wasi-nn/wasi_nn.cmake)
 endif ()
 

+ 14 - 1
core/iwasm/common/wasm_c_api.c

@@ -23,6 +23,9 @@
 #if WASM_ENABLE_WASM_CACHE != 0
 #include <openssl/sha.h>
 #endif
+#if WASM_ENABLE_THREAD_MGR != 0
+#include "thread_manager.h"
+#endif
 
 /*
  * Thread Model:
@@ -3315,7 +3318,17 @@ wasm_func_call(const wasm_func_t *func, const wasm_val_vec_t *params,
         goto failed;
     }
 
-    exec_env = wasm_runtime_get_exec_env_singleton(func->inst_comm_rt);
+#ifdef OS_ENABLE_HW_BOUND_CHECK
+    exec_env = wasm_runtime_get_exec_env_tls();
+#endif
+#if WASM_ENABLE_THREAD_MGR != 0
+    if (!exec_env) {
+        exec_env = wasm_clusters_search_exec_env(func->inst_comm_rt);
+    }
+#endif
+    if (!exec_env) {
+        exec_env = wasm_runtime_get_exec_env_singleton(func->inst_comm_rt);
+    }
     if (!exec_env) {
         goto failed;
     }

+ 2 - 2
core/iwasm/compilation/aot_emit_control.c

@@ -701,7 +701,7 @@ check_suspend_flags(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
     if (!(terminate_flags =
               LLVMBuildLoad2(comp_ctx->builder, I32_TYPE, terminate_addr,
                              "terminate_flags"))) {
-        aot_set_last_error("llvm build bit cast failed");
+        aot_set_last_error("llvm build LOAD failed");
         return false;
     }
     /* Set terminate_flags memory accecc to volatile, so that the value
@@ -729,7 +729,7 @@ check_suspend_flags(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
         goto fail;
     }
 
-    /* Move builder to terminate block */
+    /* Move builder to non terminate block */
     SET_BUILDER_POS(non_terminate_block);
     return true;
 

+ 4 - 2
core/iwasm/compilation/aot_llvm.c

@@ -4,6 +4,7 @@
  */
 
 #include "aot_llvm.h"
+#include "aot_llvm_extra2.h"
 #include "aot_compiler.h"
 #include "aot_emit_exception.h"
 #include "../aot/aot_runtime.h"
@@ -2055,9 +2056,10 @@ aot_create_comp_context(AOTCompData *comp_data, aot_comp_option_t option)
             code_model = LLVMCodeModelSmall;
 
         /* Create the target machine */
-        if (!(comp_ctx->target_machine = LLVMCreateTargetMachine(
+        if (!(comp_ctx->target_machine = LLVMCreateTargetMachineWithOpts(
                   target, triple_norm, cpu, features, opt_level,
-                  LLVMRelocStatic, code_model))) {
+                  LLVMRelocStatic, code_model, false,
+                  option->stack_usage_file))) {
             aot_set_last_error("create LLVM target machine failed.");
             goto fail;
         }

+ 1 - 0
core/iwasm/compilation/aot_llvm.h

@@ -415,6 +415,7 @@ typedef struct AOTCompOption {
     uint32 stack_bounds_checks;
     char **custom_sections;
     uint32 custom_sections_count;
+    const char *stack_usage_file;
 } AOTCompOption, *aot_comp_option_t;
 
 bool

+ 108 - 0
core/iwasm/compilation/aot_llvm_extra2.cpp

@@ -0,0 +1,108 @@
+/*
+ * Copyright (c)2023 YAMAMOTO Takashi.  All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include <llvm-c/TargetMachine.h>
+#include <llvm/MC/TargetRegistry.h>
+#include <llvm/Target/TargetMachine.h>
+
+#include "bh_assert.h"
+
+#include "aot_llvm_extra2.h"
+
+static llvm::Optional<llvm::Reloc::Model>
+convert(LLVMRelocMode reloc_mode)
+{
+    switch (reloc_mode) {
+        case LLVMRelocDefault:
+            return llvm::None;
+        case LLVMRelocStatic:
+            return llvm::Reloc::Static;
+        case LLVMRelocPIC:
+            return llvm::Reloc::PIC_;
+        case LLVMRelocDynamicNoPic:
+            return llvm::Reloc::DynamicNoPIC;
+        case LLVMRelocROPI:
+            return llvm::Reloc::ROPI;
+        case LLVMRelocRWPI:
+            return llvm::Reloc::RWPI;
+        case LLVMRelocROPI_RWPI:
+            return llvm::Reloc::ROPI_RWPI;
+    }
+    bh_assert(0);
+    return llvm::None;
+}
+
+static llvm::CodeGenOpt::Level
+convert(LLVMCodeGenOptLevel opt_level)
+{
+    switch (opt_level) {
+        case LLVMCodeGenLevelNone:
+            return llvm::CodeGenOpt::None;
+        case LLVMCodeGenLevelLess:
+            return llvm::CodeGenOpt::Less;
+        case LLVMCodeGenLevelDefault:
+            return llvm::CodeGenOpt::Default;
+        case LLVMCodeGenLevelAggressive:
+            return llvm::CodeGenOpt::Aggressive;
+    }
+    bh_assert(0);
+    return llvm::CodeGenOpt::None;
+}
+
+static llvm::Optional<llvm::CodeModel::Model>
+convert(LLVMCodeModel code_model, bool *jit)
+{
+    *jit = false;
+    switch (code_model) {
+        case LLVMCodeModelDefault:
+            return llvm::None;
+        case LLVMCodeModelJITDefault:
+            *jit = true;
+            return llvm::None;
+        case LLVMCodeModelTiny:
+            return llvm::CodeModel::Tiny;
+        case LLVMCodeModelSmall:
+            return llvm::CodeModel::Small;
+        case LLVMCodeModelKernel:
+            return llvm::CodeModel::Kernel;
+        case LLVMCodeModelMedium:
+            return llvm::CodeModel::Medium;
+        case LLVMCodeModelLarge:
+            return llvm::CodeModel::Large;
+    }
+    bh_assert(0);
+    return llvm::None;
+}
+
+LLVMTargetMachineRef
+LLVMCreateTargetMachineWithOpts(LLVMTargetRef ctarget, const char *triple,
+                                const char *cpu, const char *features,
+                                LLVMCodeGenOptLevel opt_level,
+                                LLVMRelocMode reloc_mode,
+                                LLVMCodeModel code_model,
+                                bool EmitStackSizeSection,
+                                const char *StackUsageOutput)
+{
+    llvm::TargetOptions opts;
+
+    // -fstack-size-section equiv
+    // emit it to ".stack_sizes" section in case of ELF
+    // you can read it with "llvm-readobj --stack-sizes"
+    opts.EmitStackSizeSection = EmitStackSizeSection;
+
+    // -fstack-usage equiv
+    if (StackUsageOutput != NULL) {
+        opts.StackUsageOutput = StackUsageOutput;
+    }
+
+    auto target = reinterpret_cast<llvm::Target *>(ctarget);
+    auto rm = convert(reloc_mode);
+    auto ol = convert(opt_level);
+    bool jit;
+    auto cm = convert(code_model, &jit);
+    auto targetmachine = target->createTargetMachine(triple, cpu, features,
+                                                     opts, rm, cm, ol, jit);
+    return reinterpret_cast<LLVMTargetMachineRef>(targetmachine);
+}

+ 17 - 0
core/iwasm/compilation/aot_llvm_extra2.h

@@ -0,0 +1,17 @@
+/*
+ * Copyright (c)2023 YAMAMOTO Takashi.  All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include <llvm-c/TargetMachine.h>
+
+LLVM_C_EXTERN_C_BEGIN
+LLVMTargetMachineRef
+LLVMCreateTargetMachineWithOpts(LLVMTargetRef ctarget, const char *triple,
+                                const char *cpu, const char *features,
+                                LLVMCodeGenOptLevel opt_level,
+                                LLVMRelocMode reloc_mode,
+                                LLVMCodeModel code_model,
+                                bool EmitStackSizeSection,
+                                const char *StackUsageOutput);
+LLVM_C_EXTERN_C_END

Разница между файлами не показана из-за своего большого размера
+ 1455 - 129
core/iwasm/fast-jit/cg/x86-64/jit_codegen_x86_64.cpp


+ 55 - 0
core/iwasm/fast-jit/fe/jit_emit_control.c

@@ -904,6 +904,42 @@ check_copy_arities(const JitBlock *block_dst, JitFrame *jit_frame)
     }
 }
 
+#if WASM_ENABLE_THREAD_MGR != 0
+bool
+jit_check_suspend_flags(JitCompContext *cc)
+{
+    JitReg exec_env, suspend_flags, terminate_flag, offset;
+    JitBasicBlock *terminate_block, *cur_basic_block;
+    JitFrame *jit_frame = cc->jit_frame;
+
+    cur_basic_block = cc->cur_basic_block;
+    terminate_block = jit_cc_new_basic_block(cc, 0);
+    if (!terminate_block) {
+        return false;
+    }
+
+    gen_commit_values(jit_frame, jit_frame->lp, jit_frame->sp);
+    exec_env = cc->exec_env_reg;
+    suspend_flags = jit_cc_new_reg_I32(cc);
+    terminate_flag = jit_cc_new_reg_I32(cc);
+
+    offset = jit_cc_new_const_I32(cc, offsetof(WASMExecEnv, suspend_flags));
+    GEN_INSN(LDI32, suspend_flags, exec_env, offset);
+    GEN_INSN(AND, terminate_flag, suspend_flags, NEW_CONST(I32, 1));
+
+    GEN_INSN(CMP, cc->cmp_reg, terminate_flag, NEW_CONST(I32, 0));
+    GEN_INSN(BNE, cc->cmp_reg, jit_basic_block_label(terminate_block), 0);
+
+    cc->cur_basic_block = terminate_block;
+    GEN_INSN(RETURN, NEW_CONST(I32, 0));
+
+    cc->cur_basic_block = cur_basic_block;
+
+    return true;
+}
+
+#endif
+
 static bool
 handle_op_br(JitCompContext *cc, uint32 br_depth, uint8 **p_frame_ip)
 {
@@ -986,6 +1022,13 @@ fail:
 bool
 jit_compile_op_br(JitCompContext *cc, uint32 br_depth, uint8 **p_frame_ip)
 {
+
+#if WASM_ENABLE_THREAD_MGR != 0
+    /* Insert suspend check point */
+    if (!jit_check_suspend_flags(cc))
+        return false;
+#endif
+
     return handle_op_br(cc, br_depth, p_frame_ip)
            && handle_next_reachable_block(cc, p_frame_ip);
 }
@@ -1105,6 +1148,12 @@ jit_compile_op_br_if(JitCompContext *cc, uint32 br_depth,
         jit_insn_delete(insn_select);
     }
 
+#if WASM_ENABLE_THREAD_MGR != 0
+    /* Insert suspend check point */
+    if (!jit_check_suspend_flags(cc))
+        return false;
+#endif
+
     SET_BUILDER_POS(if_basic_block);
     SET_BB_BEGIN_BCIP(if_basic_block, *p_frame_ip - 1);
 
@@ -1144,6 +1193,12 @@ jit_compile_op_br_table(JitCompContext *cc, uint32 *br_depths, uint32 br_count,
     uint32 i = 0;
     JitOpndLookupSwitch *opnd = NULL;
 
+#if WASM_ENABLE_THREAD_MGR != 0
+    /* Insert suspend check point */
+    if (!jit_check_suspend_flags(cc))
+        return false;
+#endif
+
     cur_basic_block = cc->cur_basic_block;
 
     POP_I32(value);

+ 31 - 0
core/iwasm/fast-jit/fe/jit_emit_function.c

@@ -5,6 +5,7 @@
 
 #include "jit_emit_function.h"
 #include "jit_emit_exception.h"
+#include "jit_emit_control.h"
 #include "../jit_frontend.h"
 #include "../jit_codegen.h"
 #include "../../interpreter/wasm_runtime.h"
@@ -232,6 +233,12 @@ jit_compile_op_call(JitCompContext *cc, uint32 func_idx, bool tail_call)
     bool is_pointer_arg;
     bool return_value = false;
 
+#if WASM_ENABLE_THREAD_MGR != 0
+    /* Insert suspend check point */
+    if (!jit_check_suspend_flags(cc))
+        goto fail;
+#endif
+
     if (func_idx < wasm_module->import_function_count) {
         /* The function to call is an import function */
         func_import = &wasm_module->import_functions[func_idx].u.function;
@@ -275,6 +282,12 @@ jit_compile_op_call(JitCompContext *cc, uint32 func_idx, bool tail_call)
                 goto fail;
             }
 
+#if WASM_ENABLE_THREAD_MGR != 0
+            /* Insert suspend check point */
+            if (!jit_check_suspend_flags(cc))
+                goto fail;
+#endif
+
             return true;
         }
 
@@ -416,6 +429,12 @@ jit_compile_op_call(JitCompContext *cc, uint32 func_idx, bool tail_call)
         }
     }
 
+#if WASM_ENABLE_THREAD_MGR != 0
+    /* Insert suspend check point */
+    if (!jit_check_suspend_flags(cc))
+        goto fail;
+#endif
+
     /* Clear part of memory regs and table regs as their values
        may be changed in the function call */
     if (cc->cur_wasm_module->possible_memory_grow)
@@ -540,6 +559,12 @@ jit_compile_op_call_indirect(JitCompContext *cc, uint32 type_idx,
     GEN_INSN(STI32, func_idx, cc->exec_env_reg,
              NEW_CONST(I32, offsetof(WASMExecEnv, jit_cache) + 4));
 
+#if WASM_ENABLE_THREAD_MGR != 0
+    /* Insert suspend check point */
+    if (!jit_check_suspend_flags(cc))
+        goto fail;
+#endif
+
     block_import = jit_cc_new_basic_block(cc, 0);
     block_nonimport = jit_cc_new_basic_block(cc, 0);
     func_return = jit_cc_new_basic_block(cc, 0);
@@ -742,6 +767,12 @@ jit_compile_op_call_indirect(JitCompContext *cc, uint32 type_idx,
         goto fail;
     }
 
+#if WASM_ENABLE_THREAD_MGR != 0
+    /* Insert suspend check point */
+    if (!jit_check_suspend_flags(cc))
+        goto fail;
+#endif
+
     /* Clear part of memory regs and table regs as their values
        may be changed in the function call */
     if (cc->cur_wasm_module->possible_memory_grow)

+ 346 - 28
core/iwasm/fast-jit/fe/jit_emit_memory.c

@@ -9,6 +9,7 @@
 #include "../jit_frontend.h"
 #include "../jit_codegen.h"
 #include "../../interpreter/wasm_runtime.h"
+#include "jit_emit_control.h"
 
 #ifndef OS_ENABLE_HW_BOUND_CHECK
 static JitReg
@@ -60,6 +61,14 @@ fail:
 }
 #endif
 
+#if WASM_ENABLE_SHARED_MEMORY != 0
+static void
+set_load_or_store_atomic(JitInsn *load_or_store_inst)
+{
+    load_or_store_inst->flags_u8 |= 0x1;
+}
+#endif
+
 #if UINTPTR_MAX == UINT64_MAX
 static JitReg
 check_and_seek_on_64bit_platform(JitCompContext *cc, JitReg addr, JitReg offset,
@@ -177,23 +186,36 @@ fail:
     return 0;
 }
 
-#define CHECK_ALIGNMENT(maddr, memory_data, offset1)                   \
+#if UINTPTR_MAX == UINT64_MAX
+#define CHECK_ALIGNMENT(offset1)                                       \
     do {                                                               \
-        GEN_INSN(ADD, maddr, memory_data, offset1);                    \
         JitReg align_mask = NEW_CONST(I64, ((uint64)1 << align) - 1);  \
         JitReg AND_res = jit_cc_new_reg_I64(cc);                       \
-        GEN_INSN(AND, AND_res, maddr, align_mask);                     \
+        GEN_INSN(AND, AND_res, offset1, align_mask);                   \
         GEN_INSN(CMP, cc->cmp_reg, AND_res, NEW_CONST(I64, 0));        \
         if (!jit_emit_exception(cc, EXCE_UNALIGNED_ATOMIC, JIT_OP_BNE, \
                                 cc->cmp_reg, NULL))                    \
             goto fail;                                                 \
     } while (0)
+#else
+#define CHECK_ALIGNMENT(offset1)                                       \
+    do {                                                               \
+        JitReg align_mask = NEW_CONST(I32, (1 << align) - 1);          \
+        JitReg AND_res = jit_cc_new_reg_I32(cc);                       \
+        GEN_INSN(AND, AND_res, offset1, align_mask);                   \
+        GEN_INSN(CMP, cc->cmp_reg, AND_res, NEW_CONST(I32, 0));        \
+        if (!jit_emit_exception(cc, EXCE_UNALIGNED_ATOMIC, JIT_OP_BNE, \
+                                cc->cmp_reg, NULL))                    \
+            goto fail;                                                 \
+    } while (0)
+#endif
 
 bool
 jit_compile_op_i32_load(JitCompContext *cc, uint32 align, uint32 offset,
                         uint32 bytes, bool sign, bool atomic)
 {
     JitReg addr, offset1, value, memory_data;
+    JitInsn *load_insn = NULL;
 
     POP_I32(addr);
 
@@ -201,6 +223,11 @@ jit_compile_op_i32_load(JitCompContext *cc, uint32 align, uint32 offset,
     if (!offset1) {
         goto fail;
     }
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    if (atomic) {
+        CHECK_ALIGNMENT(offset1);
+    }
+#endif
 
     memory_data = get_memory_data_reg(cc->jit_frame, 0);
 
@@ -209,30 +236,30 @@ jit_compile_op_i32_load(JitCompContext *cc, uint32 align, uint32 offset,
         case 1:
         {
             if (sign) {
-                GEN_INSN(LDI8, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDI8, value, memory_data, offset1);
             }
             else {
-                GEN_INSN(LDU8, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDU8, value, memory_data, offset1);
             }
             break;
         }
         case 2:
         {
             if (sign) {
-                GEN_INSN(LDI16, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDI16, value, memory_data, offset1);
             }
             else {
-                GEN_INSN(LDU16, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDU16, value, memory_data, offset1);
             }
             break;
         }
         case 4:
         {
             if (sign) {
-                GEN_INSN(LDI32, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDI32, value, memory_data, offset1);
             }
             else {
-                GEN_INSN(LDU32, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDU32, value, memory_data, offset1);
             }
             break;
         }
@@ -243,6 +270,13 @@ jit_compile_op_i32_load(JitCompContext *cc, uint32 align, uint32 offset,
         }
     }
 
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    if (atomic && load_insn)
+        set_load_or_store_atomic(load_insn);
+#else
+    (void)load_insn;
+#endif
+
     PUSH_I32(value);
     return true;
 fail:
@@ -254,6 +288,7 @@ jit_compile_op_i64_load(JitCompContext *cc, uint32 align, uint32 offset,
                         uint32 bytes, bool sign, bool atomic)
 {
     JitReg addr, offset1, value, memory_data;
+    JitInsn *load_insn = NULL;
 
     POP_I32(addr);
 
@@ -261,6 +296,11 @@ jit_compile_op_i64_load(JitCompContext *cc, uint32 align, uint32 offset,
     if (!offset1) {
         goto fail;
     }
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    if (atomic) {
+        CHECK_ALIGNMENT(offset1);
+    }
+#endif
 
     memory_data = get_memory_data_reg(cc->jit_frame, 0);
 
@@ -269,40 +309,40 @@ jit_compile_op_i64_load(JitCompContext *cc, uint32 align, uint32 offset,
         case 1:
         {
             if (sign) {
-                GEN_INSN(LDI8, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDI8, value, memory_data, offset1);
             }
             else {
-                GEN_INSN(LDU8, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDU8, value, memory_data, offset1);
             }
             break;
         }
         case 2:
         {
             if (sign) {
-                GEN_INSN(LDI16, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDI16, value, memory_data, offset1);
             }
             else {
-                GEN_INSN(LDU16, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDU16, value, memory_data, offset1);
             }
             break;
         }
         case 4:
         {
             if (sign) {
-                GEN_INSN(LDI32, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDI32, value, memory_data, offset1);
             }
             else {
-                GEN_INSN(LDU32, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDU32, value, memory_data, offset1);
             }
             break;
         }
         case 8:
         {
             if (sign) {
-                GEN_INSN(LDI64, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDI64, value, memory_data, offset1);
             }
             else {
-                GEN_INSN(LDU64, value, memory_data, offset1);
+                load_insn = GEN_INSN(LDU64, value, memory_data, offset1);
             }
             break;
         }
@@ -313,6 +353,13 @@ jit_compile_op_i64_load(JitCompContext *cc, uint32 align, uint32 offset,
         }
     }
 
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    if (atomic && load_insn)
+        set_load_or_store_atomic(load_insn);
+#else
+    (void)load_insn;
+#endif
+
     PUSH_I64(value);
     return true;
 fail:
@@ -370,6 +417,7 @@ jit_compile_op_i32_store(JitCompContext *cc, uint32 align, uint32 offset,
                          uint32 bytes, bool atomic)
 {
     JitReg value, addr, offset1, memory_data;
+    JitInsn *store_insn = NULL;
 
     POP_I32(value);
     POP_I32(addr);
@@ -378,23 +426,28 @@ jit_compile_op_i32_store(JitCompContext *cc, uint32 align, uint32 offset,
     if (!offset1) {
         goto fail;
     }
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    if (atomic) {
+        CHECK_ALIGNMENT(offset1);
+    }
+#endif
 
     memory_data = get_memory_data_reg(cc->jit_frame, 0);
 
     switch (bytes) {
         case 1:
         {
-            GEN_INSN(STI8, value, memory_data, offset1);
+            store_insn = GEN_INSN(STI8, value, memory_data, offset1);
             break;
         }
         case 2:
         {
-            GEN_INSN(STI16, value, memory_data, offset1);
+            store_insn = GEN_INSN(STI16, value, memory_data, offset1);
             break;
         }
         case 4:
         {
-            GEN_INSN(STI32, value, memory_data, offset1);
+            store_insn = GEN_INSN(STI32, value, memory_data, offset1);
             break;
         }
         default:
@@ -403,6 +456,12 @@ jit_compile_op_i32_store(JitCompContext *cc, uint32 align, uint32 offset,
             goto fail;
         }
     }
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    if (atomic && store_insn)
+        set_load_or_store_atomic(store_insn);
+#else
+    (void)store_insn;
+#endif
 
     return true;
 fail:
@@ -414,6 +473,7 @@ jit_compile_op_i64_store(JitCompContext *cc, uint32 align, uint32 offset,
                          uint32 bytes, bool atomic)
 {
     JitReg value, addr, offset1, memory_data;
+    JitInsn *store_insn = NULL;
 
     POP_I64(value);
     POP_I32(addr);
@@ -422,6 +482,11 @@ jit_compile_op_i64_store(JitCompContext *cc, uint32 align, uint32 offset,
     if (!offset1) {
         goto fail;
     }
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    if (atomic) {
+        CHECK_ALIGNMENT(offset1);
+    }
+#endif
 
     if (jit_reg_is_const(value) && bytes < 8) {
         value = NEW_CONST(I32, (int32)jit_cc_get_const_I64(cc, value));
@@ -432,22 +497,22 @@ jit_compile_op_i64_store(JitCompContext *cc, uint32 align, uint32 offset,
     switch (bytes) {
         case 1:
         {
-            GEN_INSN(STI8, value, memory_data, offset1);
+            store_insn = GEN_INSN(STI8, value, memory_data, offset1);
             break;
         }
         case 2:
         {
-            GEN_INSN(STI16, value, memory_data, offset1);
+            store_insn = GEN_INSN(STI16, value, memory_data, offset1);
             break;
         }
         case 4:
         {
-            GEN_INSN(STI32, value, memory_data, offset1);
+            store_insn = GEN_INSN(STI32, value, memory_data, offset1);
             break;
         }
         case 8:
         {
-            GEN_INSN(STI64, value, memory_data, offset1);
+            store_insn = GEN_INSN(STI64, value, memory_data, offset1);
             break;
         }
         default:
@@ -456,6 +521,12 @@ jit_compile_op_i64_store(JitCompContext *cc, uint32 align, uint32 offset,
             goto fail;
         }
     }
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    if (atomic && store_insn)
+        set_load_or_store_atomic(store_insn);
+#else
+    (void)store_insn;
+#endif
 
     return true;
 fail:
@@ -774,10 +845,153 @@ fail:
 #endif
 
 #if WASM_ENABLE_SHARED_MEMORY != 0
+#define GEN_AT_RMW_INSN(op, op_type, bytes, result, value, memory_data,       \
+                        offset1)                                              \
+    do {                                                                      \
+        switch (bytes) {                                                      \
+            case 1:                                                           \
+            {                                                                 \
+                insn = GEN_INSN(AT_##op##U8, result, value, memory_data,      \
+                                offset1);                                     \
+                break;                                                        \
+            }                                                                 \
+            case 2:                                                           \
+            {                                                                 \
+                insn = GEN_INSN(AT_##op##U16, result, value, memory_data,     \
+                                offset1);                                     \
+                break;                                                        \
+            }                                                                 \
+            case 4:                                                           \
+            {                                                                 \
+                if (op_type == VALUE_TYPE_I32)                                \
+                    insn = GEN_INSN(AT_##op##I32, result, value, memory_data, \
+                                    offset1);                                 \
+                else                                                          \
+                    insn = GEN_INSN(AT_##op##U32, result, value, memory_data, \
+                                    offset1);                                 \
+                break;                                                        \
+            }                                                                 \
+            case 8:                                                           \
+            {                                                                 \
+                insn = GEN_INSN(AT_##op##I64, result, value, memory_data,     \
+                                offset1);                                     \
+                break;                                                        \
+            }                                                                 \
+            default:                                                          \
+            {                                                                 \
+                bh_assert(0);                                                 \
+                goto fail;                                                    \
+            }                                                                 \
+        }                                                                     \
+    } while (0)
+
 bool
 jit_compile_op_atomic_rmw(JitCompContext *cc, uint8 atomic_op, uint8 op_type,
                           uint32 align, uint32 offset, uint32 bytes)
 {
+    JitReg addr, offset1, memory_data, value, result, eax_hreg, rax_hreg,
+        ebx_hreg, rbx_hreg;
+    JitInsn *insn = NULL;
+    bool is_i32 = op_type == VALUE_TYPE_I32;
+    bool is_logical_op = atomic_op == AtomicRMWBinOpAnd
+                         || atomic_op == AtomicRMWBinOpOr
+                         || atomic_op == AtomicRMWBinOpXor;
+
+    /* currently we only implement atomic rmw on x86-64 target */
+#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)
+
+    /* For atomic logical binary ops, it implicitly uses rax in cmpxchg
+     * instruction and implicitly uses rbx for storing temp value in the
+     * generated loop */
+    eax_hreg = jit_codegen_get_hreg_by_name("eax");
+    rax_hreg = jit_codegen_get_hreg_by_name("rax");
+    ebx_hreg = jit_codegen_get_hreg_by_name("ebx");
+    rbx_hreg = jit_codegen_get_hreg_by_name("rbx");
+
+    bh_assert(op_type == VALUE_TYPE_I32 || op_type == VALUE_TYPE_I64);
+    if (op_type == VALUE_TYPE_I32) {
+        POP_I32(value);
+    }
+    else {
+        POP_I64(value);
+    }
+    POP_I32(addr);
+
+    offset1 = check_and_seek(cc, addr, offset, bytes);
+    if (!offset1) {
+        goto fail;
+    }
+    CHECK_ALIGNMENT(offset1);
+
+    memory_data = get_memory_data_reg(cc->jit_frame, 0);
+
+    if (op_type == VALUE_TYPE_I32)
+        result = jit_cc_new_reg_I32(cc);
+    else
+        result = jit_cc_new_reg_I64(cc);
+
+    switch (atomic_op) {
+        case AtomicRMWBinOpAdd:
+        {
+            GEN_AT_RMW_INSN(ADD, op_type, bytes, result, value, memory_data,
+                            offset1);
+            break;
+        }
+        case AtomicRMWBinOpSub:
+        {
+            GEN_AT_RMW_INSN(SUB, op_type, bytes, result, value, memory_data,
+                            offset1);
+            break;
+        }
+        case AtomicRMWBinOpAnd:
+        {
+            GEN_AT_RMW_INSN(AND, op_type, bytes, result, value, memory_data,
+                            offset1);
+            break;
+        }
+        case AtomicRMWBinOpOr:
+        {
+            GEN_AT_RMW_INSN(OR, op_type, bytes, result, value, memory_data,
+                            offset1);
+            break;
+        }
+        case AtomicRMWBinOpXor:
+        {
+            GEN_AT_RMW_INSN(XOR, op_type, bytes, result, value, memory_data,
+                            offset1);
+            break;
+        }
+        case AtomicRMWBinOpXchg:
+        {
+            GEN_AT_RMW_INSN(XCHG, op_type, bytes, result, value, memory_data,
+                            offset1);
+            break;
+        }
+        default:
+        {
+            bh_assert(0);
+            goto fail;
+        }
+    }
+
+    if (is_logical_op
+        && (!insn
+            || !jit_lock_reg_in_insn(cc, insn, is_i32 ? eax_hreg : rax_hreg)
+            || !jit_lock_reg_in_insn(cc, insn, is_i32 ? ebx_hreg : rbx_hreg))) {
+        jit_set_last_error(
+            cc, "generate atomic logical insn or lock ra&rb hreg failed");
+        goto fail;
+    }
+
+    if (op_type == VALUE_TYPE_I32)
+        PUSH_I32(result);
+    else
+        PUSH_I64(result);
+
+    return true;
+#endif /* defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64) */
+
+fail:
     return false;
 }
 
@@ -785,6 +999,93 @@ bool
 jit_compile_op_atomic_cmpxchg(JitCompContext *cc, uint8 op_type, uint32 align,
                               uint32 offset, uint32 bytes)
 {
+    JitReg addr, offset1, memory_data, value, expect, result;
+    bool is_i32 = op_type == VALUE_TYPE_I32;
+    /* currently we only implement atomic cmpxchg on x86-64 target */
+#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)
+    /* cmpxchg will use register al/ax/eax/rax to store parameter expected
+     * value, and the read result will also be stored to al/ax/eax/rax */
+    JitReg eax_hreg = jit_codegen_get_hreg_by_name("eax");
+    JitReg rax_hreg = jit_codegen_get_hreg_by_name("rax");
+    JitInsn *insn = NULL;
+
+    bh_assert(op_type == VALUE_TYPE_I32 || op_type == VALUE_TYPE_I64);
+    if (is_i32) {
+        POP_I32(value);
+        POP_I32(expect);
+        result = jit_cc_new_reg_I32(cc);
+    }
+    else {
+        POP_I64(value);
+        POP_I64(expect);
+        result = jit_cc_new_reg_I64(cc);
+    }
+    POP_I32(addr);
+
+    offset1 = check_and_seek(cc, addr, offset, bytes);
+    if (!offset1) {
+        goto fail;
+    }
+    CHECK_ALIGNMENT(offset1);
+
+    memory_data = get_memory_data_reg(cc->jit_frame, 0);
+
+    GEN_INSN(MOV, is_i32 ? eax_hreg : rax_hreg, expect);
+    switch (bytes) {
+        case 1:
+        {
+            insn = GEN_INSN(AT_CMPXCHGU8, value, is_i32 ? eax_hreg : rax_hreg,
+                            memory_data, offset1);
+            break;
+        }
+        case 2:
+        {
+            insn = GEN_INSN(AT_CMPXCHGU16, value, is_i32 ? eax_hreg : rax_hreg,
+                            memory_data, offset1);
+            break;
+        }
+        case 4:
+        {
+            if (op_type == VALUE_TYPE_I32)
+                insn =
+                    GEN_INSN(AT_CMPXCHGI32, value, is_i32 ? eax_hreg : rax_hreg,
+                             memory_data, offset1);
+            else
+                insn =
+                    GEN_INSN(AT_CMPXCHGU32, value, is_i32 ? eax_hreg : rax_hreg,
+                             memory_data, offset1);
+            break;
+        }
+        case 8:
+        {
+            insn = GEN_INSN(AT_CMPXCHGI64, value, is_i32 ? eax_hreg : rax_hreg,
+                            memory_data, offset1);
+            break;
+        }
+        default:
+        {
+            bh_assert(0);
+            goto fail;
+        }
+    }
+
+    if (!insn
+        || !jit_lock_reg_in_insn(cc, insn, is_i32 ? eax_hreg : rax_hreg)) {
+        jit_set_last_error(cc, "generate cmpxchg insn or lock ra hreg failed");
+        goto fail;
+    }
+
+    GEN_INSN(MOV, result, is_i32 ? eax_hreg : rax_hreg);
+
+    if (is_i32)
+        PUSH_I32(result);
+    else
+        PUSH_I64(result);
+
+    return true;
+#endif /* defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64) */
+
+fail:
     return false;
 }
 
@@ -812,8 +1113,10 @@ jit_compile_op_atomic_wait(JitCompContext *cc, uint8 op_type, uint32 align,
     JitReg offset1 = check_and_seek(cc, addr, offset, bytes);
     if (!offset1)
         goto fail;
-    JitReg maddr = jit_cc_new_reg_I64(cc);
-    CHECK_ALIGNMENT(maddr, memory_data, offset1);
+    CHECK_ALIGNMENT(offset1);
+
+    JitReg maddr = jit_cc_new_reg_ptr(cc);
+    GEN_INSN(ADD, maddr, memory_data, offset1);
 
     // Prepare `wasm_runtime_atomic_wait` arguments
     JitReg res = jit_cc_new_reg_I32(cc);
@@ -835,6 +1138,12 @@ jit_compile_op_atomic_wait(JitCompContext *cc, uint8 op_type, uint32 align,
         goto fail;
 
     PUSH_I32(res);
+
+#if WASM_ENABLE_THREAD_MGR != 0
+    /* Insert suspend check point */
+    if (!jit_check_suspend_flags(cc))
+        goto fail;
+#endif
     return true;
 fail:
     return false;
@@ -854,8 +1163,10 @@ jit_compiler_op_atomic_notify(JitCompContext *cc, uint32 align, uint32 offset,
     JitReg offset1 = check_and_seek(cc, addr, offset, bytes);
     if (!offset1)
         goto fail;
-    JitReg maddr = jit_cc_new_reg_I64(cc);
-    CHECK_ALIGNMENT(maddr, memory_data, offset1);
+    CHECK_ALIGNMENT(offset1);
+
+    JitReg maddr = jit_cc_new_reg_ptr(cc);
+    GEN_INSN(ADD, maddr, memory_data, offset1);
 
     // Prepare `wasm_runtime_atomic_notify` arguments
     JitReg res = jit_cc_new_reg_I32(cc);
@@ -879,4 +1190,11 @@ jit_compiler_op_atomic_notify(JitCompContext *cc, uint32 align, uint32 offset,
 fail:
     return false;
 }
+
+bool
+jit_compiler_op_atomic_fence(JitCompContext *cc)
+{
+    GEN_INSN(FENCE);
+    return true;
+}
 #endif

+ 3 - 0
core/iwasm/fast-jit/fe/jit_emit_memory.h

@@ -80,6 +80,9 @@ jit_compile_op_atomic_wait(JitCompContext *cc, uint8 op_type, uint32 align,
 bool
 jit_compiler_op_atomic_notify(JitCompContext *cc, uint32 align, uint32 offset,
                               uint32 bytes);
+
+bool
+jit_compiler_op_atomic_fence(JitCompContext *cc);
 #endif
 
 #ifdef __cplusplus

+ 6 - 1
core/iwasm/fast-jit/jit_dump.c

@@ -114,7 +114,10 @@ jit_dump_insn(JitCompContext *cc, JitInsn *insn)
     switch (insn->opcode) {
 #define INSN(NAME, OPND_KIND, OPND_NUM, FIRST_USE)     \
     case JIT_OP_##NAME:                                \
-        os_printf("    %-15s", #NAME);                 \
+        if (insn->flags_u8 & 0x1)                      \
+            os_printf("    ATOMIC %-8s", #NAME);       \
+        else                                           \
+            os_printf("    %-15s", #NAME);             \
         jit_dump_insn_##OPND_KIND(cc, insn, OPND_NUM); \
         break;
 #include "jit_ir.def"
@@ -319,7 +322,9 @@ jit_pass_dump(JitCompContext *cc)
 
     os_printf("JIT.COMPILER.DUMP: PASS_NO=%d PREV_PASS=%s\n\n", pass_no,
               pass_name);
+
     jit_dump_cc(cc);
+
     os_printf("\n");
     return true;
 }

+ 58 - 7
core/iwasm/fast-jit/jit_frontend.c

@@ -223,18 +223,37 @@ get_memory_data_reg(JitFrame *frame, uint32 mem_idx)
 {
     JitCompContext *cc = frame->cc;
     JitReg module_inst_reg = get_module_inst_reg(frame);
-    uint32 memory_data_offset =
-        (uint32)offsetof(WASMModuleInstance, global_table_data.bytes)
-        + (uint32)offsetof(WASMMemoryInstance, memory_data);
+    uint32 memory_data_offset;
 
     bh_assert(mem_idx == 0);
-
+#if WASM_ENABLE_SHARED_MEMORY != 0
+    uint32 memories_offset = (uint32)offsetof(WASMModuleInstance, memories);
+    JitReg memories_addr = jit_cc_new_reg_ptr(cc);
+    JitReg memories_0_addr = jit_cc_new_reg_ptr(cc);
+    memory_data_offset = (uint32)offsetof(WASMMemoryInstance, memory_data);
+    if (!frame->memory_regs[mem_idx].memory_data) {
+        frame->memory_regs[mem_idx].memory_data =
+            cc->memory_regs[mem_idx].memory_data;
+        /* module_inst->memories */
+        GEN_INSN(LDPTR, memories_addr, module_inst_reg,
+                 NEW_CONST(I32, memories_offset));
+        /* module_inst->memories[0] */
+        GEN_INSN(LDPTR, memories_0_addr, memories_addr, NEW_CONST(I32, 0));
+        /* memories[0]->memory_data */
+        GEN_INSN(LDPTR, frame->memory_regs[mem_idx].memory_data,
+                 memories_0_addr, NEW_CONST(I32, memory_data_offset));
+    }
+#else
+    memory_data_offset =
+        (uint32)offsetof(WASMModuleInstance, global_table_data.bytes)
+        + (uint32)offsetof(WASMMemoryInstance, memory_data);
     if (!frame->memory_regs[mem_idx].memory_data) {
         frame->memory_regs[mem_idx].memory_data =
             cc->memory_regs[mem_idx].memory_data;
         GEN_INSN(LDPTR, frame->memory_regs[mem_idx].memory_data,
                  module_inst_reg, NEW_CONST(I32, memory_data_offset));
     }
+#endif
     return frame->memory_regs[mem_idx].memory_data;
 }
 
@@ -1078,6 +1097,39 @@ read_leb(JitCompContext *cc, const uint8 *buf, const uint8 *buf_end,
         res = (int64)res64;                                  \
     } while (0)
 
+#if WASM_ENABLE_SHARED_MEMORY != 0
+#define COMPILE_ATOMIC_RMW(OP, NAME)                  \
+    case WASM_OP_ATOMIC_RMW_I32_##NAME:               \
+        bytes = 4;                                    \
+        op_type = VALUE_TYPE_I32;                     \
+        goto OP_ATOMIC_##OP;                          \
+    case WASM_OP_ATOMIC_RMW_I64_##NAME:               \
+        bytes = 8;                                    \
+        op_type = VALUE_TYPE_I64;                     \
+        goto OP_ATOMIC_##OP;                          \
+    case WASM_OP_ATOMIC_RMW_I32_##NAME##8_U:          \
+        bytes = 1;                                    \
+        op_type = VALUE_TYPE_I32;                     \
+        goto OP_ATOMIC_##OP;                          \
+    case WASM_OP_ATOMIC_RMW_I32_##NAME##16_U:         \
+        bytes = 2;                                    \
+        op_type = VALUE_TYPE_I32;                     \
+        goto OP_ATOMIC_##OP;                          \
+    case WASM_OP_ATOMIC_RMW_I64_##NAME##8_U:          \
+        bytes = 1;                                    \
+        op_type = VALUE_TYPE_I64;                     \
+        goto OP_ATOMIC_##OP;                          \
+    case WASM_OP_ATOMIC_RMW_I64_##NAME##16_U:         \
+        bytes = 2;                                    \
+        op_type = VALUE_TYPE_I64;                     \
+        goto OP_ATOMIC_##OP;                          \
+    case WASM_OP_ATOMIC_RMW_I64_##NAME##32_U:         \
+        bytes = 4;                                    \
+        op_type = VALUE_TYPE_I64;                     \
+        OP_ATOMIC_##OP : bin_op = AtomicRMWBinOp##OP; \
+        goto build_atomic_rmw;
+#endif
+
 static bool
 jit_compile_func(JitCompContext *cc)
 {
@@ -2096,6 +2148,8 @@ jit_compile_func(JitCompContext *cc)
                     case WASM_OP_ATOMIC_FENCE:
                         /* Skip memory index */
                         frame_ip++;
+                        if (!jit_compiler_op_atomic_fence(cc))
+                            return false;
                         break;
                     case WASM_OP_ATOMIC_I32_LOAD:
                         bytes = 4;
@@ -2192,15 +2246,12 @@ jit_compile_func(JitCompContext *cc)
                             return false;
                         break;
 
-                        /* TODO */
-                        /*
                         COMPILE_ATOMIC_RMW(Add, ADD);
                         COMPILE_ATOMIC_RMW(Sub, SUB);
                         COMPILE_ATOMIC_RMW(And, AND);
                         COMPILE_ATOMIC_RMW(Or, OR);
                         COMPILE_ATOMIC_RMW(Xor, XOR);
                         COMPILE_ATOMIC_RMW(Xchg, XCHG);
-                        */
 
                     build_atomic_rmw:
                         if (!jit_compile_op_atomic_rmw(cc, bin_op, op_type,

+ 11 - 0
core/iwasm/fast-jit/jit_frontend.h

@@ -108,6 +108,17 @@ typedef enum FloatArithmetic {
     FLOAT_MAX,
 } FloatArithmetic;
 
+#if WASM_ENABLE_SHARED_MEMORY != 0
+typedef enum AtomicRMWBinOp {
+    AtomicRMWBinOpAdd,
+    AtomicRMWBinOpSub,
+    AtomicRMWBinOpAnd,
+    AtomicRMWBinOpOr,
+    AtomicRMWBinOpXor,
+    AtomicRMWBinOpXchg
+} AtomicRMWBinOp;
+#endif
+
 /**
  * Translate instructions in a function. The translated block must
  * end with a branch instruction whose targets are offsets relating to

+ 17 - 1
core/iwasm/fast-jit/jit_ir.c

@@ -10,7 +10,11 @@
 /**
  * Operand kinds of instructions.
  */
-enum { JIT_OPND_KIND_Reg, JIT_OPND_KIND_VReg, JIT_OPND_KIND_LookupSwitch };
+enum {
+    JIT_OPND_KIND_Reg,
+    JIT_OPND_KIND_VReg,
+    JIT_OPND_KIND_LookupSwitch,
+};
 
 /**
  * Operand kind of each instruction.
@@ -45,6 +49,18 @@ static const uint8 insn_opnd_first_use[] = {
     jit_calloc(offsetof(JitInsn, _opnd._opnd_VReg._reg) \
                + sizeof(JitReg) * (OPND_NUM))
 
+JitInsn *
+_jit_insn_new_Reg_0(JitOpcode opc)
+{
+    JitInsn *insn = JIT_INSN_NEW_Reg(0);
+
+    if (insn) {
+        insn->opcode = opc;
+    }
+
+    return insn;
+}
+
 JitInsn *
 _jit_insn_new_Reg_1(JitOpcode opc, JitReg r0)
 {

+ 44 - 0
core/iwasm/fast-jit/jit_ir.def

@@ -200,6 +200,50 @@ INSN(CALLBC, Reg, 4, 2)
 INSN(RETURNBC, Reg, 3, 0)
 INSN(RETURN, Reg, 1, 0)
 
+#if WASM_ENABLE_SHARED_MEMORY != 0
+/* Atomic Memory Accesses */
+/* op1(replacement val) op2(expected val) op3(mem data) op4(offset)
+ * and in x86, the result is stored in register al/ax/eax/rax */
+INSN(AT_CMPXCHGU8, Reg, 4, 0)
+INSN(AT_CMPXCHGU16, Reg, 4, 0)
+INSN(AT_CMPXCHGI32, Reg, 4, 0)
+INSN(AT_CMPXCHGU32, Reg, 4, 0)
+INSN(AT_CMPXCHGI64, Reg, 4, 0)
+/* rmw operations:
+ * op1(read value) op2(operand value) op3(mem data) op4(offset) */
+INSN(AT_ADDU8, Reg, 4, 1)
+INSN(AT_ADDU16, Reg, 4, 1)
+INSN(AT_ADDI32, Reg, 4, 1)
+INSN(AT_ADDU32, Reg, 4, 1)
+INSN(AT_ADDI64, Reg, 4, 1)
+INSN(AT_SUBU8, Reg, 4, 1)
+INSN(AT_SUBU16, Reg, 4, 1)
+INSN(AT_SUBI32, Reg, 4, 1)
+INSN(AT_SUBU32, Reg, 4, 1)
+INSN(AT_SUBI64, Reg, 4, 1)
+INSN(AT_ANDU8, Reg, 4, 1)
+INSN(AT_ANDU16, Reg, 4, 1)
+INSN(AT_ANDI32, Reg, 4, 1)
+INSN(AT_ANDU32, Reg, 4, 1)
+INSN(AT_ANDI64, Reg, 4, 1)
+INSN(AT_ORU8, Reg, 4, 1)
+INSN(AT_ORU16, Reg, 4, 1)
+INSN(AT_ORI32, Reg, 4, 1)
+INSN(AT_ORU32, Reg, 4, 1)
+INSN(AT_ORI64, Reg, 4, 1)
+INSN(AT_XORU8, Reg, 4, 1)
+INSN(AT_XORU16, Reg, 4, 1)
+INSN(AT_XORI32, Reg, 4, 1)
+INSN(AT_XORU32, Reg, 4, 1)
+INSN(AT_XORI64, Reg, 4, 1)
+INSN(AT_XCHGU8, Reg, 4, 1)
+INSN(AT_XCHGU16, Reg, 4, 1)
+INSN(AT_XCHGI32, Reg, 4, 1)
+INSN(AT_XCHGU32, Reg, 4, 1)
+INSN(AT_XCHGI64, Reg, 4, 1)
+INSN(FENCE, Reg, 0, 0)
+#endif
+
 #undef INSN
 
 /**

+ 23 - 15
core/iwasm/fast-jit/jit_ir.h

@@ -313,7 +313,8 @@ typedef struct JitInsn {
     /* Opcode of the instruction. */
     uint16 opcode;
 
-    /* Reserved field that may be used by optimizations locally. */
+    /* Reserved field that may be used by optimizations locally.
+     * bit_0(Least Significant Bit) is atomic flag for load/store */
     uint8 flags_u8;
 
     /* The unique ID of the instruction. */
@@ -346,6 +347,9 @@ typedef enum JitOpcode {
  * Helper functions for creating new instructions.  Don't call them
  * directly.  Use jit_insn_new_NAME, such as jit_insn_new_MOV instead.
  */
+
+JitInsn *
+_jit_insn_new_Reg_0(JitOpcode opc);
 JitInsn *
 _jit_insn_new_Reg_1(JitOpcode opc, JitReg r0);
 JitInsn *
@@ -368,31 +372,35 @@ _jit_insn_new_LookupSwitch_1(JitOpcode opc, JitReg value, uint32 num);
  * Instruction creation functions jit_insn_new_NAME, where NAME is the
  * name of the instruction defined in jit_ir.def.
  */
+#define ARG_DECL_Reg_0
+#define ARG_LIST_Reg_0
 #define ARG_DECL_Reg_1 JitReg r0
-#define ARG_LIST_Reg_1 r0
+#define ARG_LIST_Reg_1 , r0
 #define ARG_DECL_Reg_2 JitReg r0, JitReg r1
-#define ARG_LIST_Reg_2 r0, r1
+#define ARG_LIST_Reg_2 , r0, r1
 #define ARG_DECL_Reg_3 JitReg r0, JitReg r1, JitReg r2
-#define ARG_LIST_Reg_3 r0, r1, r2
+#define ARG_LIST_Reg_3 , r0, r1, r2
 #define ARG_DECL_Reg_4 JitReg r0, JitReg r1, JitReg r2, JitReg r3
-#define ARG_LIST_Reg_4 r0, r1, r2, r3
+#define ARG_LIST_Reg_4 , r0, r1, r2, r3
 #define ARG_DECL_Reg_5 JitReg r0, JitReg r1, JitReg r2, JitReg r3, JitReg r4
-#define ARG_LIST_Reg_5 r0, r1, r2, r3, r4
+#define ARG_LIST_Reg_5 , r0, r1, r2, r3, r4
 #define ARG_DECL_VReg_1 JitReg r0, int n
-#define ARG_LIST_VReg_1 r0, n
+#define ARG_LIST_VReg_1 , r0, n
 #define ARG_DECL_VReg_2 JitReg r0, JitReg r1, int n
-#define ARG_LIST_VReg_2 r0, r1, n
+#define ARG_LIST_VReg_2 , r0, r1, n
 #define ARG_DECL_LookupSwitch_1 JitReg value, uint32 num
-#define ARG_LIST_LookupSwitch_1 value, num
-#define INSN(NAME, OPND_KIND, OPND_NUM, FIRST_USE)             \
-    static inline JitInsn *jit_insn_new_##NAME(                \
-        ARG_DECL_##OPND_KIND##_##OPND_NUM)                     \
-    {                                                          \
-        return _jit_insn_new_##OPND_KIND##_##OPND_NUM(         \
-            JIT_OP_##NAME, ARG_LIST_##OPND_KIND##_##OPND_NUM); \
+#define ARG_LIST_LookupSwitch_1 , value, num
+#define INSN(NAME, OPND_KIND, OPND_NUM, FIRST_USE)            \
+    static inline JitInsn *jit_insn_new_##NAME(               \
+        ARG_DECL_##OPND_KIND##_##OPND_NUM)                    \
+    {                                                         \
+        return _jit_insn_new_##OPND_KIND##_##OPND_NUM(        \
+            JIT_OP_##NAME ARG_LIST_##OPND_KIND##_##OPND_NUM); \
     }
 #include "jit_ir.def"
 #undef INSN
+#undef ARG_DECL_Reg_0
+#undef ARG_LIST_Reg_0
 #undef ARG_DECL_Reg_1
 #undef ARG_LIST_Reg_1
 #undef ARG_DECL_Reg_2

+ 14 - 0
core/iwasm/fast-jit/jit_regalloc.c

@@ -410,6 +410,13 @@ collect_distances(RegallocContext *rc, JitBasicBlock *basic_block)
 
     JIT_FOREACH_INSN(basic_block, insn)
     {
+#if WASM_ENABLE_SHARED_MEMORY != 0
+        /* fence insn doesn't have any operand, hence, no regs involved */
+        if (insn->opcode == JIT_OP_FENCE) {
+            continue;
+        }
+#endif
+
         JitRegVec regvec = jit_insn_opnd_regs(insn);
         unsigned i;
         JitReg *regp;
@@ -737,6 +744,13 @@ allocate_for_basic_block(RegallocContext *rc, JitBasicBlock *basic_block,
 
     JIT_FOREACH_INSN_REVERSE(basic_block, insn)
     {
+#if WASM_ENABLE_SHARED_MEMORY != 0
+        /* fence insn doesn't have any operand, hence, no regs involved */
+        if (insn->opcode == JIT_OP_FENCE) {
+            continue;
+        }
+#endif
+
         JitRegVec regvec = jit_insn_opnd_regs(insn);
         unsigned first_use = jit_insn_opnd_first_use(insn);
         unsigned i;

+ 1 - 0
core/iwasm/include/aot_export.h

@@ -63,6 +63,7 @@ typedef struct AOTCompOption {
     uint32_t stack_bounds_checks;
     char **custom_sections;
     uint32_t custom_sections_count;
+    const char *stack_usage_file;
 } AOTCompOption, *aot_comp_option_t;
 
 bool

+ 2 - 39
core/iwasm/libraries/lib-pthread/lib_pthread_wrapper.c

@@ -594,45 +594,8 @@ pthread_create_wrapper(wasm_exec_env_t exec_env,
         wasm_runtime_set_wasi_ctx(new_module_inst, wasi_ctx);
 #endif
 
-    /* workaround about passing instantiate-linking information */
-    {
-        CApiFuncImport *c_api_func_imports;
-        uint32 import_func_count = 0;
-        uint32 size_in_bytes = 0;
-
-#if WASM_ENABLE_INTERP != 0
-        if (module_inst->module_type == Wasm_Module_Bytecode) {
-            new_c_api_func_imports = &(
-                ((WASMModuleInstance *)new_module_inst)->e->c_api_func_imports);
-            c_api_func_imports =
-                ((WASMModuleInstance *)module_inst)->e->c_api_func_imports;
-            import_func_count = ((WASMModule *)module)->import_function_count;
-        }
-#endif
-#if WASM_ENABLE_AOT != 0
-        if (module_inst->module_type == Wasm_Module_AoT) {
-            AOTModuleInstanceExtra *e =
-                (AOTModuleInstanceExtra *)((AOTModuleInstance *)new_module_inst)
-                    ->e;
-            new_c_api_func_imports = &(e->c_api_func_imports);
-
-            e = (AOTModuleInstanceExtra *)((AOTModuleInstance *)module_inst)->e;
-            c_api_func_imports = e->c_api_func_imports;
-
-            import_func_count = ((AOTModule *)module)->import_func_count;
-        }
-#endif
-
-        if (import_func_count != 0 && c_api_func_imports) {
-            size_in_bytes = sizeof(CApiFuncImport *) * import_func_count;
-            *new_c_api_func_imports = wasm_runtime_malloc(size_in_bytes);
-            if (!(*new_c_api_func_imports))
-                goto fail;
-
-            bh_memcpy_s(*new_c_api_func_imports, size_in_bytes,
-                        c_api_func_imports, size_in_bytes);
-        }
-    }
+    if (!(wasm_cluster_dup_c_api_imports(new_module_inst, module_inst)))
+        goto fail;
 
     if (!(info_node = wasm_runtime_malloc(sizeof(ThreadInfoNode))))
         goto fail;

+ 3 - 0
core/iwasm/libraries/lib-wasi-threads/lib_wasi_threads_wrapper.c

@@ -96,6 +96,9 @@ thread_spawn_wrapper(wasm_exec_env_t exec_env, uint32 start_arg)
     wasm_runtime_set_custom_data_internal(
         new_module_inst, wasm_runtime_get_custom_data(module_inst));
 
+    if (!(wasm_cluster_dup_c_api_imports(new_module_inst, module_inst)))
+        goto thread_preparation_fail;
+
 #if WASM_ENABLE_LIBC_WASI != 0
     wasi_ctx = wasm_runtime_get_wasi_ctx(module_inst);
     if (wasi_ctx)

+ 7 - 1
core/iwasm/libraries/lib-wasi-threads/test/build.sh

@@ -12,6 +12,12 @@ WAMR_DIR=../../../../..
 for test_c in *.c; do
     test_wasm="$(basename $test_c .c).wasm"
 
+    if [ $test_wasm = "linear_memory_size_update.wasm" ]; then
+        thread_start_file=""
+    else
+        thread_start_file=$WAMR_DIR/samples/wasi-threads/wasm-apps/wasi_thread_start.S
+    fi
+
     echo "Compiling $test_c to $test_wasm"
     $CC \
         -target wasm32-wasi-threads \
@@ -24,6 +30,6 @@ for test_c in *.c; do
         -Wl,--export=malloc \
         -Wl,--export=free \
         -I $WAMR_DIR/samples/wasi-threads/wasm-apps \
-        $WAMR_DIR/samples/wasi-threads/wasm-apps/wasi_thread_start.S \
+        $thread_start_file \
         $test_c -o $test_wasm
 done

+ 94 - 0
core/iwasm/libraries/lib-wasi-threads/test/linear_memory_size_update.c

@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2023 Amazon.com Inc. or its affiliates. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+#include <stdlib.h>
+#include <pthread.h>
+
+typedef enum {
+    APP_STARTED,
+    THREAD_STARTED,
+    MEMORY_ALLOCATED,
+} app_state_t;
+typedef struct {
+
+    pthread_cond_t cond;
+    pthread_mutex_t mutex;
+    app_state_t state;
+    char *data;
+} context_t;
+
+void
+context_init(context_t *ctx)
+{
+    pthread_cond_init(&ctx->cond, NULL);
+    pthread_mutex_init(&ctx->mutex, NULL);
+    ctx->state = APP_STARTED;
+    ctx->data = NULL;
+}
+
+void
+context_destroy(context_t *ctx)
+{
+    pthread_cond_destroy(&ctx->cond);
+    pthread_mutex_destroy(&ctx->mutex);
+    if (ctx->data) {
+        free(ctx->data);
+    }
+}
+
+void
+context_set_state(context_t *ctx, app_state_t state)
+{
+    pthread_mutex_lock(&ctx->mutex);
+    ctx->state = state;
+    pthread_mutex_unlock(&ctx->mutex);
+    pthread_cond_signal(&ctx->cond);
+}
+
+void
+context_wait_for_state(context_t *ctx, app_state_t state)
+{
+    pthread_mutex_lock(&ctx->mutex);
+    while (ctx->state != state) {
+        pthread_cond_wait(&ctx->cond, &ctx->mutex);
+    }
+    pthread_mutex_unlock(&ctx->mutex);
+}
+
+void *
+fnc(void *p)
+{
+    context_t *ctx = (context_t *)p;
+    context_set_state(ctx, THREAD_STARTED);
+
+    context_wait_for_state(ctx, MEMORY_ALLOCATED);
+
+    // trigger memory.copy
+    __builtin_memcpy(ctx->data + 512 * 1024, ctx->data + 1024, 1024);
+
+    return NULL;
+}
+
+int
+main()
+{
+    context_t ctx;
+    context_init(&ctx);
+
+    pthread_t th;
+    pthread_create(&th, NULL, fnc, &ctx);
+
+    context_wait_for_state(&ctx, THREAD_STARTED);
+
+    // trigger memory.grow
+    ctx.data = calloc(1024 * 1024, 1);
+
+    context_set_state(&ctx, MEMORY_ALLOCATED);
+
+    pthread_join(th, NULL);
+
+    context_destroy(&ctx);
+
+    return 0;
+}

+ 44 - 0
core/iwasm/libraries/lib-wasi-threads/test/trap_after_main_thread_finishes.c

@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2023 Amazon.com Inc. or its affiliates. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef __wasi__
+#error This example only compiles to WASM/WASI target
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "wasi_thread_start.h"
+
+enum CONSTANTS {
+    SECOND = 1000 * 1000 * 1000, /* 1 second */
+    TIMEOUT = 1LL * SECOND
+};
+
+typedef struct {
+    start_args_t base;
+} shared_t;
+
+void
+__wasi_thread_start_C(int thread_id, int *start_arg)
+{
+    /* Wait so that the exception is raised after the main thread has finished
+     * already */
+    __builtin_wasm_memory_atomic_wait32(NULL, 0, TIMEOUT);
+    __builtin_trap();
+}
+
+int
+main(int argc, char **argv)
+{
+    shared_t data = { 0 };
+
+    assert(start_args_init(&data.base));
+    int thread_id = __wasi_thread_spawn(&data);
+    assert(thread_id > 0 && "Thread creation failed");
+
+    return EXIT_SUCCESS;
+}

+ 3 - 0
core/iwasm/libraries/lib-wasi-threads/test/trap_after_main_thread_finishes.json

@@ -0,0 +1,3 @@
+{
+  "exit_code": 1
+}

+ 12 - 6
core/iwasm/libraries/libc-wasi/libc_wasi_wrapper.c

@@ -56,8 +56,14 @@ typedef struct WASIContext *wasi_ctx_t;
 wasi_ctx_t
 wasm_runtime_get_wasi_ctx(wasm_module_inst_t module_inst);
 
-static inline size_t
-min(size_t a, size_t b)
+static inline uint64_t
+min_uint64(uint64_t a, uint64_t b)
+{
+    return a > b ? b : a;
+}
+
+static inline uint32_t
+min_uint32(uint32_t a, uint32_t b)
 {
     return a > b ? b : a;
 }
@@ -962,7 +968,7 @@ get_timeout_for_poll_oneoff(const wasi_subscription_t *in,
         const __wasi_subscription_t *s = &in[i];
         if (s->u.type == __WASI_EVENTTYPE_CLOCK
             && (s->u.u.clock.flags & __WASI_SUBSCRIPTION_CLOCK_ABSTIME) == 0) {
-            timeout = min(timeout, s->u.u.clock.timeout);
+            timeout = min_uint64(timeout, s->u.u.clock.timeout);
         }
     }
     return timeout;
@@ -1016,8 +1022,8 @@ execute_interruptible_poll_oneoff(
 
     while (timeout == (__wasi_timestamp_t)-1 || elapsed <= timeout) {
         /* update timeout for clock subscription events */
-        update_clock_subscription_data(in_copy, nsubscriptions,
-                                       min(time_quant, timeout - elapsed));
+        update_clock_subscription_data(
+            in_copy, nsubscriptions, min_uint64(time_quant, timeout - elapsed));
         err = wasmtime_ssp_poll_oneoff(curfds, in_copy, out, nsubscriptions,
                                        nevents);
         elapsed += time_quant;
@@ -1999,7 +2005,7 @@ copy_buffer_to_iovec_app(wasm_module_inst_t module_inst, uint8 *buf_begin,
          * only copy the amount in the app buffer. Otherwise, we fill the iovec
          * buffer and reduce size to copy on the next iteration
          */
-        size_to_copy_into_iovec = min(data->buf_len, size_to_copy);
+        size_to_copy_into_iovec = min_uint32(data->buf_len, size_to_copy);
 
         native_addr = (void *)addr_app_to_native(data->buf_offset);
         bh_memcpy_s(native_addr, size_to_copy_into_iovec, buf,

+ 49 - 0
core/iwasm/libraries/thread-mgr/thread_manager.c

@@ -733,6 +733,55 @@ fail1:
     return -1;
 }
 
+bool
+wasm_cluster_dup_c_api_imports(WASMModuleInstanceCommon *module_inst_dst,
+                               const WASMModuleInstanceCommon *module_inst_src)
+{
+    /* workaround about passing instantiate-linking information */
+    CApiFuncImport **new_c_api_func_imports = NULL;
+    CApiFuncImport *c_api_func_imports;
+    uint32 import_func_count = 0;
+    uint32 size_in_bytes = 0;
+
+#if WASM_ENABLE_INTERP != 0
+    if (module_inst_src->module_type == Wasm_Module_Bytecode) {
+        new_c_api_func_imports =
+            &(((WASMModuleInstance *)module_inst_dst)->e->c_api_func_imports);
+        c_api_func_imports = ((const WASMModuleInstance *)module_inst_src)
+                                 ->e->c_api_func_imports;
+        import_func_count =
+            ((WASMModule *)(((const WASMModuleInstance *)module_inst_src)
+                                ->module))
+                ->import_function_count;
+    }
+#endif
+#if WASM_ENABLE_AOT != 0
+    if (module_inst_src->module_type == Wasm_Module_AoT) {
+        AOTModuleInstanceExtra *e =
+            (AOTModuleInstanceExtra *)((AOTModuleInstance *)module_inst_dst)->e;
+        new_c_api_func_imports = &(e->c_api_func_imports);
+
+        e = (AOTModuleInstanceExtra *)((AOTModuleInstance *)module_inst_src)->e;
+        c_api_func_imports = e->c_api_func_imports;
+
+        import_func_count =
+            ((AOTModule *)(((AOTModuleInstance *)module_inst_src)->module))
+                ->import_func_count;
+    }
+#endif
+
+    if (import_func_count != 0 && c_api_func_imports) {
+        size_in_bytes = sizeof(CApiFuncImport) * import_func_count;
+        *new_c_api_func_imports = wasm_runtime_malloc(size_in_bytes);
+        if (!(*new_c_api_func_imports))
+            return false;
+
+        bh_memcpy_s(*new_c_api_func_imports, size_in_bytes, c_api_func_imports,
+                    size_in_bytes);
+    }
+    return true;
+}
+
 #if WASM_ENABLE_DEBUG_INTERP != 0
 WASMCurrentEnvStatus *
 wasm_cluster_create_exenv_status()

+ 5 - 0
core/iwasm/libraries/thread-mgr/thread_manager.h

@@ -74,6 +74,11 @@ wasm_cluster_destroy(WASMCluster *cluster);
 WASMCluster *
 wasm_exec_env_get_cluster(WASMExecEnv *exec_env);
 
+/* Forward registered functions to a new thread */
+bool
+wasm_cluster_dup_c_api_imports(WASMModuleInstanceCommon *module_inst_dst,
+                               const WASMModuleInstanceCommon *module_inst_src);
+
 int32
 wasm_cluster_create_thread(WASMExecEnv *exec_env,
                            wasm_module_inst_t module_inst, bool alloc_aux_stack,

+ 13 - 0
core/iwasm/libraries/wasi-nn/README.md

@@ -24,6 +24,7 @@ Build the runtime image for your execution target type.
 `EXECUTION_TYPE` can be:
 * `cpu`
 * `nvidia-gpu`
+* `vx-delegate`
 
 ```
 EXECUTION_TYPE=cpu
@@ -71,6 +72,18 @@ docker run \
     /assets/test_tensorflow.wasm
 ```
 
+* vx-delegate for NPU (x86 simulator)
+
+```
+docker run \
+    -v $PWD/core/iwasm/libraries/wasi-nn/test:/assets wasi-nn-vx-delegate \
+    --dir=/assets \
+    --env="TARGET=gpu" \
+    /assets/test_tensorflow.wasm
+```
+
+
+
 Requirements:
 * [NVIDIA docker](https://github.com/NVIDIA/nvidia-docker).
 

+ 41 - 0
core/iwasm/libraries/wasi-nn/cmake/Findtensorflow_lite.cmake

@@ -0,0 +1,41 @@
+# Copyright (C) 2019 Intel Corporation. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+
+find_library(TENSORFLOW_LITE 
+     NAMES tensorflow-lite
+)
+
+if(NOT EXISTS ${TENSORFLOW_LITE})
+    if (NOT EXISTS "${WAMR_ROOT_DIR}/core/deps/tensorflow-src")
+        execute_process(COMMAND ${WAMR_ROOT_DIR}/core/deps/install_tensorflow.sh
+                        RESULT_VARIABLE TENSORFLOW_RESULT
+        )
+    else ()
+        message("Tensorflow is already downloaded.")
+    endif()
+    set(TENSORFLOW_SOURCE_DIR "${WAMR_ROOT_DIR}/core/deps/tensorflow-src")
+
+    if (WASI_NN_ENABLE_GPU EQUAL 1)
+    # Tensorflow specific:
+    # * https://www.tensorflow.org/lite/guide/build_cmake#available_options_to_build_tensorflow_lite
+    set (TFLITE_ENABLE_GPU ON)
+    endif ()
+
+    include_directories (${CMAKE_CURRENT_BINARY_DIR}/flatbuffers/include)
+    include_directories (${TENSORFLOW_SOURCE_DIR})
+    add_subdirectory(
+        "${TENSORFLOW_SOURCE_DIR}/tensorflow/lite"
+        "${CMAKE_CURRENT_BINARY_DIR}/tensorflow-lite" EXCLUDE_FROM_ALL) 
+
+else()
+    find_path(TENSORFLOW_LITE_INCLUDE_DIR
+    NAMES tensorflow/lite/interpreter.h
+    )
+    find_path(FLATBUFFER_INCLUDE_DIR
+    NAMES flatbuffers/flatbuffers.h
+    )
+    include_directories (${TENSORFLOW_LITE_INCLUDE_DIR})
+    include_directories (${FLATBUFFER_INCLUDE_DIR})    
+endif()
+

+ 40 - 3
core/iwasm/libraries/wasi-nn/src/wasi_nn_tensorflowlite.cpp

@@ -21,6 +21,10 @@
 #include <tensorflow/lite/delegates/gpu/delegate.h>
 #endif
 
+#if defined(WASI_NN_ENABLE_EXTERNAL_DELEGATE)
+#include <tensorflow/lite/delegates/external/external_delegate.h>
+#endif
+
 /* Maximum number of graphs per WASM instance */
 #define MAX_GRAPHS_PER_INST 10
 /* Maximum number of graph execution context per WASM instance*/
@@ -42,6 +46,7 @@ typedef struct {
     uint32_t current_interpreters;
     Interpreter interpreters[MAX_GRAPH_EXEC_CONTEXTS_PER_INST];
     korp_mutex g_lock;
+    TfLiteDelegate *delegate;
 } TFLiteContext;
 
 /* Utils */
@@ -194,18 +199,40 @@ tensorflowlite_init_execution_context(void *tflite_ctx, graph g,
 #if defined(WASI_NN_ENABLE_GPU)
             NN_WARN_PRINTF("GPU enabled.");
             // https://www.tensorflow.org/lite/performance/gpu
-            auto options = TfLiteGpuDelegateOptionsV2Default();
+            TfLiteGpuDelegateOptionsV2 options =
+                TfLiteGpuDelegateOptionsV2Default();
             options.inference_preference =
                 TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED;
             options.inference_priority1 =
                 TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY;
-            auto *delegate = TfLiteGpuDelegateV2Create(&options);
+            tfl_ctx->delegate = TfLiteGpuDelegateV2Create(&options);
+            if (tfl_ctx->delegate == NULL) {
+                NN_ERR_PRINTF("Error when generating GPU delegate.");
+                use_default = true;
+                return missing_memory;
+            }
             if (tfl_ctx->interpreters[*ctx]
-                    .interpreter->ModifyGraphWithDelegate(delegate)
+                    .interpreter->ModifyGraphWithDelegate(tfl_ctx->delegate)
                 != kTfLiteOk) {
                 NN_ERR_PRINTF("Error when enabling GPU delegate.");
                 use_default = true;
             }
+#elif defined(WASI_NN_ENABLE_EXTERNAL_DELEGATE)
+            NN_WARN_PRINTF("external delegation enabled.");
+            TfLiteExternalDelegateOptions options =
+                TfLiteExternalDelegateOptionsDefault(WASI_NN_EXT_DELEGATE_PATH);
+            tfl_ctx->delegate = TfLiteExternalDelegateCreate(&options);
+            if (tfl_ctx->delegate == NULL) {
+                NN_ERR_PRINTF("Error when generating External delegate.");
+                use_default = true;
+                return missing_memory;
+            }
+            if (tfl_ctx->interpreters[*ctx]
+                    .interpreter->ModifyGraphWithDelegate(tfl_ctx->delegate)
+                != kTfLiteOk) {
+                NN_ERR_PRINTF("Error when enabling External delegate.");
+                use_default = true;
+            }
 #else
             NN_WARN_PRINTF("GPU not enabled.");
             use_default = true;
@@ -350,6 +377,8 @@ tensorflowlite_initialize(void **tflite_ctx)
         NN_ERR_PRINTF("Error while initializing the lock");
     }
 
+    tfl_ctx->delegate = NULL;
+
     *tflite_ctx = (void *)tfl_ctx;
 }
 
@@ -364,6 +393,14 @@ tensorflowlite_destroy(void *tflite_ctx)
     */
     TFLiteContext *tfl_ctx = (TFLiteContext *)tflite_ctx;
 
+    if (tfl_ctx->delegate != NULL) {
+#if defined(WASI_NN_ENABLE_GPU)
+        TfLiteGpuDelegateV2Delete(tfl_ctx->delegate);
+#elif defined(WASI_NN_ENABLE_EXTERNAL_DELEGATE)
+        TfLiteExternalDelegateDelete(tfl_ctx->delegate);
+#endif
+    }
+
     NN_DBG_PRINTF("Freeing memory.");
     for (int i = 0; i < MAX_GRAPHS_PER_INST; ++i) {
         tfl_ctx->models[i].model.reset();

+ 99 - 0
core/iwasm/libraries/wasi-nn/test/Dockerfile.vx-delegate

@@ -0,0 +1,99 @@
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+FROM ubuntu:20.04 AS base
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+
+RUN apt-get update && apt-get install -y \
+    cmake build-essential git curl libssl-dev python3
+
+
+# Build TensorFlow Lite VX delegate default built for x86-64 simulator
+WORKDIR /tmp
+RUN git clone https://github.com/VeriSilicon/TIM-VX.git tim-vx
+RUN git clone https://github.com/VeriSilicon/tflite-vx-delegate.git
+RUN git clone https://github.com/tensorflow/tensorflow.git
+
+
+# Build TIM-VX
+WORKDIR /tmp/tim-vx/host_build
+RUN cmake -DCMAKE_INSTALL_PREFIX=/usr/local  ../
+RUN make -j$(grep -c ^processor /proc/cpuinfo)
+RUN make install
+
+WORKDIR /tmp/tim-vx
+#RUN mkdir -p prebuilt-sdk/x86_64_linux/lib/include 
+#RUN cp prebuilt-sdk/x86_64_linux/include/CL prebuilt-sdk/x86_64_linux/lib/include -fr
+
+
+# Build TensorFlow Lite
+WORKDIR /tmp/tensorflow/build
+RUN cmake \
+  -DBUILD_SHARED_LIBS=ON \
+  -DTFLITE_ENABLE_RUY=on \
+  -DTFLITE_ENABLE_NNAPI=off \
+  -DTFLITE_ENABLE_XNNPACK=on \
+  -DTFLITE_ENABLE_EXTERNAL_DELEGATE=on \
+  ../tensorflow/lite/
+RUN make -j$(grep -c ^processor /proc/cpuinfo)
+RUN make install
+RUN cp --no-preserve=ownership -d lib*.so* /usr/local/lib
+RUN cp -r --no-preserve=ownership -d flatbuffers/include/flatbuffers /usr/local/include
+# install header files
+RUN install -d /usr/local/include/tensorflow/lite && \
+    cd /tmp/tensorflow/tensorflow/lite && \
+    cp --parents \
+        $(find . -name "*.h*") \
+        /usr/local/include/tensorflow/lite
+# install version.h from core
+RUN install -d /usr/local/include/tensorflow/core/public && \
+    cp /tmp/tensorflow/tensorflow/core/public/version.h /usr/local/include/tensorflow/core/public
+
+
+# Build Vx Delegate default built for x86-64 simulator
+WORKDIR /tmp/tflite-vx-delegate/build
+RUN cmake \
+   -DBUILD_SHARED_LIBS=ON \
+   -DFETCHCONTENT_SOURCE_DIR_TENSORFLOW=/tmp/tensorflow \
+   -DTFLITE_LIB_LOC=/usr/local/lib/libtensorflow-lite.so \
+   -DTIM_VX_INSTALL=/usr/local \
+   -DCMAKE_INSTALL_PREFIX=/usr/  \
+   ../
+RUN make vx_delegate -j$(grep -c ^processor /proc/cpuinfo)
+RUN make install
+RUN cp --no-preserve=ownership -d lib*.so* /usr/lib
+# install header files
+RUN install -d /usr/local/include/tensorflow-lite-vx-delegate && \
+    cd  /tmp/tflite-vx-delegate/ && \
+    cp --parents \
+        $(find . -name "*.h*") \
+        /usr/local/include/tensorflow-lite-vx-delegate
+
+ENV VIVANTE_SDK_DIR=/tmp/tim-vx/prebuilt-sdk/x86_64_linux/
+ENV VSIMULATOR_CONFIG=czl
+
+ENV LD_LIBRARY_PATH=/tmp/tim-vx/prebuilt-sdk/x86_64_linux/lib:/usr/local/lib:/lib/x86_64-linux-gnu/:/lib64/:/usr/lib:$LD_LIBRARY_PATH 
+
+
+# Build WASI-NN
+WORKDIR /home/wamr
+
+COPY . .
+
+WORKDIR /home/wamr/core/iwasm/libraries/wasi-nn/test/build
+
+RUN cmake \
+    -DCMAKE_LIBRARY_PATH=${CMAKE_LIBRARY_PATH}:/usr/local/lib/ \
+    -DCMAKE_INCLUDE_PATH=${CMAKE_INCLUDE_PATH}:/usr/local/include/ \
+    -DWAMR_BUILD_WASI_NN=1 \
+    -DWAMR_BUILD_WASI_NN_ENABLE_EXT=1 \
+    -DWASI_NN_EXT_DELEGATE_PATH="/usr/lib/libvx_delegate.so" \
+    ..
+
+RUN make -j $(grep -c ^processor /proc/cpuinfo)
+
+RUN cp /home/wamr/core/iwasm/libraries/wasi-nn/test/build/iwasm /run/iwasm
+
+ENTRYPOINT [ "/run/iwasm" ]

+ 5 - 0
core/iwasm/libraries/wasi-nn/wasi_nn.cmake

@@ -1,6 +1,11 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake)
+
+# Find tensorflow-lite
+find_package(tensorflow_lite REQUIRED)
+
 set (WASI_NN_DIR ${CMAKE_CURRENT_LIST_DIR})
 
 include_directories (${WASI_NN_DIR})

+ 14 - 8
core/shared/mem-alloc/ems/ems_alloc.c

@@ -25,7 +25,7 @@ static bool
 remove_tree_node(gc_heap_t *heap, hmu_tree_node_t *p)
 {
     hmu_tree_node_t *q = NULL, **slot = NULL, *parent;
-    hmu_tree_node_t *root = &heap->kfc_tree_root;
+    hmu_tree_node_t *root = heap->kfc_tree_root;
     gc_uint8 *base_addr = heap->base_addr;
     gc_uint8 *end_addr = base_addr + heap->current_size;
 
@@ -38,13 +38,17 @@ remove_tree_node(gc_heap_t *heap, hmu_tree_node_t *p)
         goto fail;
     }
 
-    /* get the slot which holds pointer to node p*/
+    /* get the slot which holds pointer to node p */
     if (p == p->parent->right) {
-        slot = &p->parent->right;
+        /* Don't use `slot = &p->parent->right` to avoid compiler warning */
+        slot = (hmu_tree_node_t **)((uint8 *)p->parent
+                                    + offsetof(hmu_tree_node_t, right));
     }
     else if (p == p->parent->left) {
-        /* p should be a child of its parent*/
-        slot = &p->parent->left;
+        /* p should be a child of its parent */
+        /* Don't use `slot = &p->parent->left` to avoid compiler warning */
+        slot = (hmu_tree_node_t **)((uint8 *)p->parent
+                                    + offsetof(hmu_tree_node_t, left));
     }
     else {
         goto fail;
@@ -241,7 +245,7 @@ gci_add_fc(gc_heap_t *heap, hmu_t *hmu, gc_size_t size)
     node->left = node->right = node->parent = NULL;
 
     /* find proper node to link this new node to */
-    root = &heap->kfc_tree_root;
+    root = heap->kfc_tree_root;
     tp = root;
     bh_assert(tp->size < size);
     while (1) {
@@ -289,6 +293,7 @@ alloc_hmu(gc_heap_t *heap, gc_size_t size)
     uint32 node_idx = 0, init_node_idx = 0;
     hmu_tree_node_t *root = NULL, *tp = NULL, *last_tp = NULL;
     hmu_t *next, *rest;
+    uintptr_t tp_ret;
 
     bh_assert(gci_is_heap_valid(heap));
     bh_assert(size > 0 && !(size & 7));
@@ -354,7 +359,7 @@ alloc_hmu(gc_heap_t *heap, gc_size_t size)
     }
 
     /* need to find a node in tree*/
-    root = &heap->kfc_tree_root;
+    root = heap->kfc_tree_root;
 
     /* find the best node*/
     bh_assert(root);
@@ -402,7 +407,8 @@ alloc_hmu(gc_heap_t *heap, gc_size_t size)
             heap->highmark_size = heap->current_size - heap->total_free_size;
 
         hmu_set_size((hmu_t *)last_tp, size);
-        return (hmu_t *)last_tp;
+        tp_ret = (uintptr_t)last_tp;
+        return (hmu_t *)tp_ret;
     }
 
     return NULL;

+ 46 - 4
core/shared/mem-alloc/ems/ems_gc_internal.h

@@ -204,13 +204,47 @@ set_hmu_normal_node_next(hmu_normal_node_t *node, hmu_normal_node_t *next)
     }
 }
 
+/**
+ * Define hmu_tree_node as a packed struct, since it is at the 4-byte
+ * aligned address and the size of hmu_head is 4, so in 64-bit target,
+ * the left/right/parent fields will be at 8-byte aligned address,
+ * we can access them directly.
+ */
+#if UINTPTR_MAX == UINT64_MAX
+#if defined(_MSC_VER)
+__pragma(pack(push, 1));
+#define __attr_packed
+#elif defined(__GNUC__) || defined(__clang__)
+#define __attr_packed __attribute__((packed))
+#else
+#error "packed attribute isn't used to define struct hmu_tree_node"
+#endif
+#else /* else of UINTPTR_MAX == UINT64_MAX */
+#define __attr_packed
+#endif
+
 typedef struct hmu_tree_node {
     hmu_t hmu_header;
-    gc_size_t size;
     struct hmu_tree_node *left;
     struct hmu_tree_node *right;
     struct hmu_tree_node *parent;
-} hmu_tree_node_t;
+    gc_size_t size;
+} __attr_packed hmu_tree_node_t;
+
+#if UINTPTR_MAX == UINT64_MAX
+#if defined(_MSC_VER)
+__pragma(pack(pop));
+#endif
+#endif
+
+bh_static_assert(sizeof(hmu_tree_node_t) == 8 + 3 * sizeof(void *));
+bh_static_assert(offsetof(hmu_tree_node_t, left) == 4);
+
+#define ASSERT_TREE_NODE_ALIGNED_ACCESS(tree_node)                          \
+    do {                                                                    \
+        bh_assert((((uintptr_t)&tree_node->left) & (sizeof(uintptr_t) - 1)) \
+                  == 0);                                                    \
+    } while (0)
 
 typedef struct gc_heap_struct {
     /* for double checking*/
@@ -223,8 +257,16 @@ typedef struct gc_heap_struct {
 
     hmu_normal_list_t kfc_normal_list[HMU_NORMAL_NODE_CNT];
 
-    /* order in kfc_tree is: size[left] <= size[cur] < size[right]*/
-    hmu_tree_node_t kfc_tree_root;
+#if UINTPTR_MAX == UINT64_MAX
+    /* make kfc_tree_root_buf 4-byte aligned and not 8-byte aligned,
+       so kfc_tree_root's left/right/parent fields are 8-byte aligned
+       and we can access them directly */
+    uint32 __padding;
+#endif
+    uint8 kfc_tree_root_buf[sizeof(hmu_tree_node_t)];
+    /* point to kfc_tree_root_buf, the order in kfc_tree is:
+         size[left] <= size[cur] < size[right] */
+    hmu_tree_node_t *kfc_tree_root;
 
     /* whether heap is corrupted, e.g. the hmu nodes are modified
        by user */

+ 30 - 8
core/shared/mem-alloc/ems/ems_kfc.c

@@ -27,7 +27,7 @@ gc_init_internal(gc_heap_t *heap, char *base_addr, gc_size_t heap_max_size)
     heap->total_free_size = heap->current_size;
     heap->highmark_size = 0;
 
-    root = &heap->kfc_tree_root;
+    root = heap->kfc_tree_root = (hmu_tree_node_t *)heap->kfc_tree_root_buf;
     memset(root, 0, sizeof *root);
     root->size = sizeof *root;
     hmu_set_ut(&root->hmu_header, HMU_FC);
@@ -38,6 +38,9 @@ gc_init_internal(gc_heap_t *heap, char *base_addr, gc_size_t heap_max_size)
     hmu_set_ut(&q->hmu_header, HMU_FC);
     hmu_set_size(&q->hmu_header, heap->current_size);
 
+    ASSERT_TREE_NODE_ALIGNED_ACCESS(q);
+    ASSERT_TREE_NODE_ALIGNED_ACCESS(root);
+
     hmu_mark_pinuse(&q->hmu_header);
     root->right = q;
     q->parent = root;
@@ -165,6 +168,7 @@ gc_migrate(gc_handle_t handle, char *pool_buf_new, gc_size_t pool_buf_size)
     intptr_t offset = (uint8 *)base_addr_new - (uint8 *)heap->base_addr;
     hmu_t *cur = NULL, *end = NULL;
     hmu_tree_node_t *tree_node;
+    uint8 **p_left, **p_right, **p_parent;
     gc_size_t heap_max_size, size;
 
     if ((((uintptr_t)pool_buf_new) & 7) != 0) {
@@ -188,9 +192,18 @@ gc_migrate(gc_handle_t handle, char *pool_buf_new, gc_size_t pool_buf_size)
     }
 
     heap->base_addr = (uint8 *)base_addr_new;
-    adjust_ptr((uint8 **)&heap->kfc_tree_root.left, offset);
-    adjust_ptr((uint8 **)&heap->kfc_tree_root.right, offset);
-    adjust_ptr((uint8 **)&heap->kfc_tree_root.parent, offset);
+
+    ASSERT_TREE_NODE_ALIGNED_ACCESS(heap->kfc_tree_root);
+
+    p_left = (uint8 **)((uint8 *)heap->kfc_tree_root
+                        + offsetof(hmu_tree_node_t, left));
+    p_right = (uint8 **)((uint8 *)heap->kfc_tree_root
+                         + offsetof(hmu_tree_node_t, right));
+    p_parent = (uint8 **)((uint8 *)heap->kfc_tree_root
+                          + offsetof(hmu_tree_node_t, parent));
+    adjust_ptr(p_left, offset);
+    adjust_ptr(p_right, offset);
+    adjust_ptr(p_parent, offset);
 
     cur = (hmu_t *)heap->base_addr;
     end = (hmu_t *)((char *)heap->base_addr + heap->current_size);
@@ -206,12 +219,21 @@ gc_migrate(gc_handle_t handle, char *pool_buf_new, gc_size_t pool_buf_size)
 
         if (hmu_get_ut(cur) == HMU_FC && !HMU_IS_FC_NORMAL(size)) {
             tree_node = (hmu_tree_node_t *)cur;
-            adjust_ptr((uint8 **)&tree_node->left, offset);
-            adjust_ptr((uint8 **)&tree_node->right, offset);
-            if (tree_node->parent != &heap->kfc_tree_root)
+
+            ASSERT_TREE_NODE_ALIGNED_ACCESS(tree_node);
+
+            p_left = (uint8 **)((uint8 *)tree_node
+                                + offsetof(hmu_tree_node_t, left));
+            p_right = (uint8 **)((uint8 *)tree_node
+                                 + offsetof(hmu_tree_node_t, right));
+            p_parent = (uint8 **)((uint8 *)tree_node
+                                  + offsetof(hmu_tree_node_t, parent));
+            adjust_ptr(p_left, offset);
+            adjust_ptr(p_right, offset);
+            if (tree_node->parent != heap->kfc_tree_root)
                 /* The root node belongs to heap structure,
                    it is fixed part and isn't changed. */
-                adjust_ptr((uint8 **)&tree_node->parent, offset);
+                adjust_ptr(p_parent, offset);
         }
         cur = (hmu_t *)((char *)cur + size);
     }

+ 1 - 1
doc/embed_wamr.md

@@ -1,7 +1,7 @@
 Embedding WAMR guideline
 =====================================
 
-**Note**: This document is about how to embed WAMR into C/C++ host applications, for other languages, please refer to: [Embed WAMR into Python](../language-bindings/go), [Embed WAMR into Go](../language-bindings/go).
+**Note**: This document is about how to embed WAMR into C/C++ host applications, for other languages, please refer to: [Embed WAMR into Python](../language-bindings/python), [Embed WAMR into Go](../language-bindings/go).
 
 All the embedding APIs supported by the runtime are defined under folder [core/iwasm/include](../core/iwasm/include). The API details are available in the header files.
 

+ 2 - 2
language-bindings/python/README.md

@@ -30,5 +30,5 @@ import wamr.wasmcapi.ffi as ffi
 
 For more information:
 
-* [WAMR API](./wamr_api)
-* [WASM-C-API](./wasm_c_api)
+* [WAMR API](./wamr-api)
+* [WASM-C-API](./wasm-c-api)

+ 10 - 14
product-mini/README.md

@@ -18,14 +18,17 @@ Note that all ESP-IDF toolchain files live under `$IDF_PATH/tools/cmake/`.
 ## Linux
 
 First of all please install the dependent packages.
-Run command below in Ubuntu-18.04:
-
+Run command below in Ubuntu-22.04:
+``` Bash
+sudo apt install build-essential cmake g++-multilib libgcc-11-dev lib32gcc-11-dev ccache
+```
+Or in Ubuntu-20.04
 ``` Bash
-sudo apt install build-essential cmake g++-multilib libgcc-8-dev lib32gcc-8-dev
+sudo apt install build-essential cmake g++-multilib libgcc-9-dev lib32gcc-9-dev ccache
 ```
-Or in Ubuntu-16.04:
+Or in Ubuntu-18.04:
 ``` Bash
-sudo apt install build-essential cmake g++-multilib libgcc-5-dev lib32gcc-5-dev
+sudo apt install build-essential cmake g++-multilib libgcc-8-dev lib32gcc-8-dev ccache
 ```
 Or in Fedora:
 ``` Bash
@@ -248,7 +251,7 @@ WAMR provides some features which can be easily configured by passing options to
 
 ## Zephyr
 
-You need to prepare Zephyr first as described here https://docs.zephyrproject.org/latest/getting_started/index.html#get-zephyr-and-install-python-dependencies.
+You need to prepare Zephyr first as described [here](https://docs.zephyrproject.org/latest/getting_started/index.html#get-zephyr-and-install-python-dependencies).
 
 After that you need to point the `ZEPHYR_BASE` variable to e.g. `~/zephyrproject/zephyr`. Also, it is important that you have `west` available for subsequent actions.
 
@@ -258,14 +261,7 @@ cd <wamr_root_dir>/product-mini/platforms/zephyr/simple
 ./build_and_run.sh x86
 ```
 
-If you want to use the Espressif toolchain (esp32 or esp32c3), you can most conveniently install it with `west`:
-
-``` Bash
-cd $ZEPHYR_BASE
-west espressif install
-```
-
-After that set `ESPRESSIF_TOOLCHAIN_PATH` according to the output, for example `~/.espressif/tools/zephyr`.
+The [Zephyr SDK](https://github.com/zephyrproject-rtos/sdk-ng) provides toolchains for all supported targets. Follow the instructions in the [documentation](https://docs.zephyrproject.org/latest/develop/getting_started/index.html#install-zephyr-sdk) to ensure it is installed and configured correctly.
 
 Note:
 WAMR provides some features which can be easily configured by passing options to cmake, please see [WAMR vmcore cmake building configurations](../doc/build_wamr.md#wamr-vmcore-cmake-building-configurations) for details. Currently in Zephyr, interpreter, AOT and builtin libc are enabled by default.

+ 5 - 1
product-mini/platforms/posix/main.c

@@ -745,8 +745,12 @@ main(int argc, char *argv[])
 
 #if WASM_ENABLE_LIBC_WASI != 0
     if (ret == 0) {
-        /* propagate wasi exit code. */
+        /* wait for threads to finish and propagate wasi exit code. */
         ret = wasm_runtime_get_wasi_exit_code(wasm_module_inst);
+        if (wasm_runtime_get_exception(wasm_module_inst)) {
+            /* got an exception in spawned thread */
+            ret = 1;
+        }
     }
 #endif
 

+ 5 - 1
product-mini/platforms/windows/main.c

@@ -549,8 +549,12 @@ main(int argc, char *argv[])
 
 #if WASM_ENABLE_LIBC_WASI != 0
     if (ret == 0) {
-        /* propagate wasi exit code. */
+        /* wait for threads to finish and propagate wasi exit code. */
         ret = wasm_runtime_get_wasi_exit_code(wasm_module_inst);
+        if (wasm_runtime_get_exception(wasm_module_inst)) {
+            /* got an exception in spawned thread */
+            ret = 1;
+        }
     }
 #endif
 

+ 10 - 1
samples/wasm-c-api/CMakeLists.txt

@@ -24,7 +24,7 @@ if (APPLE)
   add_definitions(-DBH_PLATFORM_DARWIN)
 endif ()
 
-# Resetdefault linker flags
+# Reset default linker flags
 set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS "")
 set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "")
 
@@ -87,6 +87,15 @@ endif()
 set(WAMR_ROOT_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
 include (${WAMR_ROOT_DIR}/build-scripts/runtime_lib.cmake)
 
+if (NOT DEFINED SANITIZER)
+  set(SANITIZER "")
+elseif (SANITIZER STREQUAL "ubsan")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O2 -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=alignment" )
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined")
+elseif (NOT (SANITIZER STREQUAL "") )
+  message(SEND_ERROR "Unsupported sanitizer: ${SANITIZER}")
+endif()
+
 add_library(vmlib STATIC ${WAMR_RUNTIME_LIB_SOURCE})
 if (MSVC)
   target_compile_definitions(vmlib PRIVATE WASM_API_EXTERN=)

+ 1 - 1
test-tools/wamr-ide/VSCode-Extension/src/utilities/lldbUtilities.ts

@@ -18,7 +18,7 @@ const LLDB_RESOURCE_DIR = 'resource/debug';
 const LLDB_OS_DOWNLOAD_URL_SUFFIX_MAP: Partial<
     Record<NodeJS.Platform, string>
 > = {
-    linux: 'x86_64-ubuntu-22.04',
+    linux: 'x86_64-ubuntu-20.04',
     darwin: 'universal-macos-latest',
 };
 

+ 2 - 1
tests/benchmarks/polybench/build.sh

@@ -34,7 +34,8 @@ do
                 utilities/polybench.c ${file}                       \
                 -Wl,--export=__heap_base -Wl,--export=__data_end    \
                 -Wl,--export=malloc -Wl,--export=free               \
-                -DPOLYBENCH_TIME -o ${OUT_DIR}/${file_name%.*}.wasm
+                -DPOLYBENCH_TIME -o ${OUT_DIR}/${file_name%.*}.wasm \
+                -D_WASI_EMULATED_PROCESS_CLOCKS
 
         echo "Compile ${file_name%.*}.wasm into ${file_name%.*}.aot"
         ${WAMRC_CMD} -o ${OUT_DIR}/${file_name%.*}.aot \

+ 0 - 8
tests/wamr-test-suites/test_wamr.sh

@@ -444,14 +444,6 @@ function spec_test()
 
     if [[ ${ENABLE_MULTI_THREAD} == 1 ]]; then
         ARGS_FOR_SPEC_TEST+="-p "
-        if [[ $1 == 'fast-jit' ]]; then
-          echo "fast-jit doesn't support multi-thread feature yet, skip it"
-          return
-        fi
-        if [[ $1 == 'multi-tier-jit' ]]; then
-          echo "multi-tier-jit doesn't support multi-thread feature yet, skip it"
-          return
-        fi
     fi
 
     if [[ ${ENABLE_XIP} == 1 ]]; then

+ 11 - 2
wamr-compiler/README.md

@@ -1,7 +1,15 @@
 
 ### Build wamrc AOT compiler
 
-Both wasm binary file and AOT file are supported by iwasm. The wamrc AOT compiler is to compile wasm binary file to AOT file which can also be run by iwasm. Execute following commands to build **wamrc** compiler for Linux:
+Both wasm binary file and AOT file are supported by iwasm. The wamrc AOT compiler is to compile wasm binary file to AOT file which can also be run by iwasm. You can execute following commands to build **wamrc** compiler:
+
+For **Linux**(Ubuntu 20.04 as an example):
+
+First, make sure necessary dependency are installed:
+
+```shell
+sudo apt-get install git build-essential cmake g++-multilib libgcc-9-dev lib32gcc-9-dev ccache 
+```
 
 ```shell
 cd wamr-compiler
@@ -13,6 +21,7 @@ make
 ```
 
 For **Windows**:
+
 ```shell
 cd wamr-compiler
 python build_llvm.py
@@ -20,4 +29,4 @@ mkdir build && cd build
 cmake ..
 cmake --build . --config Release
 # wamrc.exe is generated under .\Release directory
-```
+```

+ 5 - 0
wamr-compiler/main.c

@@ -42,6 +42,8 @@ print_help()
     printf("                              if the option is set:\n");
     printf("                                (1) it is always enabled when `--bounds-checks` is enabled,\n");
     printf("                                (2) else it is enabled/disabled according to the option value\n");
+    printf("  --stack-usage=<file>      Generate a stack-usage file.\n");
+    printf("                              Similarly to `clang -fstack-usage`.\n");
     printf("  --format=<format>         Specifies the format of the output file\n");
     printf("                            The format supported:\n");
     printf("                              aot (default)  AoT file\n");
@@ -204,6 +206,9 @@ main(int argc, char *argv[])
         else if (!strncmp(argv[0], "--stack-bounds-checks=", 22)) {
             option.stack_bounds_checks = (atoi(argv[0] + 22) == 1) ? 1 : 0;
         }
+        else if (!strncmp(argv[0], "--stack-usage=", 14)) {
+            option.stack_usage_file = argv[0] + 14;
+        }
         else if (!strncmp(argv[0], "--format=", 9)) {
             if (argv[0][9] == '\0')
                 PRINT_HELP_AND_EXIT();

Некоторые файлы не были показаны из-за большого количества измененных файлов