فهرست منبع

AOT call stack optimizations (#3773)

- Implement TINY / STANDARD frame modes - TINY mode is only able to keep track of the IP
  and func idx, whereas STANDARD mode provides more capabilities (parameters, stack pointer, etc.).
- Implement FRAME_PER_FUNCTION / FRAME_PER_CALL modes - FRAME_PER_FUNCTION mode adds
  code at the beginning and at the end of each function for allocating / deallocating the stack frame,
  whereas in FRAME_PER_CALL mode the frame is allocated before each call. The exception is a call to
  an imported function, where FRAME_PER_FUNCTION mode also allocates the frame before the
  `call` instruction (as it can't instrument the imported function).

At the moment TINY + FRAME_PER_FUNCTION is automatically enabled when GC and perf
profiling are disabled and the `values` call stack feature is not requested. In all other cases
STANDARD + FRAME_PER_CALL is used.

STANDARD + FRAME_PER_FUNCTION and TINY + FRAME_PER_CALL are currently not
implemented but possible, and might be enabled in the future.

ps. https://github.com/bytecodealliance/wasm-micro-runtime/issues/3758
Marcin Kolny 1 سال پیش
والد
کامیت
cbc2078898

+ 4 - 0
core/iwasm/aot/aot_loader.c

@@ -597,6 +597,10 @@ load_target_info_section(const uint8 *buf, const uint8 *buf_end,
         return false;
     }
 
+#if WASM_ENABLE_DUMP_CALL_STACK != 0
+    module->feature_flags = target_info.feature_flags;
+#endif
+
     /* Finally, check feature flags */
     return check_feature_flags(error_buf, error_buf_size,
                                target_info.feature_flags);

+ 161 - 39
core/iwasm/aot/aot_runtime.c

@@ -4,6 +4,7 @@
  */
 
 #include "aot_runtime.h"
+#include "../compilation/aot_stack_frame.h"
 #include "bh_log.h"
 #include "mem_alloc.h"
 #include "../common/wasm_runtime_common.h"
@@ -72,6 +73,10 @@ bh_static_assert(offsetof(AOTFrame, sp) == sizeof(uintptr_t) * 5);
 bh_static_assert(offsetof(AOTFrame, frame_ref) == sizeof(uintptr_t) * 6);
 bh_static_assert(offsetof(AOTFrame, lp) == sizeof(uintptr_t) * 7);
 
+bh_static_assert(offsetof(AOTTinyFrame, func_index) == sizeof(uint32) * 0);
+bh_static_assert(offsetof(AOTTinyFrame, ip_offset) == sizeof(uint32) * 1);
+bh_static_assert(sizeof(AOTTinyFrame) == sizeof(uint32) * 2);
+
 static void
 set_error_buf(char *error_buf, uint32 error_buf_size, const char *string)
 {
@@ -110,6 +115,55 @@ runtime_malloc(uint64 size, char *error_buf, uint32 error_buf_size)
     return mem;
 }
 
+#if WASM_ENABLE_AOT_STACK_FRAME != 0
+static bool
+is_tiny_frame(WASMExecEnv *exec_env)
+{
+    AOTModule *module =
+        (AOTModule *)((AOTModuleInstance *)exec_env->module_inst)->module;
+
+    return module->feature_flags & WASM_FEATURE_TINY_STACK_FRAME;
+}
+
+static bool
+is_frame_per_function(WASMExecEnv *exec_env)
+{
+    AOTModule *module =
+        (AOTModule *)((AOTModuleInstance *)exec_env->module_inst)->module;
+
+    return module->feature_flags & WASM_FEATURE_FRAME_PER_FUNCTION;
+}
+
+static void *
+get_top_frame(WASMExecEnv *exec_env)
+{
+    if (is_tiny_frame(exec_env)) {
+        return exec_env->wasm_stack.top > exec_env->wasm_stack.bottom
+                   ? exec_env->wasm_stack.top - sizeof(AOTTinyFrame)
+                   : NULL;
+    }
+    else {
+        return exec_env->cur_frame;
+    }
+}
+
+static void *
+get_prev_frame(WASMExecEnv *exec_env, void *cur_frame)
+{
+    bh_assert(cur_frame);
+
+    if (is_tiny_frame(exec_env)) {
+        if ((uint8 *)cur_frame == exec_env->wasm_stack.bottom) {
+            return NULL;
+        }
+        return ((AOTTinyFrame *)cur_frame) - 1;
+    }
+    else {
+        return ((AOTFrame *)cur_frame)->prev_frame;
+    }
+}
+#endif
+
 static bool
 check_global_init_expr(const AOTModule *module, uint32 global_index,
                        char *error_buf, uint32 error_buf_size)
@@ -2265,7 +2319,7 @@ aot_call_function(WASMExecEnv *exec_env, AOTFunctionInstance *function,
         uint32 ext_ret_cell = wasm_get_cell_num(ext_ret_types, ext_ret_count);
         uint64 size;
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
-        struct WASMInterpFrame *prev_frame = exec_env->cur_frame;
+        void *prev_frame = get_top_frame(exec_env);
 #endif
 
         /* Allocate memory all arguments */
@@ -2296,7 +2350,8 @@ aot_call_function(WASMExecEnv *exec_env, AOTFunctionInstance *function,
         }
 
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
-        if (!aot_alloc_frame(exec_env, function->func_index)) {
+        if (!is_frame_per_function(exec_env)
+            && !aot_alloc_frame(exec_env, function->func_index)) {
             if (argv1 != argv1_buf)
                 wasm_runtime_free(argv1);
             return false;
@@ -2324,7 +2379,7 @@ aot_call_function(WASMExecEnv *exec_env, AOTFunctionInstance *function,
         /* Free all frames allocated, note that some frames
            may be allocated in AOT code and haven't been
            freed if exception occurred */
-        while (exec_env->cur_frame != prev_frame)
+        while (get_top_frame(exec_env) != prev_frame)
             aot_free_frame(exec_env);
 #endif
         if (!ret) {
@@ -2367,9 +2422,12 @@ aot_call_function(WASMExecEnv *exec_env, AOTFunctionInstance *function,
     }
     else {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
-        struct WASMInterpFrame *prev_frame = exec_env->cur_frame;
-
-        if (!aot_alloc_frame(exec_env, function->func_index)) {
+        void *prev_frame = get_top_frame(exec_env);
+        /* Only allocate frame for frame-per-call mode; in the
+           frame-per-function mode the frame is allocated at the
+           beginning of the function. */
+        if (!is_frame_per_function(exec_env)
+            && !aot_alloc_frame(exec_env, function->func_index)) {
             return false;
         }
 #endif
@@ -2394,7 +2452,7 @@ aot_call_function(WASMExecEnv *exec_env, AOTFunctionInstance *function,
         /* Free all frames allocated, note that some frames
            may be allocated in AOT code and haven't been
            freed if exception occurred */
-        while (exec_env->cur_frame != prev_frame)
+        while (get_top_frame(exec_env) != prev_frame)
             aot_free_frame(exec_env);
 #endif
 
@@ -2880,7 +2938,7 @@ aot_invoke_native(WASMExecEnv *exec_env, uint32 func_idx, uint32 argc,
             goto fail;
         }
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
-        struct WASMInterpFrame *prev_frame = exec_env->cur_frame;
+        void *prev_frame = get_top_frame(exec_env);
 
         if (!aot_alloc_frame(exec_env, func_idx)) {
             goto fail;
@@ -2894,7 +2952,7 @@ aot_invoke_native(WASMExecEnv *exec_env, uint32 func_idx, uint32 argc,
         /* Free all frames allocated, note that some frames
            may be allocated in AOT code and haven't been
            freed if exception occurred */
-        while (exec_env->cur_frame != prev_frame)
+        while (get_top_frame(exec_env) != prev_frame)
             aot_free_frame(exec_env);
 #endif
     }
@@ -3622,8 +3680,8 @@ get_func_name_from_index(const AOTModuleInstance *module_inst,
           WASM_ENABLE_PERF_PROFILING != 0 */
 
 #if WASM_ENABLE_GC == 0
-bool
-aot_alloc_frame(WASMExecEnv *exec_env, uint32 func_index)
+static bool
+aot_alloc_standard_frame(WASMExecEnv *exec_env, uint32 func_index)
 {
     AOTModuleInstance *module_inst = (AOTModuleInstance *)exec_env->module_inst;
 #if WASM_ENABLE_PERF_PROFILING != 0
@@ -3670,8 +3728,8 @@ aot_alloc_frame(WASMExecEnv *exec_env, uint32 func_index)
 
 #else /* else of WASM_ENABLE_GC == 0 */
 
-bool
-aot_alloc_frame(WASMExecEnv *exec_env, uint32 func_index)
+static bool
+aot_alloc_standard_frame(WASMExecEnv *exec_env, uint32 func_index)
 {
     AOTModuleInstance *module_inst = (AOTModuleInstance *)exec_env->module_inst;
     AOTModule *module = (AOTModule *)module_inst->module;
@@ -3727,11 +3785,48 @@ aot_alloc_frame(WASMExecEnv *exec_env, uint32 func_index)
 }
 #endif /* end of WASM_ENABLE_GC == 0 */
 
+static bool
+aot_alloc_tiny_frame(WASMExecEnv *exec_env, uint32 func_index)
+{
+    AOTTinyFrame *new_frame = (AOTTinyFrame *)exec_env->wasm_stack.top;
+
+    if ((uint8 *)new_frame > exec_env->wasm_stack.top_boundary) {
+        aot_set_exception((WASMModuleInstance *)exec_env->module_inst,
+                          "wasm operand stack overflow");
+        return false;
+    }
+
+    new_frame->func_index = func_index;
+    exec_env->wasm_stack.top += sizeof(AOTTinyFrame);
+    return true;
+}
+
+bool
+aot_alloc_frame(WASMExecEnv *exec_env, uint32 func_index)
+{
+    AOTModule *module =
+        (AOTModule *)((AOTModuleInstance *)exec_env->module_inst)->module;
+
+    if (is_frame_per_function(exec_env)
+        && func_index >= module->import_func_count) {
+        /* in frame-per-function mode the frame is allocated at
+        the beginning of each function, so we only need to allocate
+        the frame here for imported functions */
+        return true;
+    }
+    if (is_tiny_frame(exec_env)) {
+        return aot_alloc_tiny_frame(exec_env, func_index);
+    }
+    else {
+        return aot_alloc_standard_frame(exec_env, func_index);
+    }
+}
+
 static inline void
-aot_free_frame_internal(WASMExecEnv *exec_env)
+aot_free_standard_frame(WASMExecEnv *exec_env)
 {
     AOTFrame *cur_frame = (AOTFrame *)exec_env->cur_frame;
-    AOTFrame *prev_frame = cur_frame->prev_frame;
+    AOTFrame *prev_frame = (AOTFrame *)cur_frame->prev_frame;
 
 #if WASM_ENABLE_PERF_PROFILING != 0
     uint64 time_elapsed =
@@ -3751,13 +3846,24 @@ aot_free_frame_internal(WASMExecEnv *exec_env)
     exec_env->cur_frame = (struct WASMInterpFrame *)prev_frame;
 }
 
+static inline void
+aot_free_tiny_frame(WASMExecEnv *exec_env)
+{
+    exec_env->wasm_stack.top =
+        get_prev_frame(exec_env, exec_env->wasm_stack.top);
+}
+
 void
 aot_free_frame(WASMExecEnv *exec_env)
 {
-    aot_free_frame_internal(exec_env);
+    if (is_tiny_frame(exec_env)) {
+        aot_free_tiny_frame(exec_env);
+    }
+    else {
+        aot_free_standard_frame(exec_env);
+    }
 }
 
-
 void
 aot_frame_update_profile_info(WASMExecEnv *exec_env, bool alloc_frame)
 {
@@ -3806,14 +3912,13 @@ aot_frame_update_profile_info(WASMExecEnv *exec_env, bool alloc_frame)
 bool
 aot_create_call_stack(struct WASMExecEnv *exec_env)
 {
-    AOTFrame *cur_frame = (AOTFrame *)exec_env->cur_frame,
-             *first_frame = cur_frame;
     AOTModuleInstance *module_inst = (AOTModuleInstance *)exec_env->module_inst;
     AOTModule *module = (AOTModule *)module_inst->module;
     uint32 n = 0;
 
-    while (cur_frame) {
-        cur_frame = cur_frame->prev_frame;
+    void *top_frame = get_top_frame(exec_env);
+    while (top_frame) {
+        top_frame = get_prev_frame(exec_env, top_frame);
         n++;
     }
 
@@ -3823,28 +3928,46 @@ aot_create_call_stack(struct WASMExecEnv *exec_env)
         return false;
     }
 
-    cur_frame = first_frame;
-    while (cur_frame) {
+    top_frame = get_top_frame(exec_env);
+    while (n-- > 0) {
+        uint32 func_index, ip_offset;
+        uint32 *lp = NULL;
+#if WASM_ENABLE_GC != 0
+        uint32 *sp = NULL;
+        uint8 *frame_ref = NULL;
+#endif
+        if (is_tiny_frame(exec_env)) {
+            AOTTinyFrame *frame = (AOTTinyFrame *)top_frame;
+            func_index = (uint32)frame->func_index;
+            ip_offset = (uint32)frame->ip_offset;
+        }
+        else {
+            AOTFrame *frame = (AOTFrame *)top_frame;
+            func_index = (uint32)frame->func_index;
+            ip_offset = (uint32)frame->ip_offset;
+            lp = frame->lp;
+#if WASM_ENABLE_GC != 0
+            sp = frame->sp;
+            frame_ref = frame->frame_ref;
+#endif
+        }
         WASMCApiFrame frame = { 0 };
         uint32 max_local_cell_num, max_stack_cell_num;
         uint32 all_cell_num, lp_size;
 
         frame.instance = module_inst;
         frame.module_offset = 0;
-        frame.func_index = (uint32)cur_frame->func_index;
-        frame.func_offset = (uint32)cur_frame->ip_offset;
-        frame.func_name_wp = get_func_name_from_index(
-            module_inst, (uint32)cur_frame->func_index);
-
-        if (cur_frame->func_index >= module->import_func_count) {
-            uint32 aot_func_idx =
-                (uint32)(cur_frame->func_index - module->import_func_count);
+        frame.func_index = func_index;
+        frame.func_offset = ip_offset;
+        frame.func_name_wp = get_func_name_from_index(module_inst, func_index);
+
+        if (func_index >= module->import_func_count) {
+            uint32 aot_func_idx = func_index - module->import_func_count;
             max_local_cell_num = module->max_local_cell_nums[aot_func_idx];
             max_stack_cell_num = module->max_stack_cell_nums[aot_func_idx];
         }
         else {
-            AOTFuncType *func_type =
-                module->import_funcs[cur_frame->func_index].func_type;
+            AOTFuncType *func_type = module->import_funcs[func_index].func_type;
             max_local_cell_num =
                 func_type->param_cell_num > 2 ? func_type->param_cell_num : 2;
             max_stack_cell_num = 0;
@@ -3856,12 +3979,12 @@ aot_create_call_stack(struct WASMExecEnv *exec_env)
 #else
         lp_size = align_uint(all_cell_num * 5, 4);
 #endif
-        if (lp_size > 0) {
+        if (lp_size > 0 && !is_tiny_frame(exec_env)) {
             if (!(frame.lp = wasm_runtime_malloc(lp_size))) {
                 destroy_c_api_frames(module_inst->frames);
                 return false;
             }
-            bh_memcpy_s(frame.lp, lp_size, cur_frame->lp, lp_size);
+            bh_memcpy_s(frame.lp, lp_size, lp, lp_size);
 
 #if WASM_ENABLE_GC != 0
             uint32 local_ref_flags_cell_num =
@@ -3869,9 +3992,8 @@ aot_create_call_stack(struct WASMExecEnv *exec_env)
                     .local_ref_flag_cell_num;
             uint8 *local_ref_flags =
                 module->func_local_ref_flags[frame.func_index].local_ref_flags;
-            frame.sp = frame.lp + (cur_frame->sp - cur_frame->lp);
-            frame.frame_ref = (uint8 *)frame.lp
-                              + (cur_frame->frame_ref - (uint8 *)cur_frame->lp);
+            frame.sp = frame.lp + (sp - lp);
+            frame.frame_ref = (uint8 *)frame.lp + (frame_ref - (uint8 *)lp);
             /* copy local ref flags from AOT module */
             bh_memcpy_s(frame.frame_ref, local_ref_flags_cell_num,
                         local_ref_flags, lp_size);
@@ -3885,7 +4007,7 @@ aot_create_call_stack(struct WASMExecEnv *exec_env)
             return false;
         }
 
-        cur_frame = cur_frame->prev_frame;
+        top_frame = get_prev_frame(exec_env, top_frame);
     }
 
     return true;

+ 8 - 1
core/iwasm/aot/aot_runtime.h

@@ -25,12 +25,15 @@ extern "C" {
 #define WASM_FEATURE_REF_TYPES (1 << 3)
 #define WASM_FEATURE_GARBAGE_COLLECTION (1 << 4)
 #define WASM_FEATURE_EXCEPTION_HANDLING (1 << 5)
-#define WASM_FEATURE_MEMORY64 (1 << 6)
+#define WASM_FEATURE_TINY_STACK_FRAME (1 << 6)
 #define WASM_FEATURE_MULTI_MEMORY (1 << 7)
 #define WASM_FEATURE_DYNAMIC_LINKING (1 << 8)
 #define WASM_FEATURE_COMPONENT_MODEL (1 << 9)
 #define WASM_FEATURE_RELAXED_SIMD (1 << 10)
 #define WASM_FEATURE_FLEXIBLE_VECTORS (1 << 11)
+/* Stack frame is created at the beginning of the function,
+ * and not at the beginning of each function call */
+#define WASM_FEATURE_FRAME_PER_FUNCTION (1 << 12)
 
 typedef enum AOTSectionType {
     AOT_SECTION_TYPE_TARGET_INFO = 0,
@@ -326,6 +329,10 @@ typedef struct AOTModule {
     /* `.data` and `.text` sections merged into one large mmaped section */
     uint8 *merged_data_text_sections;
     uint32 merged_data_text_sections_size;
+
+#if WASM_ENABLE_AOT_STACK_FRAME != 0
+    uint32 feature_flags;
+#endif
 } AOTModule;
 
 #define AOTMemoryInstance WASMMemoryInstance

+ 46 - 8
core/iwasm/compilation/aot_compiler.c

@@ -16,6 +16,7 @@
 #include "aot_emit_parametric.h"
 #include "aot_emit_table.h"
 #include "aot_emit_gc.h"
+#include "aot_stack_frame_comp.h"
 #include "simd/simd_access_lanes.h"
 #include "simd/simd_bitmask_extracts.h"
 #include "simd/simd_bit_shifts.h"
@@ -253,6 +254,13 @@ store_value(AOTCompContext *comp_ctx, LLVMValueRef value, uint8 value_type,
     return true;
 }
 
+void
+aot_call_stack_features_init_default(AOTCallStackFeatures *features)
+{
+    memset(features, 1, sizeof(AOTCallStackFeatures));
+    features->frame_per_function = false;
+}
+
 bool
 aot_frame_store_value(AOTCompContext *comp_ctx, LLVMValueRef value,
                       uint8 value_type, LLVMValueRef cur_frame, uint32 offset)
@@ -573,9 +581,10 @@ aot_gen_commit_values(AOTCompFrame *frame)
     return true;
 }
 
-bool
-aot_gen_commit_ip(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
-                  LLVMValueRef ip_value, bool is_64bit)
+static bool
+aot_standard_frame_gen_commit_ip(AOTCompContext *comp_ctx,
+                                 AOTFuncContext *func_ctx,
+                                 LLVMValueRef ip_value, bool is_64bit)
 {
     LLVMValueRef cur_frame = func_ctx->cur_frame;
     LLVMValueRef value_offset, value_addr, value_ptr;
@@ -613,6 +622,23 @@ aot_gen_commit_ip(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     return true;
 }
 
+bool
+aot_gen_commit_ip(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                  LLVMValueRef ip_value, bool is_64bit)
+{
+    switch (comp_ctx->aux_stack_frame_type) {
+        case AOT_STACK_FRAME_TYPE_STANDARD:
+            return aot_standard_frame_gen_commit_ip(comp_ctx, func_ctx,
+                                                    ip_value, is_64bit);
+        case AOT_STACK_FRAME_TYPE_TINY:
+            return aot_tiny_frame_gen_commit_ip(comp_ctx, func_ctx, ip_value);
+        default:
+            aot_set_last_error(
+                "unsupported mode when generating commit_ip code");
+            return false;
+    }
+}
+
 bool
 aot_gen_commit_sp_ip(AOTCompFrame *frame, bool commit_sp, bool commit_ip)
 {
@@ -962,6 +988,7 @@ static bool
 aot_compile_func(AOTCompContext *comp_ctx, uint32 func_index)
 {
     AOTFuncContext *func_ctx = comp_ctx->func_ctxes[func_index];
+    LLVMValueRef func_index_ref;
     uint8 *frame_ip = func_ctx->aot_func->code, opcode, *p_f32, *p_f64;
     uint8 *frame_ip_end = frame_ip + func_ctx->aot_func->code_size;
     uint8 *param_types = NULL;
@@ -984,16 +1011,27 @@ aot_compile_func(AOTCompContext *comp_ctx, uint32 func_index)
     LLVMMetadataRef location;
 #endif
 
-    if (comp_ctx->enable_aux_stack_frame) {
+    /* Start to translate the opcodes */
+    LLVMPositionBuilderAtEnd(
+        comp_ctx->builder,
+        func_ctx->block_stack.block_list_head->llvm_entry_block);
+
+    if (comp_ctx->aux_stack_frame_type
+        && comp_ctx->call_stack_features.frame_per_function) {
+        INT_CONST(func_index_ref,
+                  func_index + comp_ctx->comp_data->import_func_count, I32_TYPE,
+                  true);
+        if (!aot_alloc_frame_per_function_frame_for_aot_func(comp_ctx, func_ctx,
+                                                             func_index_ref)) {
+            return false;
+        }
+    }
+    if (comp_ctx->aux_stack_frame_type) {
         if (!init_comp_frame(comp_ctx, func_ctx, func_index)) {
             return false;
         }
     }
 
-    /* Start to translate the opcodes */
-    LLVMPositionBuilderAtEnd(
-        comp_ctx->builder,
-        func_ctx->block_stack.block_list_head->llvm_entry_block);
     while (frame_ip < frame_ip_end) {
         opcode = *frame_ip++;
 

+ 9 - 0
core/iwasm/compilation/aot_compiler.h

@@ -661,6 +661,15 @@ set_local_gc_ref(AOTCompFrame *frame, int n, LLVMValueRef value, uint8 ref_type)
 #define F64_CONST(v) LLVMConstReal(F64_TYPE, v)
 #define I8_CONST(v) LLVMConstInt(INT8_TYPE, v, true)
 
+#define INT_CONST(variable, value, type, is_signed)        \
+    do {                                                   \
+        variable = LLVMConstInt(type, value, is_signed);   \
+        if (!variable) {                                   \
+            aot_set_last_error("llvm build const failed"); \
+            return false;                                  \
+        }                                                  \
+    } while (0)
+
 #define LLVM_CONST(name) (comp_ctx->llvm_consts.name)
 #define I1_ZERO LLVM_CONST(i1_zero)
 #define I1_ONE LLVM_CONST(i1_one)

+ 6 - 0
core/iwasm/compilation/aot_emit_aot_file.c

@@ -4433,6 +4433,12 @@ aot_obj_data_create(AOTCompContext *comp_ctx)
     if (comp_ctx->enable_gc) {
         obj_data->target_info.feature_flags |= WASM_FEATURE_GARBAGE_COLLECTION;
     }
+    if (comp_ctx->aux_stack_frame_type == AOT_STACK_FRAME_TYPE_TINY) {
+        obj_data->target_info.feature_flags |= WASM_FEATURE_TINY_STACK_FRAME;
+    }
+    if (comp_ctx->call_stack_features.frame_per_function) {
+        obj_data->target_info.feature_flags |= WASM_FEATURE_FRAME_PER_FUNCTION;
+    }
 
     bh_print_time("Begin to resolve object file info");
 

+ 43 - 7
core/iwasm/compilation/aot_emit_control.c

@@ -6,6 +6,7 @@
 #include "aot_emit_control.h"
 #include "aot_compiler.h"
 #include "aot_emit_exception.h"
+#include "aot_stack_frame_comp.h"
 #if WASM_ENABLE_GC != 0
 #include "aot_emit_gc.h"
 #endif
@@ -38,13 +39,24 @@ format_block_name(char *name, uint32 name_size, uint32 block_index,
         snprintf(name, name_size, "%s", "func_end");
 }
 
-#define CREATE_BLOCK(new_llvm_block, name)                      \
-    do {                                                        \
-        if (!(new_llvm_block = LLVMAppendBasicBlockInContext(   \
-                  comp_ctx->context, func_ctx->func, name))) {  \
-            aot_set_last_error("add LLVM basic block failed."); \
-            goto fail;                                          \
-        }                                                       \
+#define CREATE_BLOCK(new_llvm_block, name)                                   \
+    do {                                                                     \
+        if (!(new_llvm_block = LLVMAppendBasicBlockInContext(                \
+                  comp_ctx->context, func_ctx->func, name))) {               \
+            aot_set_last_error("add LLVM basic block failed.");              \
+            goto fail;                                                       \
+        }                                                                    \
+        if (!strcmp(name, "func_end") && comp_ctx->aux_stack_frame_type      \
+            && comp_ctx->call_stack_features.frame_per_function) {           \
+            LLVMBasicBlockRef cur_block =                                    \
+                LLVMGetInsertBlock(comp_ctx->builder);                       \
+            SET_BUILDER_POS(new_llvm_block);                                 \
+            if (!aot_free_frame_per_function_frame_for_aot_func(comp_ctx,    \
+                                                                func_ctx)) { \
+                goto fail;                                                   \
+            }                                                                \
+            SET_BUILDER_POS(cur_block);                                      \
+        }                                                                    \
     } while (0)
 
 #define CURR_BLOCK() LLVMGetInsertBlock(comp_ctx->builder)
@@ -93,6 +105,11 @@ format_block_name(char *name, uint32 name_size, uint32 block_index,
                 goto fail;                                                  \
             }                                                               \
             SET_BUILDER_POS(block->llvm_end_block);                         \
+            LLVMValueRef first_instr =                                      \
+                get_first_non_phi(block->llvm_end_block);                   \
+            if (first_instr) {                                              \
+                LLVMPositionBuilderBefore(comp_ctx->builder, first_instr);  \
+            }                                                               \
             for (_i = 0; _i < block->result_count; _i++) {                  \
                 if (!(block->result_phis[_i] = LLVMBuildPhi(                \
                           comp_ctx->builder,                                \
@@ -158,6 +175,18 @@ get_target_block(AOTFuncContext *func_ctx, uint32 br_depth)
     return block;
 }
 
+LLVMValueRef
+get_first_non_phi(LLVMBasicBlockRef block)
+{
+    LLVMValueRef instr = LLVMGetFirstInstruction(block);
+
+    while (instr && LLVMIsAPHINode(instr)) {
+        instr = LLVMGetNextInstruction(instr);
+    }
+
+    return instr;
+}
+
 static void
 clear_frame_locals(AOTCompFrame *aot_frame)
 {
@@ -1361,6 +1390,13 @@ aot_compile_op_return(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         (*p_frame_ip - 1) - comp_ctx->comp_data->wasm_module->buf_code);
 #endif
 
+    if (comp_ctx->aux_stack_frame_type
+        && comp_ctx->call_stack_features.frame_per_function
+        && !aot_free_frame_per_function_frame_for_aot_func(comp_ctx,
+                                                           func_ctx)) {
+        return false;
+    }
+
     if (block_func->result_count) {
         /* Store extra result values to function parameters */
         for (i = 0; i < block_func->result_count - 1; i++) {

+ 63 - 15
core/iwasm/compilation/aot_emit_function.c

@@ -7,6 +7,7 @@
 #include "aot_emit_exception.h"
 #include "aot_emit_control.h"
 #include "aot_emit_table.h"
+#include "aot_stack_frame_comp.h"
 #include "../aot/aot_runtime.h"
 #if WASM_ENABLE_GC != 0
 #include "aot_emit_gc.h"
@@ -1403,6 +1404,7 @@ aot_compile_op_call(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     LLVMValueRef *param_values = NULL, value_ret = NULL, func;
     LLVMValueRef import_func_idx, res;
     LLVMValueRef ext_ret, ext_ret_ptr, ext_ret_idx;
+    LLVMValueRef func_idx_ref;
     int32 i, j = 0, param_count, result_count, ext_ret_count;
     uint64 total_size;
     uint8 wasm_ret_type;
@@ -1447,12 +1449,28 @@ aot_compile_op_call(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
             return false;
     }
 
-    if (comp_ctx->enable_aux_stack_frame) {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
-        if (!alloc_frame_for_aot_func(comp_ctx, func_ctx, func_idx))
-            return false;
-#endif
+    if (comp_ctx->aux_stack_frame_type) {
+        if (func_idx < import_func_count
+            && comp_ctx->call_stack_features.frame_per_function) {
+            INT_CONST(func_idx_ref, func_idx, I32_TYPE, true);
+            if (!aot_alloc_frame_per_function_frame_for_aot_func(
+                    comp_ctx, func_ctx, func_idx_ref)) {
+                return false;
+            }
+        }
+        else if (!comp_ctx->call_stack_features.frame_per_function) {
+            if (comp_ctx->aux_stack_frame_type
+                != AOT_STACK_FRAME_TYPE_STANDARD) {
+                aot_set_last_error("unsupported mode");
+                return false;
+            }
+            if (!alloc_frame_for_aot_func(comp_ctx, func_ctx, func_idx)) {
+                return false;
+            }
+        }
     }
+#endif
 
     /* Get param cell number */
     param_cell_num = func_type->param_cell_num;
@@ -1522,7 +1540,7 @@ aot_compile_op_call(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     }
 
     if (func_idx < import_func_count) {
-        if (comp_ctx->enable_aux_stack_frame
+        if (comp_ctx->aux_stack_frame_type == AOT_STACK_FRAME_TYPE_STANDARD
             && !commit_params_to_frame_of_import_func(
                 comp_ctx, func_ctx, func_type, param_values + 1)) {
             goto fail;
@@ -1813,12 +1831,26 @@ aot_compile_op_call(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         }
     }
 
-    if (comp_ctx->enable_aux_stack_frame) {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
-        if (!free_frame_for_aot_func(comp_ctx, func_ctx))
-            goto fail;
-#endif
+    if (comp_ctx->aux_stack_frame_type) {
+        if (func_idx < import_func_count
+            && comp_ctx->call_stack_features.frame_per_function) {
+            if (!aot_free_frame_per_function_frame_for_aot_func(comp_ctx,
+                                                                func_ctx)) {
+                goto fail;
+            }
+        }
+        else if (!comp_ctx->call_stack_features.frame_per_function) {
+            if (comp_ctx->aux_stack_frame_type
+                != AOT_STACK_FRAME_TYPE_STANDARD) {
+                aot_set_last_error("unsupported mode");
+            }
+            if (!free_frame_for_aot_func(comp_ctx, func_ctx)) {
+                goto fail;
+            }
+        }
     }
+#endif
 
     /* Insert suspend check point */
     if (comp_ctx->enable_thread_mgr) {
@@ -2439,7 +2471,8 @@ aot_compile_op_call_indirect(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         goto fail;
     }
 
-    if (comp_ctx->enable_aux_stack_frame) {
+    if (comp_ctx->aux_stack_frame_type
+        && !comp_ctx->call_stack_features.frame_per_function) {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
         /*  TODO: use current frame instead of allocating new frame
                   for WASM_OP_RETURN_CALL_INDIRECT */
@@ -2508,7 +2541,13 @@ aot_compile_op_call_indirect(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     /* Translate call import block */
     LLVMPositionBuilderAtEnd(comp_ctx->builder, block_call_import);
 
-    if (comp_ctx->enable_aux_stack_frame
+    if (comp_ctx->aot_frame && comp_ctx->call_stack_features.frame_per_function
+        && !aot_alloc_frame_per_function_frame_for_aot_func(comp_ctx, func_ctx,
+                                                            func_idx)) {
+        goto fail;
+    }
+
+    if (comp_ctx->aux_stack_frame_type == AOT_STACK_FRAME_TYPE_STANDARD
         && !commit_params_to_frame_of_import_func(comp_ctx, func_ctx, func_type,
                                                   param_values + 1)) {
         goto fail;
@@ -2545,6 +2584,12 @@ aot_compile_op_call_indirect(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         && !check_call_return(comp_ctx, func_ctx, res))
         goto fail;
 
+    if (comp_ctx->aot_frame && comp_ctx->call_stack_features.frame_per_function
+        && !aot_free_frame_per_function_frame_for_aot_func(comp_ctx,
+                                                           func_ctx)) {
+        goto fail;
+    }
+
     block_curr = LLVMGetInsertBlock(comp_ctx->builder);
     for (i = 0; i < func_result_count; i++) {
         LLVMAddIncoming(result_phis[i], &value_rets[i], &block_curr, 1);
@@ -2629,7 +2674,8 @@ aot_compile_op_call_indirect(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         PUSH(result_phis[i], func_type->types[func_param_count + i]);
     }
 
-    if (comp_ctx->enable_aux_stack_frame) {
+    if (comp_ctx->aux_stack_frame_type
+        && !comp_ctx->call_stack_features.frame_per_function) {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
         if (!free_frame_for_aot_func(comp_ctx, func_ctx))
             goto fail;
@@ -2936,7 +2982,8 @@ aot_compile_op_call_ref(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         goto fail;
     }
 
-    if (comp_ctx->enable_aux_stack_frame) {
+    if (comp_ctx->aux_stack_frame_type
+        && !comp_ctx->call_stack_features.frame_per_function) {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
         /*  TODO: use current frame instead of allocating new frame
                   for WASM_OP_RETURN_CALL_REF */
@@ -3005,7 +3052,7 @@ aot_compile_op_call_ref(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     /* Translate call import block */
     LLVMPositionBuilderAtEnd(comp_ctx->builder, block_call_import);
 
-    if (comp_ctx->enable_aux_stack_frame
+    if (comp_ctx->aux_stack_frame_type == AOT_STACK_FRAME_TYPE_STANDARD
         && !commit_params_to_frame_of_import_func(comp_ctx, func_ctx, func_type,
                                                   param_values + 1)) {
         goto fail;
@@ -3133,7 +3180,8 @@ aot_compile_op_call_ref(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         PUSH(result_phis[i], func_type->types[func_param_count + i]);
     }
 
-    if (comp_ctx->enable_aux_stack_frame) {
+    if (comp_ctx->aux_stack_frame_type
+        && !comp_ctx->call_stack_features.frame_per_function) {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
         if (!free_frame_for_aot_func(comp_ctx, func_ctx))
             goto fail;

+ 2 - 4
core/iwasm/compilation/aot_llvm.c

@@ -1771,7 +1771,7 @@ aot_create_func_context(const AOTCompData *comp_data, AOTCompContext *comp_ctx,
         goto fail;
     }
 
-    if (comp_ctx->enable_aux_stack_frame
+    if (comp_ctx->aux_stack_frame_type
         && !create_aux_stack_frame(comp_ctx, func_ctx)) {
         goto fail;
     }
@@ -2577,9 +2577,7 @@ aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option)
     if (option->enable_ref_types)
         comp_ctx->enable_ref_types = true;
 
-    if (option->enable_aux_stack_frame)
-        comp_ctx->enable_aux_stack_frame = true;
-
+    comp_ctx->aux_stack_frame_type = option->aux_stack_frame_type;
     comp_ctx->call_stack_features = option->call_stack_features;
 
     if (option->enable_perf_profiling)

+ 1 - 1
core/iwasm/compilation/aot_llvm.h

@@ -410,7 +410,7 @@ typedef struct AOTCompContext {
     bool enable_aux_stack_check;
 
     /* Generate auxiliary stack frame */
-    bool enable_aux_stack_frame;
+    AOTStackFrameType aux_stack_frame_type;
 
     /* Auxiliary call stack features */
     AOTCallStackFeatures call_stack_features;

+ 27 - 0
core/iwasm/compilation/aot_stack_frame.h

@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2024 Amazon Inc.  All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _AOT_STACK_FRAME_H_
+#define _AOT_STACK_FRAME_H_
+
+#include "platform_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Minimal per-function call stack frame used in the TINY stack frame mode.
+ *
+ * The frame is exactly two uint32 slots. The field order is relied upon by
+ * the code generator in aot_stack_frame_comp.c: func_index is written at the
+ * start of the frame when the frame is allocated, and the instruction
+ * pointer is written 4 bytes below the stack top, i.e. into the second slot.
+ */
+typedef struct {
+    /* The non-imported function index of current function */
+    uint32 func_index;
+
+    /* Instruction pointer: offset to the bytecode array */
+    uint32 ip_offset;
+} AOTTinyFrame;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

+ 148 - 0
core/iwasm/compilation/aot_stack_frame_comp.c

@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2024 Amazon Inc.  All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+#include "aot_stack_frame_comp.h"
+#include "aot_emit_exception.h"
+
+/*
+ * Helper macros for emitting LLVM IR in this file.
+ *
+ * All of them expect a variable named `comp_ctx` to be in scope, and all of
+ * them contain a `return false`, so they may only be used inside functions
+ * returning bool. On failure they record a message with aot_set_last_error()
+ * before returning.
+ */
+
+/*
+ * Build an in-bounds GEP and assign the result to `variable`.
+ * The stringified name of `variable` is used as the IR value name.
+ */
+#define ADD_IN_BOUNDS_GEP(variable, type, pointer, indices, num_indices)     \
+    do {                                                                     \
+        if (!(variable =                                                     \
+                  LLVMBuildInBoundsGEP2(comp_ctx->builder, type, pointer,    \
+                                        indices, num_indices, #variable))) { \
+            aot_set_last_error("llvm build in bounds gep failed");           \
+            return false;                                                    \
+        }                                                                    \
+    } while (0)
+
+/* Build a store of `value` to the address `pointer`. */
+#define ADD_STORE(value, pointer)                                 \
+    do {                                                          \
+        if (!LLVMBuildStore(comp_ctx->builder, value, pointer)) { \
+            aot_set_last_error("llvm build store failed");        \
+            return false;                                         \
+        }                                                         \
+    } while (0)
+
+/*
+ * Build a load of a `type` value from `pointer` and assign it to `value`.
+ * The stringified name of `value` is used as the IR value name.
+ */
+#define ADD_LOAD(value, type, pointer)                                         \
+    do {                                                                       \
+        if (!(value =                                                          \
+                  LLVMBuildLoad2(comp_ctx->builder, type, pointer, #value))) { \
+            aot_set_last_error("llvm build load failed");                      \
+            return false;                                                      \
+        }                                                                      \
+    } while (0)
+
+/*
+ * Emit IR that pushes a new AOTTinyFrame onto the wasm stack.
+ *
+ * The generated code loads the current stack top, optionally checks it
+ * against the stack boundary, stores func_index at the old top and then
+ * advances the stack top by sizeof(AOTTinyFrame). The second slot of the
+ * frame (the instruction pointer) is not written here.
+ *
+ * @param comp_ctx the compilation context
+ * @param func_ctx the context of the function being compiled
+ * @param func_index IR value holding the function index to record
+ *
+ * @return true on success, false if building any instruction failed
+ */
+static bool
+aot_alloc_tiny_frame_for_aot_func(AOTCompContext *comp_ctx,
+                                  AOTFuncContext *func_ctx,
+                                  LLVMValueRef func_index)
+{
+    LLVMValueRef wasm_stack_top_ptr = func_ctx->wasm_stack_top_ptr,
+                 wasm_stack_top_bound = func_ctx->wasm_stack_top_bound,
+                 wasm_stack_top, cmp;
+    LLVMBasicBlockRef check_wasm_stack_succ;
+    LLVMValueRef offset;
+
+    ADD_LOAD(wasm_stack_top, INT8_PTR_TYPE, wasm_stack_top_ptr);
+
+    /* The overflow check is only emitted when the bounds_checks call stack
+       feature is enabled */
+    if (comp_ctx->call_stack_features.bounds_checks) {
+        if (!(check_wasm_stack_succ = LLVMAppendBasicBlockInContext(
+                  comp_ctx->context, func_ctx->func,
+                  "check_wasm_stack_succ"))) {
+            aot_set_last_error("llvm add basic block failed.");
+            return false;
+        }
+
+        LLVMMoveBasicBlockAfter(check_wasm_stack_succ,
+                                LLVMGetInsertBlock(comp_ctx->builder));
+
+        /* NOTE(review): this compares the current top against the bound,
+           not top + sizeof(AOTTinyFrame); confirm that the boundary leaves
+           room for a whole frame when top is just below it */
+        if (!(cmp = LLVMBuildICmp(comp_ctx->builder, LLVMIntUGE, wasm_stack_top,
+                                  wasm_stack_top_bound, "cmp"))) {
+            aot_set_last_error("llvm build icmp failed");
+            return false;
+        }
+
+        /* Presumably raises the exception when cmp is true and continues
+           emitting code in check_wasm_stack_succ otherwise - verify against
+           aot_emit_exception() */
+        if (!(aot_emit_exception(comp_ctx, func_ctx,
+                                 EXCE_OPERAND_STACK_OVERFLOW, true, cmp,
+                                 check_wasm_stack_succ))) {
+            return false;
+        }
+    }
+
+    /* Save the func_idx on the top of the stack */
+    ADD_STORE(func_index, wasm_stack_top);
+
+    /* increment the stack pointer */
+    INT_CONST(offset, sizeof(AOTTinyFrame), I32_TYPE, true);
+    ADD_IN_BOUNDS_GEP(wasm_stack_top, INT8_TYPE, wasm_stack_top, &offset, 1);
+    ADD_STORE(wasm_stack_top, wasm_stack_top_ptr);
+
+    return true;
+}
+
+/*
+ * Emit IR that pops the topmost AOTTinyFrame from the wasm stack.
+ *
+ * The generated code loads the current stack top, moves it back by
+ * sizeof(AOTTinyFrame) bytes and stores the result. The contents of the
+ * popped frame are left untouched.
+ *
+ * @param comp_ctx the compilation context
+ * @param func_ctx the context of the function being compiled
+ *
+ * @return true on success, false if building any instruction failed
+ */
+static bool
+aot_free_tiny_frame_for_aot_func(AOTCompContext *comp_ctx,
+                                 AOTFuncContext *func_ctx)
+{
+    LLVMValueRef wasm_stack_top_ptr = func_ctx->wasm_stack_top_ptr,
+                 wasm_stack_top;
+    LLVMValueRef offset;
+
+    ADD_LOAD(wasm_stack_top, INT8_PTR_TYPE, wasm_stack_top_ptr);
+
+    /* Negate the size as a signed int: negating the size_t directly gives
+       a large unsigned value which, on a host with a 32-bit size_t, would
+       not sign-extend to a negative 64-bit offset (assumes INT_CONST widens
+       the value to 64 bits before building the constant - TODO confirm) */
+    INT_CONST(offset, -(int)sizeof(AOTTinyFrame),
+              comp_ctx->pointer_size == 8 ? I64_TYPE : I32_TYPE, true);
+    ADD_IN_BOUNDS_GEP(wasm_stack_top, INT8_TYPE, wasm_stack_top, &offset, 1);
+    ADD_STORE(wasm_stack_top, wasm_stack_top_ptr);
+
+    return true;
+}
+
+/*
+ * Emit IR that records the instruction pointer in the topmost tiny frame.
+ *
+ * The generated code loads the current wasm stack top and stores ip_value
+ * 4 bytes below it.
+ *
+ * @param comp_ctx the compilation context
+ * @param func_ctx the context of the function being compiled
+ * @param ip_value IR value to store, must not be NULL
+ *                 (presumably an i32 matching AOTTinyFrame.ip_offset -
+ *                 verify against callers)
+ *
+ * @return true on success, false if building any instruction failed
+ */
+bool
+aot_tiny_frame_gen_commit_ip(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                             LLVMValueRef ip_value)
+{
+    LLVMValueRef wasm_stack_top_ptr = func_ctx->wasm_stack_top_ptr,
+                 wasm_stack_top;
+    LLVMValueRef offset, ip_addr;
+
+    bh_assert(ip_value);
+
+    ADD_LOAD(wasm_stack_top, INT8_PTR_TYPE, wasm_stack_top_ptr);
+
+    /* The stack top points just past the current AOTTinyFrame, whose last
+       4 bytes are the uint32 ip_offset field (assuming the two uint32
+       fields are laid out without padding), hence the offset of -4 */
+    INT_CONST(offset, -4, comp_ctx->pointer_size == 8 ? I64_TYPE : I32_TYPE,
+              true);
+    ADD_IN_BOUNDS_GEP(ip_addr, INT8_TYPE, wasm_stack_top, &offset, 1);
+
+    ADD_STORE(ip_value, ip_addr);
+
+    return true;
+}
+
+/*
+ * Emit the frame allocation code for the frame-per-function mode.
+ *
+ * Only the TINY stack frame type is handled; for any other type the last
+ * error is set to "unsupported mode" and false is returned.
+ */
+bool
+aot_alloc_frame_per_function_frame_for_aot_func(AOTCompContext *comp_ctx,
+                                                AOTFuncContext *func_ctx,
+                                                LLVMValueRef func_index)
+{
+    if (comp_ctx->aux_stack_frame_type != AOT_STACK_FRAME_TYPE_TINY) {
+        aot_set_last_error("unsupported mode");
+        return false;
+    }
+
+    return aot_alloc_tiny_frame_for_aot_func(comp_ctx, func_ctx,
+                                             func_index);
+}
+
+/*
+ * Emit the frame deallocation code for the frame-per-function mode.
+ *
+ * Only the TINY stack frame type is handled; for any other type the last
+ * error is set to "unsupported mode" and false is returned.
+ */
+bool
+aot_free_frame_per_function_frame_for_aot_func(AOTCompContext *comp_ctx,
+                                               AOTFuncContext *func_ctx)
+{
+    if (comp_ctx->aux_stack_frame_type != AOT_STACK_FRAME_TYPE_TINY) {
+        aot_set_last_error("unsupported mode");
+        return false;
+    }
+
+    return aot_free_tiny_frame_for_aot_func(comp_ctx, func_ctx);
+}

+ 33 - 0
core/iwasm/compilation/aot_stack_frame_comp.h

@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2024 Amazon Inc.  All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _AOT_STACK_FRAME_COMP_H_
+#define _AOT_STACK_FRAME_COMP_H_
+
+#include "aot_stack_frame.h"
+#include "aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Emit the code that allocates a stack frame in the frame-per-function
+ * mode. Currently only the TINY stack frame type is supported; other types
+ * fail with the last error set to "unsupported mode".
+ *
+ * @param comp_ctx the compilation context
+ * @param func_ctx the context of the function being compiled
+ * @param func_index IR value holding the function index stored in the frame
+ *
+ * @return true if success, false otherwise
+ */
+bool
+aot_alloc_frame_per_function_frame_for_aot_func(AOTCompContext *comp_ctx,
+                                                AOTFuncContext *func_ctx,
+                                                LLVMValueRef func_index);
+
+/**
+ * Emit the code that releases a stack frame in the frame-per-function
+ * mode. Currently only the TINY stack frame type is supported; other types
+ * fail with the last error set to "unsupported mode".
+ *
+ * @param comp_ctx the compilation context
+ * @param func_ctx the context of the function being compiled
+ *
+ * @return true if success, false otherwise
+ */
+bool
+aot_free_frame_per_function_frame_for_aot_func(AOTCompContext *comp_ctx,
+                                               AOTFuncContext *func_ctx);
+
+/**
+ * Emit the code that stores the instruction pointer value into the
+ * topmost tiny frame (4 bytes below the current wasm stack top).
+ *
+ * @param comp_ctx the compilation context
+ * @param func_ctx the context of the function being compiled
+ * @param ip_value IR value to store, must not be NULL
+ *
+ * @return true if success, false otherwise
+ */
+bool
+aot_tiny_frame_gen_commit_ip(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                             LLVMValueRef ip_value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

+ 17 - 1
core/iwasm/include/aot_comp_option.h

@@ -21,8 +21,24 @@ typedef struct {
 
     /* Enables or disables parameters, locals and stack operands. */
     bool values;
+
+    /* If enabled, stack frame is generated at the beginning of each
+     * function (frame-per-function mode). Otherwise, stack frame is
+     * generated before each call of a function (frame-per-call mode). */
+    bool frame_per_function;
 } AOTCallStackFeatures;
 
+void
+aot_call_stack_features_init_default(AOTCallStackFeatures *features);
+
+/* Kind of auxiliary stack frame generated by the AOT compiler */
+typedef enum {
+    /* No stack frame is generated; kept equal to 0 so that the value can be
+     * tested directly in a condition to check whether frames are enabled */
+    AOT_STACK_FRAME_OFF = 0,
+    /* Use a small stack frame data structure (AOTTinyFrame) */
+    AOT_STACK_FRAME_TYPE_TINY,
+    /* Use a regular stack frame data structure (AOTFrame) */
+    AOT_STACK_FRAME_TYPE_STANDARD,
+} AOTStackFrameType;
+
 typedef struct AOTCompOption {
     bool is_jit_mode;
     bool is_indirect_mode;
@@ -38,7 +54,7 @@ typedef struct AOTCompOption {
     bool enable_ref_types;
     bool enable_gc;
     bool enable_aux_stack_check;
-    bool enable_aux_stack_frame;
+    AOTStackFrameType aux_stack_frame_type;
     AOTCallStackFeatures call_stack_features;
     bool enable_perf_profiling;
     bool enable_memory_profiling;

+ 2 - 2
core/iwasm/interpreter/wasm_loader.c

@@ -5406,8 +5406,8 @@ init_llvm_jit_functions_stage1(WASMModule *module, char *error_buf,
     option.enable_aux_stack_check = true;
 #if WASM_ENABLE_PERF_PROFILING != 0 || WASM_ENABLE_DUMP_CALL_STACK != 0 \
     || WASM_ENABLE_AOT_STACK_FRAME != 0
-    option.enable_aux_stack_frame = true;
-    memset(&option.call_stack_features, 1, sizeof(AOTCallStackFeatures));
+    option.aux_stack_frame_type = AOT_STACK_FRAME_TYPE_STANDARD;
+    aot_call_stack_features_init_default(&option.call_stack_features);
 #endif
 #if WASM_ENABLE_PERF_PROFILING != 0
     option.enable_perf_profiling = true;

+ 2 - 2
core/iwasm/interpreter/wasm_mini_loader.c

@@ -2148,8 +2148,8 @@ init_llvm_jit_functions_stage1(WASMModule *module, char *error_buf,
     option.enable_aux_stack_check = true;
 #if WASM_ENABLE_PERF_PROFILING != 0 || WASM_ENABLE_DUMP_CALL_STACK != 0 \
     || WASM_ENABLE_AOT_STACK_FRAME != 0
-    option.enable_aux_stack_frame = true;
-    memset(&option.call_stack_features, 1, sizeof(AOTCallStackFeatures));
+    option.aux_stack_frame_type = AOT_STACK_FRAME_TYPE_STANDARD;
+    aot_call_stack_features_init_default(&option.call_stack_features);
 #endif
 #if WASM_ENABLE_PERF_PROFILING != 0
     option.enable_perf_profiling = true;

+ 19 - 6
wamr-compiler/main.c

@@ -307,6 +307,13 @@ finish:
     return ret;
 }
 
+/*
+ * Tell whether the given options allow the tiny stack frame to be used:
+ * this is the case only when the `values` call stack feature, GC and perf
+ * profiling are all disabled.
+ */
+static bool
+can_enable_tiny_frame(const AOTCompOption *opt)
+{
+    if (opt->call_stack_features.values || opt->enable_gc
+        || opt->enable_perf_profiling) {
+        return false;
+    }
+
+    return true;
+}
+
 static uint32
 resolve_segue_flags(char *str_flags)
 {
@@ -403,9 +410,7 @@ main(int argc, char *argv[])
     option.enable_bulk_memory = true;
     option.enable_ref_types = true;
     option.enable_gc = false;
-
-    /* Set all the features to true by default */
-    memset(&option.call_stack_features, 1, sizeof(AOTCallStackFeatures));
+    aot_call_stack_features_init_default(&option.call_stack_features);
 
     /* Process options */
     for (argc--, argv++; argc > 0 && argv[0][0] == '-'; argc--, argv++) {
@@ -519,7 +524,7 @@ main(int argc, char *argv[])
             option.enable_aux_stack_check = false;
         }
         else if (!strcmp(argv[0], "--enable-dump-call-stack")) {
-            option.enable_aux_stack_frame = true;
+            option.aux_stack_frame_type = AOT_STACK_FRAME_TYPE_STANDARD;
         }
         else if (!strncmp(argv[0], "--call-stack-features=", 22)) {
             /* Reset all the features, only enable the user-defined ones */
@@ -535,7 +540,7 @@ main(int argc, char *argv[])
             }
         }
         else if (!strcmp(argv[0], "--enable-perf-profiling")) {
-            option.enable_aux_stack_frame = true;
+            option.aux_stack_frame_type = AOT_STACK_FRAME_TYPE_STANDARD;
             option.enable_perf_profiling = true;
         }
         else if (!strcmp(argv[0], "--enable-memory-profiling")) {
@@ -550,7 +555,7 @@ main(int argc, char *argv[])
             option.is_indirect_mode = true;
         }
         else if (!strcmp(argv[0], "--enable-gc")) {
-            option.enable_aux_stack_frame = true;
+            option.aux_stack_frame_type = AOT_STACK_FRAME_TYPE_STANDARD;
             option.enable_gc = true;
         }
         else if (!strcmp(argv[0], "--disable-llvm-intrinsics")) {
@@ -652,6 +657,14 @@ main(int argc, char *argv[])
     if (!use_dummy_wasm && (argc == 0 || !out_file_name))
         PRINT_HELP_AND_EXIT();
 
+    if (option.aux_stack_frame_type == AOT_STACK_FRAME_TYPE_STANDARD
+        && can_enable_tiny_frame(&option)) {
+        LOG_VERBOSE("Use tiny frame mode for stack frames");
+        option.aux_stack_frame_type = AOT_STACK_FRAME_TYPE_TINY;
+        /* for now we only enable frame per function for a TINY frame mode */
+        option.call_stack_features.frame_per_function = true;
+    }
+
     if (!size_level_set) {
         /**
          * Set opt level to 1 by default for Windows and MacOS as