Procházet zdrojové kódy

Implement the segue optimization for LLVM AOT/JIT (#2230)

Segue is an optimization technology which uses an x86 segment register to store
the WebAssembly linear memory base address, so as to remove most of the cost
of SFI (Software-based Fault Isolation) base addition and free up a
general-purpose register. In this way it may:
- Improve the performance of JIT/AOT
- Reduce the footprint of JIT/AOT, the JIT/AOT code generated is smaller
- Reduce the compilation time of JIT/AOT

This PR uses the x86-64 GS segment register to apply the optimization. Currently
it supports the linux and linux-sgx platforms on the x86-64 target. It is disabled
by default; developers can use the options below to enable it for wamrc and
iwasm (with LLVM JIT enabled):
```bash
wamrc --enable-segue=[<flags>] -o output_file wasm_file
iwasm --enable-segue=[<flags>] wasm_file [args...]
```
`flags` can be:
    i32.load, i64.load, f32.load, f64.load, v128.load,
    i32.store, i64.store, f32.store, f64.store, v128.store
Use commas to separate them, e.g. `--enable-segue=i32.load,i64.store`;
a bare `--enable-segue` means all flags are added.

Acknowledgement:
Many thanks to Intel Labs, UC San Diego and UT Austin teams for introducing this
technology and the great support and guidance!

Signed-off-by: Wenyong Huang <wenyong.huang@intel.com>
Co-authored-by: Vahldiek-oberwagner, Anjo Lucas <anjo.lucas.vahldiek-oberwagner@intel.com>
Wenyong Huang před 2 roky
rodič
revize
76be848ec3
42 změnil soubory, kde provedl 1864 přidání a 123 odebrání
  1. 10 0
      ATTRIBUTIONS.md
  2. 10 0
      core/iwasm/aot/aot_loader.c
  3. 18 0
      core/iwasm/aot/aot_runtime.c
  4. 6 1
      core/iwasm/common/wasm_memory.c
  5. 2 1
      core/iwasm/common/wasm_runtime_common.c
  6. 1 0
      core/iwasm/common/wasm_runtime_common.h
  7. 8 0
      core/iwasm/compilation/aot_compiler.h
  8. 185 48
      core/iwasm/compilation/aot_emit_memory.c
  9. 1 1
      core/iwasm/compilation/aot_emit_memory.h
  10. 53 0
      core/iwasm/compilation/aot_llvm.c
  11. 22 0
      core/iwasm/compilation/aot_llvm.h
  12. 52 22
      core/iwasm/compilation/simd/simd_load_store.c
  13. 1 0
      core/iwasm/include/aot_export.h
  14. 7 4
      core/iwasm/include/wasm_export.h
  15. 9 0
      core/iwasm/interpreter/wasm_interp_classic.c
  16. 9 0
      core/iwasm/interpreter/wasm_interp_fast.c
  17. 2 1
      core/iwasm/interpreter/wasm_loader.c
  18. 2 1
      core/iwasm/interpreter/wasm_mini_loader.c
  19. 14 0
      core/shared/platform/linux-sgx/platform_internal.h
  20. 14 0
      core/shared/platform/linux/platform_internal.h
  21. 77 5
      product-mini/platforms/posix/main.c
  22. 7 0
      tests/benchmarks/coremark/build.sh
  23. 10 3
      tests/benchmarks/coremark/run.sh
  24. 7 0
      tests/benchmarks/dhrystone/LICENSE
  25. 24 0
      tests/benchmarks/dhrystone/build.sh
  26. 306 0
      tests/benchmarks/dhrystone/include/dhry.h
  27. 19 0
      tests/benchmarks/dhrystone/run.sh
  28. 485 0
      tests/benchmarks/dhrystone/src/dhry_1.c
  29. 187 0
      tests/benchmarks/dhrystone/src/dhry_2.c
  30. 113 13
      tests/benchmarks/jetstream/build.sh
  31. 9 6
      tests/benchmarks/jetstream/jetstream.patch
  32. 15 3
      tests/benchmarks/jetstream/run_aot.sh
  33. 24 0
      tests/benchmarks/jetstream/tsf.patch
  34. 10 1
      tests/benchmarks/libsodium/build.sh
  35. 39 6
      tests/benchmarks/libsodium/run_aot.sh
  36. 8 0
      tests/benchmarks/polybench/build.sh
  37. 11 1
      tests/benchmarks/polybench/run_aot.sh
  38. 1 1
      tests/benchmarks/polybench/run_interp.sh
  39. 6 1
      tests/benchmarks/sightglass/build.sh
  40. 11 1
      tests/benchmarks/sightglass/run_aot.sh
  41. 2 2
      tests/benchmarks/sightglass/run_interp.sh
  42. 67 1
      wamr-compiler/main.c

+ 10 - 0
ATTRIBUTIONS.md

@@ -16,6 +16,7 @@ WAMR project reused some components from other open source project:
 - **asmjit**: for the Fast JIT x86-64 codegen implementation
 - **zydis**: for the Fast JIT x86-64 codegen implementation
 - **NuttX ELF headers**: used in core/iwasm/aot/debug/elf_parser.c
+- **Dhrystone**: for the test benchmark dhrystone
 
 The WAMR fast interpreter is a clean room development. We would acknowledge the inspirations by [WASM3](https://github.com/wasm3/wasm3) open source project for the approach of pre-calculated oprand stack location.
 
@@ -35,6 +36,7 @@ The WAMR fast interpreter is a clean room development. We would acknowledge the
 | asmjit | unspecified | unspecified | https://github.com/asmjit/asmjit | |
 | zydis | unspecified | e14a07895136182a5b53e181eec3b1c6e0b434de | https://github.com/zyantific/zydis | |
 | NuttX ELF headers | 72313301e23f9c2de969fb64b9a0f67bb4c284df | 10.3.0 | https://github.com/apache/incubator-nuttx | |
+| Dhrystone | 2.1 | 2.1 | https://fossies.org/linux/privat/old/ | |
 
 ## Licenses
 
@@ -81,15 +83,19 @@ The WAMR fast interpreter is a clean room development. We would acknowledge the
 [LICENSE](./tests/wamr-test-suites/spec-test-script/LICENSE)
 
 ### libuv
+
 [LICENSE](./core/iwasm/libraries/libc-uvwasi/LICENSE_LIBUV)
 
 ### uvwasi
+
 [LICENSE](./core/iwasm/libraries/libc-uvwasi/LICENSE_UVWASI)
 
 ### asmjit
+
 [LICENSE](./core/iwasm/fast-jit/cg/LICENSE_ASMJIT)
 
 ### zydis
+
 [LICENSE](./core/iwasm/fast-jit/cg/LICENSE_ZYDIS)
 
 ### NuttX ELF headers
@@ -97,3 +103,7 @@ The WAMR fast interpreter is a clean room development. We would acknowledge the
 [LICENSE](./core/iwasm/aot/debug/LICENSE_NUTTX)
 
 [NOTICE](./core/iwasm/aot/debug/NOTICE_NUTTX)
+
+### Dhrystone
+
+[LICENSE](./tests/benchmarks/dhrystone/LICENSE)

+ 10 - 0
core/iwasm/aot/aot_loader.c

@@ -2889,6 +2889,16 @@ load(const uint8 *buf, uint32 size, AOTModule *module, char *error_buf,
            module->code and will be destroyed in aot_unload() */
         destroy_sections(section_list, false);
     }
+
+#if 0
+    {
+        uint32 i;
+        for (i = 0; i < module->func_count; i++) {
+            os_printf("AOT func %u, addr: %p\n", i, module->func_ptrs[i]);
+        }
+    }
+#endif
+
     return ret;
 fail:
     return false;

+ 18 - 0
core/iwasm/aot/aot_runtime.c

@@ -1015,6 +1015,15 @@ execute_post_instantiate_functions(AOTModuleInstance *module_inst,
         }
     }
 
+#if defined(os_writegsbase)
+    {
+        AOTMemoryInstance *memory_inst = aot_get_default_memory(module_inst);
+        if (memory_inst)
+            /* write base addr of linear memory to GS segment register */
+            os_writegsbase(memory_inst->memory_data);
+    }
+#endif
+
     /* Execute start function for both main insance and sub instance */
     if (module->start_function) {
         AOTFunctionInstance start_func = { 0 };
@@ -1453,6 +1462,15 @@ aot_call_function(WASMExecEnv *exec_env, AOTFunctionInstance *function,
     }
     argc = func_type->param_cell_num;
 
+#if defined(os_writegsbase)
+    {
+        AOTMemoryInstance *memory_inst = aot_get_default_memory(module_inst);
+        if (memory_inst)
+            /* write base addr of linear memory to GS segment register */
+            os_writegsbase(memory_inst->memory_data);
+    }
+#endif
+
     /* func pointer was looked up previously */
     bh_assert(function->u.func.func_ptr != NULL);
 

+ 6 - 1
core/iwasm/common/wasm_memory.c

@@ -624,6 +624,11 @@ wasm_enlarge_memory_internal(WASMModuleInstance *module, uint32 inc_page_count)
 #endif
 #endif
 
+#if defined(os_writegsbase)
+    /* write base addr of linear memory to GS segment register */
+    os_writegsbase(memory_data_new);
+#endif
+
     return ret;
 }
 #else
@@ -756,4 +761,4 @@ wasm_get_linear_memory_size(WASMMemoryInstance *memory, void *node)
 #endif
     return linear_mem_size;
 }
-#endif
+#endif

+ 2 - 1
core/iwasm/common/wasm_runtime_common.c

@@ -130,7 +130,7 @@ static JitCompOptions jit_options = { 0 };
 #endif
 
 #if WASM_ENABLE_JIT != 0
-static LLVMJITOptions llvm_jit_options = { 3, 3 };
+static LLVMJITOptions llvm_jit_options = { 3, 3, 0 };
 #endif
 
 static RunningMode runtime_running_mode = Mode_Default;
@@ -554,6 +554,7 @@ wasm_runtime_full_init(RuntimeInitArgs *init_args)
 #if WASM_ENABLE_JIT != 0
     llvm_jit_options.size_level = init_args->llvm_jit_size_level;
     llvm_jit_options.opt_level = init_args->llvm_jit_opt_level;
+    llvm_jit_options.segue_flags = init_args->segue_flags;
 #endif
 
     if (!wasm_runtime_env_init()) {

+ 1 - 0
core/iwasm/common/wasm_runtime_common.h

@@ -420,6 +420,7 @@ typedef struct wasm_frame_t {
 typedef struct LLVMJITOptions {
     uint32 opt_level;
     uint32 size_level;
+    uint32 segue_flags;
 } LLVMJITOptions;
 #endif
 

+ 8 - 0
core/iwasm/compilation/aot_compiler.h

@@ -239,6 +239,13 @@ check_type_compatible(uint8 src_type, uint8 dst_type)
 #define FUNC_REF_TYPE comp_ctx->basic_types.funcref_type
 #define EXTERN_REF_TYPE comp_ctx->basic_types.externref_type
 
+#define INT8_PTR_TYPE_GS comp_ctx->basic_types.int8_ptr_type_gs
+#define INT16_PTR_TYPE_GS comp_ctx->basic_types.int16_ptr_type_gs
+#define INT32_PTR_TYPE_GS comp_ctx->basic_types.int32_ptr_type_gs
+#define INT64_PTR_TYPE_GS comp_ctx->basic_types.int64_ptr_type_gs
+#define F32_PTR_TYPE_GS comp_ctx->basic_types.float32_ptr_type_gs
+#define F64_PTR_TYPE_GS comp_ctx->basic_types.float64_ptr_type_gs
+
 #define I32_CONST(v) LLVMConstInt(I32_TYPE, v, true)
 #define I64_CONST(v) LLVMConstInt(I64_TYPE, v, true)
 #define F32_CONST(v) LLVMConstReal(F32_TYPE, v)
@@ -272,6 +279,7 @@ check_type_compatible(uint8 src_type, uint8 dst_type)
 
 #define V128_TYPE comp_ctx->basic_types.v128_type
 #define V128_PTR_TYPE comp_ctx->basic_types.v128_ptr_type
+#define V128_PTR_TYPE_GS comp_ctx->basic_types.v128_ptr_type_gs
 #define V128_i8x16_TYPE comp_ctx->basic_types.i8x16_vec_type
 #define V128_i16x8_TYPE comp_ctx->basic_types.i16x8_vec_type
 #define V128_i32x4_TYPE comp_ctx->basic_types.i32x4_vec_type

+ 185 - 48
core/iwasm/compilation/aot_emit_memory.c

@@ -81,7 +81,7 @@ get_memory_curr_page_count(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);
 
 LLVMValueRef
 aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
-                          uint32 offset, uint32 bytes)
+                          uint32 offset, uint32 bytes, bool enable_segue)
 {
     LLVMValueRef offset_const = I32_CONST(offset);
     LLVMValueRef addr, maddr, offset1, cmp1, cmp2, cmp;
@@ -162,11 +162,20 @@ aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
             /* inside memory space */
             offset1 = I32_CONST((uint32)mem_offset);
             CHECK_LLVM_CONST(offset1);
-            if (!(maddr = LLVMBuildInBoundsGEP2(comp_ctx->builder, INT8_TYPE,
-                                                mem_base_addr, &offset1, 1,
-                                                "maddr"))) {
-                aot_set_last_error("llvm build add failed.");
-                goto fail;
+            if (!enable_segue) {
+                if (!(maddr = LLVMBuildInBoundsGEP2(comp_ctx->builder,
+                                                    INT8_TYPE, mem_base_addr,
+                                                    &offset1, 1, "maddr"))) {
+                    aot_set_last_error("llvm build add failed.");
+                    goto fail;
+                }
+            }
+            else {
+                if (!(maddr = LLVMBuildIntToPtr(comp_ctx->builder, offset1,
+                                                INT8_PTR_TYPE_GS, "maddr"))) {
+                    aot_set_last_error("llvm build IntToPtr failed.");
+                    goto fail;
+                }
             }
             return maddr;
         }
@@ -244,11 +253,29 @@ aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         }
     }
 
-    /* maddr = mem_base_addr + offset1 */
-    if (!(maddr = LLVMBuildInBoundsGEP2(comp_ctx->builder, INT8_TYPE,
+    if (!enable_segue) {
+        /* maddr = mem_base_addr + offset1 */
+        if (!(maddr =
+                  LLVMBuildInBoundsGEP2(comp_ctx->builder, INT8_TYPE,
                                         mem_base_addr, &offset1, 1, "maddr"))) {
-        aot_set_last_error("llvm build add failed.");
-        goto fail;
+            aot_set_last_error("llvm build add failed.");
+            goto fail;
+        }
+    }
+    else {
+        LLVMValueRef maddr_base;
+
+        if (!(maddr_base = LLVMBuildIntToPtr(comp_ctx->builder, addr,
+                                             INT8_PTR_TYPE_GS, "maddr_base"))) {
+            aot_set_last_error("llvm build int to ptr failed.");
+            goto fail;
+        }
+        if (!(maddr = LLVMBuildInBoundsGEP2(comp_ctx->builder, INT8_TYPE,
+                                            maddr_base, &offset_const, 1,
+                                            "maddr"))) {
+            aot_set_last_error("llvm build inboundgep failed.");
+            goto fail;
+        }
     }
     return maddr;
 fail:
@@ -388,13 +415,18 @@ aot_compile_op_i32_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 {
     LLVMValueRef maddr, value = NULL;
     LLVMTypeRef data_type;
+    bool enable_segue = comp_ctx->enable_segue_i32_load;
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            enable_segue)))
         return false;
 
     switch (bytes) {
         case 4:
-            BUILD_PTR_CAST(INT32_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT32_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT32_PTR_TYPE_GS);
 #if WASM_ENABLE_SHARED_MEMORY != 0
             if (atomic)
                 BUILD_ATOMIC_LOAD(align, I32_TYPE);
@@ -405,11 +437,17 @@ aot_compile_op_i32_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         case 2:
         case 1:
             if (bytes == 2) {
-                BUILD_PTR_CAST(INT16_PTR_TYPE);
+                if (!enable_segue)
+                    BUILD_PTR_CAST(INT16_PTR_TYPE);
+                else
+                    BUILD_PTR_CAST(INT16_PTR_TYPE_GS);
                 data_type = INT16_TYPE;
             }
             else {
-                BUILD_PTR_CAST(INT8_PTR_TYPE);
+                if (!enable_segue)
+                    BUILD_PTR_CAST(INT8_PTR_TYPE);
+                else
+                    BUILD_PTR_CAST(INT8_PTR_TYPE_GS);
                 data_type = INT8_TYPE;
             }
 
@@ -447,13 +485,18 @@ aot_compile_op_i64_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 {
     LLVMValueRef maddr, value = NULL;
     LLVMTypeRef data_type;
+    bool enable_segue = comp_ctx->enable_segue_i64_load;
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            enable_segue)))
         return false;
 
     switch (bytes) {
         case 8:
-            BUILD_PTR_CAST(INT64_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT64_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT64_PTR_TYPE_GS);
 #if WASM_ENABLE_SHARED_MEMORY != 0
             if (atomic)
                 BUILD_ATOMIC_LOAD(align, I64_TYPE);
@@ -465,15 +508,24 @@ aot_compile_op_i64_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         case 2:
         case 1:
             if (bytes == 4) {
-                BUILD_PTR_CAST(INT32_PTR_TYPE);
+                if (!enable_segue)
+                    BUILD_PTR_CAST(INT32_PTR_TYPE);
+                else
+                    BUILD_PTR_CAST(INT32_PTR_TYPE_GS);
                 data_type = I32_TYPE;
             }
             else if (bytes == 2) {
-                BUILD_PTR_CAST(INT16_PTR_TYPE);
+                if (!enable_segue)
+                    BUILD_PTR_CAST(INT16_PTR_TYPE);
+                else
+                    BUILD_PTR_CAST(INT16_PTR_TYPE_GS);
                 data_type = INT16_TYPE;
             }
             else {
-                BUILD_PTR_CAST(INT8_PTR_TYPE);
+                if (!enable_segue)
+                    BUILD_PTR_CAST(INT8_PTR_TYPE);
+                else
+                    BUILD_PTR_CAST(INT8_PTR_TYPE_GS);
                 data_type = INT8_TYPE;
             }
 
@@ -509,12 +561,18 @@ aot_compile_op_f32_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                         uint32 align, uint32 offset)
 {
     LLVMValueRef maddr, value;
+    bool enable_segue = comp_ctx->enable_segue_f32_load;
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4,
+                                            enable_segue)))
         return false;
 
-    BUILD_PTR_CAST(F32_PTR_TYPE);
+    if (!enable_segue)
+        BUILD_PTR_CAST(F32_PTR_TYPE);
+    else
+        BUILD_PTR_CAST(F32_PTR_TYPE_GS);
     BUILD_LOAD(F32_TYPE);
+
     PUSH_F32(value);
     return true;
 fail:
@@ -526,12 +584,18 @@ aot_compile_op_f64_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                         uint32 align, uint32 offset)
 {
     LLVMValueRef maddr, value;
+    bool enable_segue = comp_ctx->enable_segue_f64_load;
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8,
+                                            enable_segue)))
         return false;
 
-    BUILD_PTR_CAST(F64_PTR_TYPE);
+    if (!enable_segue)
+        BUILD_PTR_CAST(F64_PTR_TYPE);
+    else
+        BUILD_PTR_CAST(F64_PTR_TYPE_GS);
     BUILD_LOAD(F64_TYPE);
+
     PUSH_F64(value);
     return true;
 fail:
@@ -543,22 +607,33 @@ aot_compile_op_i32_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                          uint32 align, uint32 offset, uint32 bytes, bool atomic)
 {
     LLVMValueRef maddr, value;
+    bool enable_segue = comp_ctx->enable_segue_i32_store;
 
     POP_I32(value);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            enable_segue)))
         return false;
 
     switch (bytes) {
         case 4:
-            BUILD_PTR_CAST(INT32_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT32_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT32_PTR_TYPE_GS);
             break;
         case 2:
-            BUILD_PTR_CAST(INT16_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT16_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT16_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT16_TYPE);
             break;
         case 1:
-            BUILD_PTR_CAST(INT8_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT8_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT8_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT8_TYPE);
             break;
         default:
@@ -582,26 +657,40 @@ aot_compile_op_i64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                          uint32 align, uint32 offset, uint32 bytes, bool atomic)
 {
     LLVMValueRef maddr, value;
+    bool enable_segue = comp_ctx->enable_segue_i64_store;
 
     POP_I64(value);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            enable_segue)))
         return false;
 
     switch (bytes) {
         case 8:
-            BUILD_PTR_CAST(INT64_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT64_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT64_PTR_TYPE_GS);
             break;
         case 4:
-            BUILD_PTR_CAST(INT32_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT32_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT32_PTR_TYPE_GS);
             BUILD_TRUNC(value, I32_TYPE);
             break;
         case 2:
-            BUILD_PTR_CAST(INT16_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT16_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT16_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT16_TYPE);
             break;
         case 1:
-            BUILD_PTR_CAST(INT8_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT8_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT8_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT8_TYPE);
             break;
         default:
@@ -625,13 +714,18 @@ aot_compile_op_f32_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                          uint32 align, uint32 offset)
 {
     LLVMValueRef maddr, value;
+    bool enable_segue = comp_ctx->enable_segue_f32_store;
 
     POP_F32(value);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4,
+                                            enable_segue)))
         return false;
 
-    BUILD_PTR_CAST(F32_PTR_TYPE);
+    if (!enable_segue)
+        BUILD_PTR_CAST(F32_PTR_TYPE);
+    else
+        BUILD_PTR_CAST(F32_PTR_TYPE_GS);
     BUILD_STORE();
     return true;
 fail:
@@ -643,13 +737,18 @@ aot_compile_op_f64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                          uint32 align, uint32 offset)
 {
     LLVMValueRef maddr, value;
+    bool enable_segue = comp_ctx->enable_segue_f64_store;
 
     POP_F64(value);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8,
+                                            enable_segue)))
         return false;
 
-    BUILD_PTR_CAST(F64_PTR_TYPE);
+    if (!enable_segue)
+        BUILD_PTR_CAST(F64_PTR_TYPE);
+    else
+        BUILD_PTR_CAST(F64_PTR_TYPE_GS);
     BUILD_STORE();
     return true;
 fail:
@@ -1140,13 +1239,19 @@ aot_compile_op_atomic_rmw(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                           uint32 offset, uint32 bytes)
 {
     LLVMValueRef maddr, value, result;
+    bool enable_segue = (op_type == VALUE_TYPE_I32)
+                            ? comp_ctx->enable_segue_i32_load
+                                  && comp_ctx->enable_segue_i32_store
+                            : comp_ctx->enable_segue_i64_load
+                                  && comp_ctx->enable_segue_i64_store;
 
     if (op_type == VALUE_TYPE_I32)
         POP_I32(value);
     else
         POP_I64(value);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            enable_segue)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1154,19 +1259,31 @@ aot_compile_op_atomic_rmw(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     switch (bytes) {
         case 8:
-            BUILD_PTR_CAST(INT64_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT64_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT64_PTR_TYPE_GS);
             break;
         case 4:
-            BUILD_PTR_CAST(INT32_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT32_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT32_PTR_TYPE_GS);
             if (op_type == VALUE_TYPE_I64)
                 BUILD_TRUNC(value, I32_TYPE);
             break;
         case 2:
-            BUILD_PTR_CAST(INT16_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT16_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT16_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT16_TYPE);
             break;
         case 1:
-            BUILD_PTR_CAST(INT8_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT8_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT8_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT8_TYPE);
             break;
         default:
@@ -1208,6 +1325,11 @@ aot_compile_op_atomic_cmpxchg(AOTCompContext *comp_ctx,
                               uint32 align, uint32 offset, uint32 bytes)
 {
     LLVMValueRef maddr, value, expect, result;
+    bool enable_segue = (op_type == VALUE_TYPE_I32)
+                            ? comp_ctx->enable_segue_i32_load
+                                  && comp_ctx->enable_segue_i32_store
+                            : comp_ctx->enable_segue_i64_load
+                                  && comp_ctx->enable_segue_i64_store;
 
     if (op_type == VALUE_TYPE_I32) {
         POP_I32(value);
@@ -1218,7 +1340,8 @@ aot_compile_op_atomic_cmpxchg(AOTCompContext *comp_ctx,
         POP_I64(expect);
     }
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            enable_segue)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1226,22 +1349,34 @@ aot_compile_op_atomic_cmpxchg(AOTCompContext *comp_ctx,
 
     switch (bytes) {
         case 8:
-            BUILD_PTR_CAST(INT64_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT64_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT64_PTR_TYPE_GS);
             break;
         case 4:
-            BUILD_PTR_CAST(INT32_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT32_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT32_PTR_TYPE_GS);
             if (op_type == VALUE_TYPE_I64) {
                 BUILD_TRUNC(value, I32_TYPE);
                 BUILD_TRUNC(expect, I32_TYPE);
             }
             break;
         case 2:
-            BUILD_PTR_CAST(INT16_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT16_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT16_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT16_TYPE);
             BUILD_TRUNC(expect, INT16_TYPE);
             break;
         case 1:
-            BUILD_PTR_CAST(INT8_PTR_TYPE);
+            if (!enable_segue)
+                BUILD_PTR_CAST(INT8_PTR_TYPE);
+            else
+                BUILD_PTR_CAST(INT8_PTR_TYPE_GS);
             BUILD_TRUNC(value, INT8_TYPE);
             BUILD_TRUNC(expect, INT8_TYPE);
             break;
@@ -1318,7 +1453,8 @@ aot_compile_op_atomic_wait(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     CHECK_LLVM_CONST(is_wait64);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            false)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1393,7 +1529,8 @@ aot_compiler_op_atomic_notify(AOTCompContext *comp_ctx,
 
     POP_I32(count);
 
-    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
+                                            false)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))

+ 1 - 1
core/iwasm/compilation/aot_emit_memory.h

@@ -53,7 +53,7 @@ aot_compile_op_f64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
 LLVMValueRef
 aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
-                          uint32 offset, uint32 bytes);
+                          uint32 offset, uint32 bytes, bool enable_segue);
 
 bool
 aot_compile_op_memory_size(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);

+ 53 - 0
core/iwasm/compilation/aot_llvm.c

@@ -1132,6 +1132,28 @@ aot_set_llvm_basic_types(AOTLLVMTypes *basic_types, LLVMContextRef context)
     basic_types->v128_type = basic_types->i64x2_vec_type;
     basic_types->v128_ptr_type = LLVMPointerType(basic_types->v128_type, 0);
 
+    basic_types->int8_ptr_type_gs =
+        LLVMPointerType(basic_types->int8_type, 256);
+    basic_types->int16_ptr_type_gs =
+        LLVMPointerType(basic_types->int16_type, 256);
+    basic_types->int32_ptr_type_gs =
+        LLVMPointerType(basic_types->int32_type, 256);
+    basic_types->int64_ptr_type_gs =
+        LLVMPointerType(basic_types->int64_type, 256);
+    basic_types->float32_ptr_type_gs =
+        LLVMPointerType(basic_types->float32_type, 256);
+    basic_types->float64_ptr_type_gs =
+        LLVMPointerType(basic_types->float64_type, 256);
+    basic_types->v128_ptr_type_gs =
+        LLVMPointerType(basic_types->v128_type, 256);
+    if (!basic_types->int8_ptr_type_gs || !basic_types->int16_ptr_type_gs
+        || !basic_types->int32_ptr_type_gs || !basic_types->int64_ptr_type_gs
+        || !basic_types->float32_ptr_type_gs
+        || !basic_types->float64_ptr_type_gs
+        || !basic_types->v128_ptr_type_gs) {
+        return false;
+    }
+
     basic_types->i1x2_vec_type = LLVMVectorType(basic_types->int1_type, 2);
 
     basic_types->funcref_type = LLVMInt32TypeInContext(context);
@@ -2073,6 +2095,37 @@ aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option)
         }
     }
 
+    triple = LLVMGetTargetMachineTriple(comp_ctx->target_machine);
+    if (!triple) {
+        aot_set_last_error("get target machine triple failed.");
+        goto fail;
+    }
+    if (strstr(triple, "linux") && !strcmp(comp_ctx->target_arch, "x86_64")) {
+        if (option->segue_flags) {
+            if (option->segue_flags & (1 << 0))
+                comp_ctx->enable_segue_i32_load = true;
+            if (option->segue_flags & (1 << 1))
+                comp_ctx->enable_segue_i64_load = true;
+            if (option->segue_flags & (1 << 2))
+                comp_ctx->enable_segue_f32_load = true;
+            if (option->segue_flags & (1 << 3))
+                comp_ctx->enable_segue_f64_load = true;
+            if (option->segue_flags & (1 << 4))
+                comp_ctx->enable_segue_v128_load = true;
+            if (option->segue_flags & (1 << 8))
+                comp_ctx->enable_segue_i32_store = true;
+            if (option->segue_flags & (1 << 9))
+                comp_ctx->enable_segue_i64_store = true;
+            if (option->segue_flags & (1 << 10))
+                comp_ctx->enable_segue_f32_store = true;
+            if (option->segue_flags & (1 << 11))
+                comp_ctx->enable_segue_f64_store = true;
+            if (option->segue_flags & (1 << 12))
+                comp_ctx->enable_segue_v128_store = true;
+        }
+    }
+    LLVMDisposeMessage(triple);
+
     if (option->enable_simd && strcmp(comp_ctx->target_arch, "x86_64") != 0
         && strncmp(comp_ctx->target_arch, "aarch64", 7) != 0) {
         /* Disable simd if it isn't supported by target arch */

+ 22 - 0
core/iwasm/compilation/aot_llvm.h

@@ -214,6 +214,14 @@ typedef struct AOTLLVMTypes {
     LLVMTypeRef f32x4_vec_type;
     LLVMTypeRef f64x2_vec_type;
 
+    LLVMTypeRef int8_ptr_type_gs;
+    LLVMTypeRef int16_ptr_type_gs;
+    LLVMTypeRef int32_ptr_type_gs;
+    LLVMTypeRef int64_ptr_type_gs;
+    LLVMTypeRef float32_ptr_type_gs;
+    LLVMTypeRef float64_ptr_type_gs;
+    LLVMTypeRef v128_ptr_type_gs;
+
     LLVMTypeRef i1x2_vec_type;
 
     LLVMTypeRef meta_data_type;
@@ -341,6 +349,19 @@ typedef struct AOTCompContext {
     /* Disable LLVM link time optimization */
     bool disable_llvm_lto;
 
+    /* Enable using the segment register as the base address
+       of linear memory for load/store operations */
+    bool enable_segue_i32_load;
+    bool enable_segue_i64_load;
+    bool enable_segue_f32_load;
+    bool enable_segue_f64_load;
+    bool enable_segue_v128_load;
+    bool enable_segue_i32_store;
+    bool enable_segue_i64_store;
+    bool enable_segue_f32_store;
+    bool enable_segue_f64_store;
+    bool enable_segue_v128_store;
+
     /* Whether optimize the JITed code */
     bool optimize;
 
@@ -413,6 +434,7 @@ typedef struct AOTCompOption {
     uint32 output_format;
     uint32 bounds_checks;
     uint32 stack_bounds_checks;
+    uint32 segue_flags;
     char **custom_sections;
     uint32 custom_sections_count;
     const char *stack_usage_file;

+ 52 - 22
core/iwasm/compilation/simd/simd_load_store.c

@@ -14,12 +14,12 @@
 static LLVMValueRef
 simd_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, uint32 align,
           uint32 offset, uint32 data_length, LLVMTypeRef ptr_type,
-          LLVMTypeRef data_type)
+          LLVMTypeRef data_type, bool enable_segue)
 {
     LLVMValueRef maddr, data;
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset,
-                                            data_length))) {
+                                            data_length, enable_segue))) {
         HANDLE_FAILURE("aot_check_memory_overflow");
         return NULL;
     }
@@ -44,10 +44,12 @@ bool
 aot_compile_simd_v128_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                            uint32 align, uint32 offset)
 {
+    bool enable_segue = comp_ctx->enable_segue_v128_load;
+    LLVMTypeRef v128_ptr_type = enable_segue ? V128_PTR_TYPE_GS : V128_PTR_TYPE;
     LLVMValueRef result;
 
     if (!(result = simd_load(comp_ctx, func_ctx, align, offset, 16,
-                             V128_PTR_TYPE, V128_TYPE))) {
+                             v128_ptr_type, V128_TYPE, enable_segue))) {
         return false;
     }
 
@@ -75,6 +77,7 @@ aot_compile_simd_load_extend(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         LLVMVectorType(I32_TYPE, 2),   LLVMVectorType(I32_TYPE, 2),
     };
     LLVMTypeRef sub_vector_type, sub_vector_ptr_type;
+    bool enable_segue = comp_ctx->enable_segue_v128_load;
 
     bh_assert(opcode_index < 6);
 
@@ -82,13 +85,15 @@ aot_compile_simd_load_extend(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     /* to vector ptr type */
     if (!sub_vector_type
-        || !(sub_vector_ptr_type = LLVMPointerType(sub_vector_type, 0))) {
+        || !(sub_vector_ptr_type =
+                 LLVMPointerType(sub_vector_type, enable_segue ? 256 : 0))) {
         HANDLE_FAILURE("LLVMPointerType");
         return false;
     }
 
-    if (!(sub_vector = simd_load(comp_ctx, func_ctx, align, offset, 8,
-                                 sub_vector_ptr_type, sub_vector_type))) {
+    if (!(sub_vector =
+              simd_load(comp_ctx, func_ctx, align, offset, 8,
+                        sub_vector_ptr_type, sub_vector_type, enable_segue))) {
         return false;
     }
 
@@ -118,6 +123,9 @@ aot_compile_simd_load_splat(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     LLVMValueRef element, result;
     LLVMTypeRef element_ptr_types[] = { INT8_PTR_TYPE, INT16_PTR_TYPE,
                                         INT32_PTR_TYPE, INT64_PTR_TYPE };
+    LLVMTypeRef element_ptr_types_gs[] = { INT8_PTR_TYPE_GS, INT16_PTR_TYPE_GS,
+                                           INT32_PTR_TYPE_GS,
+                                           INT64_PTR_TYPE_GS };
     LLVMTypeRef element_data_types[] = { INT8_TYPE, INT16_TYPE, I32_TYPE,
                                          I64_TYPE };
     uint32 data_lengths[] = { 1, 2, 4, 8 };
@@ -133,13 +141,16 @@ aot_compile_simd_load_splat(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         LLVM_CONST(i32x4_zero),
         LLVM_CONST(i32x2_zero),
     };
+    bool enable_segue = comp_ctx->enable_segue_v128_load;
 
     bh_assert(opcode_index < 4);
 
-    if (!(element = simd_load(comp_ctx, func_ctx, align, offset,
-                              data_lengths[opcode_index],
-                              element_ptr_types[opcode_index],
-                              element_data_types[opcode_index]))) {
+    if (!(element = simd_load(
+              comp_ctx, func_ctx, align, offset, data_lengths[opcode_index],
+              comp_ctx->enable_segue_v128_load
+                  ? element_ptr_types_gs[opcode_index]
+                  : element_ptr_types[opcode_index],
+              element_data_types[opcode_index], enable_segue))) {
         return false;
     }
 
@@ -170,11 +181,15 @@ aot_compile_simd_load_lane(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     uint32 data_lengths[] = { 1, 2, 4, 8 };
     LLVMTypeRef element_ptr_types[] = { INT8_PTR_TYPE, INT16_PTR_TYPE,
                                         INT32_PTR_TYPE, INT64_PTR_TYPE };
+    LLVMTypeRef element_ptr_types_gs[] = { INT8_PTR_TYPE_GS, INT16_PTR_TYPE_GS,
+                                           INT32_PTR_TYPE_GS,
+                                           INT64_PTR_TYPE_GS };
     LLVMTypeRef element_data_types[] = { INT8_TYPE, INT16_TYPE, I32_TYPE,
                                          I64_TYPE };
     LLVMTypeRef vector_types[] = { V128_i8x16_TYPE, V128_i16x8_TYPE,
                                    V128_i32x4_TYPE, V128_i64x2_TYPE };
     LLVMValueRef lane = simd_lane_id_to_llvm_value(comp_ctx, lane_id);
+    bool enable_segue = comp_ctx->enable_segue_v128_load;
 
     bh_assert(opcode_index < 4);
 
@@ -183,10 +198,12 @@ aot_compile_simd_load_lane(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         return false;
     }
 
-    if (!(element = simd_load(comp_ctx, func_ctx, align, offset,
-                              data_lengths[opcode_index],
-                              element_ptr_types[opcode_index],
-                              element_data_types[opcode_index]))) {
+    if (!(element = simd_load(
+              comp_ctx, func_ctx, align, offset, data_lengths[opcode_index],
+              comp_ctx->enable_segue_v128_load
+                  ? element_ptr_types_gs[opcode_index]
+                  : element_ptr_types[opcode_index],
+              element_data_types[opcode_index], enable_segue))) {
         return false;
     }
 
@@ -207,6 +224,8 @@ aot_compile_simd_load_zero(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     uint32 opcode_index = opcode - SIMD_v128_load32_zero;
     uint32 data_lengths[] = { 4, 8 };
     LLVMTypeRef element_ptr_types[] = { INT32_PTR_TYPE, INT64_PTR_TYPE };
+    LLVMTypeRef element_ptr_types_gs[] = { INT32_PTR_TYPE_GS,
+                                           INT64_PTR_TYPE_GS };
     LLVMTypeRef element_data_types[] = { I32_TYPE, I64_TYPE };
     LLVMValueRef zero[] = {
         LLVM_CONST(i32x4_vec_zero),
@@ -222,13 +241,16 @@ aot_compile_simd_load_zero(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
           LLVM_CONST(i32_six) },
         { LLVM_CONST(i32_zero), LLVM_CONST(i32_two) },
     };
+    bool enable_segue = comp_ctx->enable_segue_v128_load;
 
     bh_assert(opcode_index < 2);
 
-    if (!(element = simd_load(comp_ctx, func_ctx, align, offset,
-                              data_lengths[opcode_index],
-                              element_ptr_types[opcode_index],
-                              element_data_types[opcode_index]))) {
+    if (!(element = simd_load(
+              comp_ctx, func_ctx, align, offset, data_lengths[opcode_index],
+              comp_ctx->enable_segue_v128_load
+                  ? element_ptr_types_gs[opcode_index]
+                  : element_ptr_types[opcode_index],
+              element_data_types[opcode_index], enable_segue))) {
         return false;
     }
 
@@ -260,12 +282,12 @@ aot_compile_simd_load_zero(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 static bool
 simd_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, uint32 align,
            uint32 offset, uint32 data_length, LLVMValueRef value,
-           LLVMTypeRef value_ptr_type)
+           LLVMTypeRef value_ptr_type, bool enable_segue)
 {
     LLVMValueRef maddr, result;
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset,
-                                            data_length)))
+                                            data_length, enable_segue)))
         return false;
 
     if (!(maddr = LLVMBuildBitCast(comp_ctx->builder, maddr, value_ptr_type,
@@ -288,12 +310,14 @@ bool
 aot_compile_simd_v128_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                             uint32 align, uint32 offset)
 {
+    bool enable_segue = comp_ctx->enable_segue_v128_store;
+    LLVMTypeRef v128_ptr_type = enable_segue ? V128_PTR_TYPE_GS : V128_PTR_TYPE;
     LLVMValueRef value;
 
     POP_V128(value);
 
     return simd_store(comp_ctx, func_ctx, align, offset, 16, value,
-                      V128_PTR_TYPE);
+                      v128_ptr_type, enable_segue);
 fail:
     return false;
 }
@@ -307,10 +331,14 @@ aot_compile_simd_store_lane(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     uint32 data_lengths[] = { 1, 2, 4, 8 };
     LLVMTypeRef element_ptr_types[] = { INT8_PTR_TYPE, INT16_PTR_TYPE,
                                         INT32_PTR_TYPE, INT64_PTR_TYPE };
+    LLVMTypeRef element_ptr_types_gs[] = { INT8_PTR_TYPE_GS, INT16_PTR_TYPE_GS,
+                                           INT32_PTR_TYPE_GS,
+                                           INT64_PTR_TYPE_GS };
     uint32 opcode_index = opcode - SIMD_v128_store8_lane;
     LLVMTypeRef vector_types[] = { V128_i8x16_TYPE, V128_i16x8_TYPE,
                                    V128_i32x4_TYPE, V128_i64x2_TYPE };
     LLVMValueRef lane = simd_lane_id_to_llvm_value(comp_ctx, lane_id);
+    bool enable_segue = comp_ctx->enable_segue_v128_store;
 
     bh_assert(opcode_index < 4);
 
@@ -327,5 +355,7 @@ aot_compile_simd_store_lane(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     return simd_store(comp_ctx, func_ctx, align, offset,
                       data_lengths[opcode_index], element,
-                      element_ptr_types[opcode_index]);
+                      enable_segue ? element_ptr_types_gs[opcode_index]
+                                   : element_ptr_types[opcode_index],
+                      enable_segue);
 }

+ 1 - 0
core/iwasm/include/aot_export.h

@@ -61,6 +61,7 @@ typedef struct AOTCompOption {
     uint32_t output_format;
     uint32_t bounds_checks;
     uint32_t stack_bounds_checks;
+    uint32_t segue_flags;
     char **custom_sections;
     uint32_t custom_sections_count;
     const char *stack_usage_file;

+ 7 - 4
core/iwasm/include/wasm_export.h

@@ -167,6 +167,8 @@ typedef struct RuntimeInitArgs {
     /* LLVM JIT opt and size level */
     uint32_t llvm_jit_opt_level;
     uint32_t llvm_jit_size_level;
+    /* Segue optimization flags for LLVM JIT */
+    uint32_t segue_flags;
 } RuntimeInitArgs;
 
 #ifndef WASM_VALKIND_T_DEFINED
@@ -1351,20 +1353,21 @@ WASM_RUNTIME_API_EXTERN void
 wasm_runtime_get_version(uint32_t *major, uint32_t *minor, uint32_t *patch);
 
 /**
- * Check whether an import func `(import <module_name> <func_name> (func ...))` is linked or not
- * with runtime registered natvie functions
+ * Check whether an import func `(import <module_name> <func_name> (func ...))`
+ * is linked or not with runtime registered native functions
  */
 WASM_RUNTIME_API_EXTERN bool
 wasm_runtime_is_import_func_linked(const char *module_name,
                                    const char *func_name);
 
 /**
- * Check whether an import global `(import <module_name> <global_name> (global ...))` is linked or not
- * with runtime registered natvie globals
+ * Check whether an import global `(import <module_name> <global_name> (global ...))`
+ * is linked or not with runtime registered native globals
  */
 WASM_RUNTIME_API_EXTERN bool
 wasm_runtime_is_import_global_linked(const char *module_name,
                                      const char *global_name);
+
 /* clang-format on */
 
 #ifdef __cplusplus

+ 9 - 0
core/iwasm/interpreter/wasm_interp_classic.c

@@ -4231,6 +4231,15 @@ wasm_interp_call_wasm(WASMModuleInstance *module_inst, WASMExecEnv *exec_env,
 
     wasm_exec_env_set_cur_frame(exec_env, frame);
 
+#if defined(os_writegsbase)
+    {
+        WASMMemoryInstance *memory_inst = wasm_get_default_memory(module_inst);
+        if (memory_inst)
+            /* write base addr of linear memory to GS segment register */
+            os_writegsbase(memory_inst->memory_data);
+    }
+#endif
+
     if (function->is_import_func) {
 #if WASM_ENABLE_MULTI_MODULE != 0
         if (function->import_module_inst) {

+ 9 - 0
core/iwasm/interpreter/wasm_interp_fast.c

@@ -3979,6 +3979,15 @@ wasm_interp_call_wasm(WASMModuleInstance *module_inst, WASMExecEnv *exec_env,
 
     wasm_exec_env_set_cur_frame(exec_env, frame);
 
+#if defined(os_writegsbase)
+    {
+        WASMMemoryInstance *memory_inst = wasm_get_default_memory(module_inst);
+        if (memory_inst)
+            /* write base addr of linear memory to GS segment register */
+            os_writegsbase(memory_inst->memory_data);
+    }
+#endif
+
     if (function->is_import_func) {
 #if WASM_ENABLE_MULTI_MODULE != 0
         if (function->import_module_inst) {

+ 2 - 1
core/iwasm/interpreter/wasm_loader.c

@@ -3000,7 +3000,7 @@ init_llvm_jit_functions_stage1(WASMModule *module, char *error_buf,
     if (module->function_count == 0)
         return true;
 
-#if WASM_ENABLE_FAST_JIT != 0 && WASM_ENABLE_LLVM_JIT != 0
+#if WASM_ENABLE_FAST_JIT != 0 && WASM_ENABLE_LAZY_JIT != 0
     if (os_mutex_init(&module->tierup_wait_lock) != 0) {
         set_error_buf(error_buf, error_buf_size, "init jit tierup lock failed");
         return false;
@@ -3035,6 +3035,7 @@ init_llvm_jit_functions_stage1(WASMModule *module, char *error_buf,
     llvm_jit_options = wasm_runtime_get_llvm_jit_options();
     option.opt_level = llvm_jit_options.opt_level;
     option.size_level = llvm_jit_options.size_level;
+    option.segue_flags = llvm_jit_options.segue_flags;
 
 #if WASM_ENABLE_BULK_MEMORY != 0
     option.enable_bulk_memory = true;

+ 2 - 1
core/iwasm/interpreter/wasm_mini_loader.c

@@ -1843,7 +1843,7 @@ init_llvm_jit_functions_stage1(WASMModule *module, char *error_buf,
     if (module->function_count == 0)
         return true;
 
-#if WASM_ENABLE_FAST_JIT != 0 && WASM_ENABLE_LLVM_JIT != 0
+#if WASM_ENABLE_FAST_JIT != 0 && WASM_ENABLE_LAZY_JIT != 0
     if (os_mutex_init(&module->tierup_wait_lock) != 0) {
         set_error_buf(error_buf, error_buf_size, "init jit tierup lock failed");
         return false;
@@ -1876,6 +1876,7 @@ init_llvm_jit_functions_stage1(WASMModule *module, char *error_buf,
     option.is_jit_mode = true;
     option.opt_level = llvm_jit_options.opt_level;
     option.size_level = llvm_jit_options.size_level;
+    option.segue_flags = llvm_jit_options.segue_flags;
 
 #if WASM_ENABLE_BULK_MEMORY != 0
     option.enable_bulk_memory = true;

+ 14 - 0
core/shared/platform/linux-sgx/platform_internal.h

@@ -56,6 +56,20 @@ typedef unsigned int korp_sem;
 #define OS_THREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
 #endif
 
+#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)
+/**
+ * Write base_addr to the base of the GS segment register with the
+ * wrgsbase instruction. The macro is only defined for x86-64 build
+ * targets, so users can test defined(os_writegsbase) to find out
+ * whether it is available.
+ *
+ * The argument is parenthesized before the casts so that an expression
+ * such as `ptr + offset` is converted as a whole. The "memory" clobber
+ * tells the compiler not to cache or reorder memory accesses across
+ * the register write.
+ */
+#define os_writegsbase(base_addr)                                 \
+    do {                                                          \
+        uint64 __gs_value = (uint64)(uintptr_t)(base_addr);       \
+        asm volatile("wrgsbase %0" ::"r"(__gs_value) : "memory"); \
+    } while (0)
+#if 0
+/* _writegsbase_u64 also works, but requires adding the -mfsgsbase flag
+   for gcc */
+#include <immintrin.h>
+#define os_writegsbase(base_addr) \
+    _writegsbase_u64(((uint64)(uintptr_t)(base_addr)))
+#endif
+#endif
+
 typedef int (*os_print_function_t)(const char *message);
 void
 os_set_print_function(os_print_function_t pf);

+ 14 - 0
core/shared/platform/linux/platform_internal.h

@@ -63,6 +63,20 @@ typedef sem_t korp_sem;
 
 #define bh_socket_t int
 
+#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)
+/**
+ * Write base_addr to the base of the GS segment register with the
+ * wrgsbase instruction. The macro is only defined for x86-64 build
+ * targets, so users can test defined(os_writegsbase) to find out
+ * whether it is available.
+ *
+ * The argument is parenthesized before the casts so that an expression
+ * such as `ptr + offset` is converted as a whole. The "memory" clobber
+ * tells the compiler not to cache or reorder memory accesses across
+ * the register write.
+ */
+#define os_writegsbase(base_addr)                                 \
+    do {                                                          \
+        uint64 __gs_value = (uint64)(uintptr_t)(base_addr);       \
+        asm volatile("wrgsbase %0" ::"r"(__gs_value) : "memory"); \
+    } while (0)
+#if 0
+/* _writegsbase_u64 also works, but requires adding the -mfsgsbase flag
+   for gcc */
+#include <immintrin.h>
+#define os_writegsbase(base_addr) \
+    _writegsbase_u64(((uint64)(uintptr_t)(base_addr)))
+#endif
+#endif
+
 #if WASM_DISABLE_HW_BOUND_CHECK == 0
 #if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)            \
     || defined(BUILD_TARGET_AARCH64) || defined(BUILD_TARGET_RISCV64_LP64D) \

+ 77 - 5
product-mini/platforms/posix/main.c

@@ -54,6 +54,14 @@ print_help()
 #if WASM_ENABLE_JIT != 0
     printf("  --llvm-jit-size-level=n  Set LLVM JIT size level, default is 3\n");
     printf("  --llvm-jit-opt-level=n   Set LLVM JIT optimization level, default is 3\n");
+#if defined(os_writegsbase)
+    printf("  --enable-segue[=<flags>] Enable using segment register GS as the base address of\n");
+    printf("                           linear memory, which may improve performance, flags can be:\n");
+    printf("                              i32.load, i64.load, f32.load, f64.load, v128.load,\n");
+    printf("                              i32.store, i64.store, f32.store, f64.store, v128.store\n");
+    printf("                           Use comma to separate, e.g. --enable-segue=i32.load,i64.store\n");
+    printf("                           and --enable-segue means all flags are added.\n");
+#endif
 #endif
     printf("  --repl                   Start a very simple REPL (read-eval-print-loop) mode\n"
            "                           that runs commands in the form of \"FUNC ARG...\"\n");
@@ -117,13 +125,13 @@ app_instance_func(wasm_module_inst_t module_inst, const char *func_name)
 }
 
 /**
- * Split a space separated strings into an array of strings
+ * Split a string into an array of strings
  * Returns NULL on failure
  * Memory must be freed by caller
  * Based on: http://stackoverflow.com/a/11198630/471795
  */
 static char **
-split_string(char *str, int *count)
+split_string(char *str, int *count, const char *delimer)
 {
     char **res = NULL, **res1;
     char *p;
@@ -131,7 +139,7 @@ split_string(char *str, int *count)
 
     /* split string and append tokens to 'res' */
     do {
-        p = strtok(str, " ");
+        p = strtok(str, delimer);
         str = NULL;
         res1 = res;
         res = (char **)realloc(res1, sizeof(char *) * (uint32)(idx + 1));
@@ -180,7 +188,7 @@ app_instance_repl(wasm_module_inst_t module_inst)
             printf("exit repl mode\n");
             break;
         }
-        app_argv = split_string(cmd, &app_argc);
+        app_argv = split_string(cmd, &app_argc, " ");
         if (app_argv == NULL) {
             LOG_ERROR("Wasm prepare param failed: split string failed.\n");
             break;
@@ -195,6 +203,59 @@ app_instance_repl(wasm_module_inst_t module_inst)
     return NULL;
 }
 
+#if WASM_ENABLE_JIT != 0
+/**
+ * Parse a comma separated list of segue flag names into a bit mask.
+ *
+ * Bits 0..4 select the i32/i64/f32/f64/v128 loads and bits 8..12 select
+ * the matching stores; "--enable-segue" without a list sets all of them
+ * (0x1F1F).
+ *
+ * Note that str_flags is tokenized in place by split_string() (which
+ * uses strtok), so the string is modified.
+ *
+ * @param str_flags the text that follows "--enable-segue="
+ *
+ * @return the bit mask of the requested flags, or (uint32)-1 if the list
+ *         is empty, contains an unknown flag name, or cannot be split
+ */
+static uint32
+resolve_segue_flags(char *str_flags)
+{
+    /* flag name -> bit in the segue flag mask */
+    static const struct {
+        const char *name;
+        uint32 mask;
+    } known_flags[] = {
+        { "i32.load", 1 << 0 },    { "i64.load", 1 << 1 },
+        { "f32.load", 1 << 2 },    { "f64.load", 1 << 3 },
+        { "v128.load", 1 << 4 },   { "i32.store", 1 << 8 },
+        { "i64.store", 1 << 9 },   { "f32.store", 1 << 10 },
+        { "f64.store", 1 << 11 },  { "v128.store", 1 << 12 },
+    };
+    const uint32 known_flag_count =
+        (uint32)(sizeof(known_flags) / sizeof(known_flags[0]));
+    uint32 segue_flags = 0, j;
+    int32 flag_count = 0, i;
+    char **flag_list;
+
+    flag_list = split_string(str_flags, &flag_count, ",");
+    if (!flag_list) {
+        /* Don't silently drop the request when the list can't be split */
+        return (uint32)-1;
+    }
+
+    if (flag_count <= 0) {
+        /* "--enable-segue=" with nothing after '=' selects no flag at all,
+           report it instead of quietly leaving segue disabled */
+        segue_flags = (uint32)-1;
+    }
+
+    for (i = 0; i < flag_count; i++) {
+        for (j = 0; j < known_flag_count; j++) {
+            if (!strcmp(flag_list[i], known_flags[j].name))
+                break;
+        }
+        if (j == known_flag_count) {
+            /* invalid flag */
+            segue_flags = (uint32)-1;
+            break;
+        }
+        segue_flags |= known_flags[j].mask;
+    }
+
+    /* Only the pointer array is owned here, the tokens point into
+       str_flags */
+    free(flag_list);
+    return segue_flags;
+}
+#endif /* end of WASM_ENABLE_JIT != 0 */
+
 #if WASM_ENABLE_LIBC_WASI != 0
 static bool
 validate_env_str(char *env)
@@ -367,6 +428,7 @@ main(int argc, char *argv[])
 #if WASM_ENABLE_JIT != 0
     uint32 llvm_jit_size_level = 3;
     uint32 llvm_jit_opt_level = 3;
+    uint32 segue_flags = 0;
 #endif
     wasm_module_t wasm_module = NULL;
     wasm_module_inst_t wasm_module_inst = NULL;
@@ -487,7 +549,16 @@ main(int argc, char *argv[])
                 llvm_jit_opt_level = 3;
             }
         }
-#endif
+        else if (!strcmp(argv[0], "--enable-segue")) {
+            /* all flags are enabled */
+            segue_flags = 0x1F1F;
+        }
+        else if (!strncmp(argv[0], "--enable-segue=", 15)) {
+            segue_flags = resolve_segue_flags(argv[0] + 15);
+            if (segue_flags == (uint32)-1)
+                return print_help();
+        }
+#endif /* end of WASM_ENABLE_JIT != 0 */
 #if WASM_ENABLE_LIBC_WASI != 0
         else if (!strncmp(argv[0], "--dir=", 6)) {
             if (argv[0][6] == '\0')
@@ -632,6 +703,7 @@ main(int argc, char *argv[])
 #if WASM_ENABLE_JIT != 0
     init_args.llvm_jit_size_level = llvm_jit_size_level;
     init_args.llvm_jit_opt_level = llvm_jit_opt_level;
+    init_args.segue_flags = segue_flags;
 #endif
 
 #if WASM_ENABLE_DEBUG_INTERP != 0

+ 7 - 0
tests/benchmarks/coremark/build.sh

@@ -3,6 +3,8 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+PLATFORM=$(uname -s | tr A-Z a-z)
+
 WAMRC="../../../wamr-compiler/build/wamrc"
 
 if [ ! -d coremark ]; then
@@ -32,4 +34,9 @@ cd ..
 echo "Compile coremark.wasm to coremark.aot .."
 ${WAMRC} -o coremark.aot coremark.wasm
 
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Compile coremark.wasm to coremark_segue.aot .."
+    ${WAMRC} --enable-segue -o coremark_segue.aot coremark.wasm
+fi
+
 echo "Done"

+ 10 - 3
tests/benchmarks/coremark/run.sh

@@ -3,14 +3,21 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-IWASM="../../../product-mini/platforms/linux/build/iwasm"
+PLATFORM=$(uname -s | tr A-Z a-z)
+
+IWASM="../../../product-mini/platforms/${PLATFORM}/build/iwasm"
 WAMRC="../../../wamr-compiler/build/wamrc"
 
 echo "Run coremark with native .."
 ./coremark.exe
 
-echo "Run coremark with iwasm mode .."
+echo "Run coremark with iwasm aot mode .."
 ${IWASM} coremark.aot
 
-echo "Run coremakr with iwasm interpreter .."
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Run coremark with iwasm aot-segue mode .."
+    ${IWASM} coremark_segue.aot
+fi
+
+echo "Run coremark with iwasm interpreter mode .."
 ${IWASM} coremark.wasm

+ 7 - 0
tests/benchmarks/dhrystone/LICENSE

@@ -0,0 +1,7 @@
+Dhrystone
+------------------------------------------------------------------------------
+There is no explicit license defined.  The sources were originally
+written in Ada by Reinhold P. Weicker and translated to C by Rick Richardson.
+
+The source was obtained from the following site:
+https://fossies.org/linux/privat/old/dhrystone-2.1.tar.gz

+ 24 - 0
tests/benchmarks/dhrystone/build.sh

@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Build the dhrystone benchmark as a native binary, a wasm module and
+# AOT file(s). Expansions are quoted so that a checkout path containing
+# spaces doesn't get word-split.
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+
+WAMRC_CMD="${PWD}/../../../wamr-compiler/build/wamrc"
+
+echo "===> compile dhrystone src to dhrystone_native"
+gcc -O3 -o dhrystone_native src/dhry_1.c src/dhry_2.c -I include
+
+echo "===> compile dhrystone src to dhrystone.wasm"
+/opt/wasi-sdk/bin/clang -O3 \
+    -o dhrystone.wasm src/dhry_1.c src/dhry_2.c -I include \
+    -Wl,--export=__heap_base -Wl,--export=__data_end
+
+echo "===> compile dhrystone.wasm to dhrystone.aot"
+"${WAMRC_CMD}" -o dhrystone.aot dhrystone.wasm
+
+# The segue optimization is only supported on linux
+if [[ "${PLATFORM}" == "linux" ]]; then
+    echo "===> compile dhrystone.wasm to dhrystone_segue.aot"
+    "${WAMRC_CMD}" --enable-segue -o dhrystone_segue.aot dhrystone.wasm
+fi

+ 306 - 0
tests/benchmarks/dhrystone/include/dhry.h

@@ -0,0 +1,306 @@
+/*
+ **************************************************************************
+ *                       DHRYSTONE 2.1 BENCHMARK PC VERSION
+ **************************************************************************
+ *
+ *                   "DHRYSTONE" Benchmark Program
+ *                   -----------------------------
+ *
+ *  Version:    C, Version 2.1
+ *
+ *  File:       dhry.h (part 1 of 3)
+ *
+ *  Date:       May 25, 1988
+ *
+ *  Author:     Reinhold P. Weicker
+ *                      Siemens AG, AUT E 51
+ *                      Postfach 3220
+ *                      8520 Erlangen
+ *                      Germany (West)
+ *                              Phone:  [+49]-9131-7-20330
+ *                                      (8-17 Central European Time)
+ *                              Usenet: ..!mcsun!unido!estevax!weicker
+ *
+ *            Original Version (in Ada) published in
+ *            "Communications of the ACM" vol. 27., no. 10 (Oct. 1984),
+ *            pp. 1013 - 1030, together with the statistics
+ *            on which the distribution of statements etc. is based.
+ *
+ *            In this C version, the following C library functions are used:
+ *            - strcpy, strcmp (inside the measurement loop)
+ *            - printf, scanf (outside the measurement loop)
+ *            In addition, Berkeley UNIX system calls "times ()" or "time ()"
+ *            are used for execution time measurement. For measurements
+ *            on other systems, these calls have to be changed.
+ *
+ *  Collection of Results:
+ *              Reinhold Weicker (address see above) and
+ *
+ *              Rick Richardson
+ *              PC Research. Inc.
+ *              94 Apple Orchard Drive
+ *              Tinton Falls, NJ 07724
+ *                      Phone:  (201) 389-8963 (9-17 EST)
+ *                      Usenet: ...!uunet!pcrat!rick
+ *
+ *      Please send results to Rick Richardson and/or Reinhold Weicker.
+ *      Complete information should be given on hardware and software used.
+ *      Hardware information includes: Machine type, CPU, type and size
+ *      of caches; for microprocessors: clock frequency, memory speed
+ *      (number of wait states).
+ *      Software information includes: Compiler (and runtime library)
+ *      manufacturer and version, compilation switches, OS version.
+ *      The Operating System version may give an indication about the
+ *      compiler; Dhrystone itself performs no OS calls in the measurement
+ *      loop.
+ *
+ *      The complete output generated by the program should be mailed
+ *      such that at least some checks for correctness can be made.
+ *
+ **************************************************************************
+ *
+ *  This version has changes made by Roy Longbottom to conform to a common
+ *  format for a series of standard benchmarks for PCs:
+ *
+ *  Running time greater than 5 seconds due to inaccuracy of the PC clock.
+ *
+ *  Automatic adjustment of run time, no manually inserted parameters.
+ *
+ *  Initial display of calibration times to confirm linearity.
+ *
+ *  Display of results within one screen (or at a slow speed as the test
+ *  progresses) so that it can be seen to have run successfully.
+ *
+ *  Facilities to type in details of system used etc.
+ *
+ *  All results and details appended to a results file.
+ *
+ *
+ *  Roy Longbottom
+ *  101323.2241@compuserve.com
+ *
+ **************************************************************************
+ *
+ *  For details of history, changes, other defines, benchmark construction
+ *  statistics see official versions from ftp.nosc.mil/pub/aburto where
+ *  the latest table of results (dhry.tbl) are available. See also
+ *  netlib@ornl.gov
+ *
+ **************************************************************************
+ *
+ * Defines:     The following "Defines" are possible:
+ *              -DREG=register          (default: Not defined)
+ *                      As an approximation to what an average C programmer
+ *                      might do, the "register" storage class is applied
+ *                      (if enabled by -DREG=register)
+ *                      - for local variables, if they are used (dynamically)
+ *                        five or more times
+ *                      - for parameters if they are used (dynamically)
+ *                        six or more times
+ *                      Note that an optimal "register" strategy is
+ *                      compiler-dependent, and that "register" declarations
+ *                      do not necessarily lead to faster execution.
+ *              -DNOSTRUCTASSIGN        (default: Not defined)
+ *                      Define if the C compiler does not support
+ *                      assignment of structures.
+ *              -DNOENUMS               (default: Not defined)
+ *                      Define if the C compiler does not support
+ *                      enumeration types.
+ ***************************************************************************
+ *
+ *  Compilation model and measurement (IMPORTANT):
+ *
+ *  This C version of Dhrystone consists of three files:
+ *  - dhry.h (this file, containing global definitions and comments)
+ *  - dhry_1.c (containing the code corresponding to Ada package Pack_1)
+ *  - dhry_2.c (containing the code corresponding to Ada package Pack_2)
+ *
+ *  The following "ground rules" apply for measurements:
+ *  - Separate compilation
+ *  - No procedure merging
+ *  - Otherwise, compiler optimizations are allowed but should be indicated
+ *  - Default results are those without register declarations
+ *  See the companion paper "Rationale for Dhrystone Version 2" for a more
+ *  detailed discussion of these ground rules.
+ *
+ *  For 16-Bit processors (e.g. 80186, 80286), times for all compilation
+ *  models ("small", "medium", "large" etc.) should be given if possible,
+ *  together with a definition of these models for the compiler system used.
+ *
+ **************************************************************************
+ *                Examples of Pentium Results
+ *
+ * Dhrystone Benchmark  Version 2.1 (Language: C)
+ *
+ * Month run            4/1996
+ * PC model             Escom
+ * CPU                  Pentium
+ * Clock MHz            100
+ * Cache                256K
+ * Options              Neptune chipset
+ * OS/DOS               Windows 95
+ * Compiler             Watcom C/ C++ 10.5 Win386
+ * OptLevel             -otexan -zp8 -fp5 -5r
+ * Run by               Roy Longbottom
+ * From                 UK
+ * Mail                 101323.2241@compuserve.com
+ *
+ * Final values         (* implementation-dependent):
+ *
+ * Int_Glob:      O.K.  5
+ * Bool_Glob:     O.K.  1
+ * Ch_1_Glob:     O.K.  A
+ * Ch_2_Glob:     O.K.  B
+ * Arr_1_Glob[8]: O.K.  7
+ * Arr_2_Glob8/7: O.K.     1600010
+ * Ptr_Glob->
+ *   Ptr_Comp:       *  98008
+ *   Discr:       O.K.  0
+ *   Enum_Comp:   O.K.  2
+ *   Int_Comp:    O.K.  17
+ *   Str_Comp:    O.K.  DHRYSTONE PROGRAM, SOME STRING
+ * Next_Ptr_Glob->
+ *   Ptr_Comp:       *  98008 same as above
+ *   Discr:       O.K.  0
+ *   Enum_Comp:   O.K.  1
+ *   Int_Comp:    O.K.  18
+ *   Str_Comp:    O.K.  DHRYSTONE PROGRAM, SOME STRING
+ * Int_1_Loc:     O.K.  5
+ * Int_2_Loc:     O.K.  13
+ * Int_3_Loc:     O.K.  7
+ * Enum_Loc:      O.K.  1
+ * Str_1_Loc:     O.K.  DHRYSTONE PROGRAM, 1'ST STRING
+ * Str_2_Loc:     O.K.  DHRYSTONE PROGRAM, 2'ND STRING
+ *
+ * Register option      Selected.
+ *
+ * Microseconds 1 loop:          4.53
+ * Dhrystones / second:      220690
+ * VAX MIPS rating:            125.61
+ *
+ *
+ * Dhrystone Benchmark  Version 2.1 (Language: C)
+ *
+ * Month run            4/1996
+ * PC model             Escom
+ * CPU                  Pentium
+ * Clock MHz            100
+ * Cache                256K
+ * Options              Neptune chipset
+ * OS/DOS               Windows 95
+ * Compiler             Watcom C/ C++ 10.5 Win386
+ * OptLevel                 No optimisation
+ * Run by               Roy Longbottom
+ * From                 UK
+ * Mail                 101323.2241@compuserve.com
+ *
+ * Final values         (* implementation-dependent):
+ *
+ * Int_Glob:      O.K.  5
+ * Bool_Glob:     O.K.  1
+ * Ch_1_Glob:     O.K.  A
+ * Ch_2_Glob:     O.K.  B
+ * Arr_1_Glob[8]: O.K.  7
+ * Arr_2_Glob8/7: O.K.      320010
+ * Ptr_Glob->
+ *   Ptr_Comp:       *  98004
+ *   Discr:       O.K.  0
+ *   Enum_Comp:   O.K.  2
+ *   Int_Comp:    O.K.  17
+ *   Str_Comp:    O.K.  DHRYSTONE PROGRAM, SOME STRING
+ * Next_Ptr_Glob->
+ *   Ptr_Comp:       *  98004 same as above
+ *   Discr:       O.K.  0
+ *   Enum_Comp:   O.K.  1
+ *   Int_Comp:    O.K.  18
+ *   Str_Comp:    O.K.  DHRYSTONE PROGRAM, SOME STRING
+ * Int_1_Loc:     O.K.  5
+ * Int_2_Loc:     O.K.  13
+ * Int_3_Loc:     O.K.  7
+ * Enum_Loc:      O.K.  1
+ * Str_1_Loc:     O.K.  DHRYSTONE PROGRAM, 1'ST STRING
+ * Str_2_Loc:     O.K.  DHRYSTONE PROGRAM, 2'ND STRING
+ *
+ * Register option      Not selected.
+ *
+ * Microseconds 1 loop:         20.06
+ * Dhrystones / second:       49844
+ * VAX MIPS rating:             28.37
+ *
+ **************************************************************************
+ */
+
+/* Compiler and system dependent definitions: */
+
+#ifndef TIME
+#define TIMES
+#endif
+/* Use times(2) time function unless    */
+/* explicitly defined otherwise         */
+
+#ifdef TIMES
+/* #include <sys/types.h>
+   #include <sys/times.h> */
+/* for "times" */
+#endif
+
+#define Mic_secs_Per_Second 1000000.0
+/* Berkeley UNIX C returns process times in seconds/HZ */
+
+/* structassign(d, s): copy structure s into structure d.              */
+/* With NOSTRUCTASSIGN defined the copy goes through memcpy(),          */
+/* otherwise plain structure assignment is used.                        */
+#ifdef NOSTRUCTASSIGN
+#define structassign(d, s) memcpy(&(d), &(s), sizeof(d))
+#else
+/* Arguments and the whole expansion are parenthesized so the macro     */
+/* expands safely whatever expressions are passed as d and s.           */
+#define structassign(d, s) ((d) = (s))
+#endif
+
+#ifdef NOENUM
+#define Ident_1 0
+#define Ident_2 1
+#define Ident_3 2
+#define Ident_4 3
+#define Ident_5 4
+typedef int Enumeration;
+#else
+typedef enum { Ident_1, Ident_2, Ident_3, Ident_4, Ident_5 } Enumeration;
+#endif
+/* for boolean and enumeration types in Ada, Pascal */
+
+/* General definitions: */
+
+#include <stdio.h>
+#include <string.h>
+
+/* for strcpy, strcmp */
+
+#define Null 0
+/* Value of a Null pointer */
+#define true 1
+#define false 0
+
+typedef int One_Thirty;
+typedef int One_Fifty;
+typedef char Capital_Letter;
+typedef int Boolean;
+typedef char Str_30[31];
+typedef int Arr_1_Dim[50];
+typedef int Arr_2_Dim[50][50];
+
+/* Variant record used by the benchmark; Rec_Pointer is a pointer to it. */
+/* dhry_1.c allocates two of these and links the first to the second    */
+/* through Ptr_Comp.                                                     */
+typedef struct record {
+    /* link to another record */
+    struct record *Ptr_Comp;
+    /* discriminant; the benchmark code only ever stores Ident_1 here */
+    Enumeration Discr;
+    union {
+        /* the only variant accessed by dhry_1.c and dhry_2.c */
+        struct {
+            Enumeration Enum_Comp;
+            int Int_Comp;
+            /* 30 characters plus the terminating NUL (same size as Str_30) */
+            char Str_Comp[31];
+        } var_1;
+        struct {
+            Enumeration E_Comp_2;
+            char Str_2_Comp[31];
+        } var_2;
+        struct {
+            char Ch_1_Comp;
+            char Ch_2_Comp;
+        } var_3;
+    } variant;
+} Rec_Type, *Rec_Pointer;

+ 19 - 0
tests/benchmarks/dhrystone/run.sh

@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Lower-cased kernel name; it selects the iwasm build directory below.
+PLATFORM=$(uname -s | tr A-Z a-z)
+
+readonly IWASM_CMD="../../../product-mini/platforms/${PLATFORM}/build/iwasm"
+
+# Print the banner that precedes each benchmark variant.
+announce() {
+    echo "============> run $1"
+}
+
+announce "dhrystone native"
+./dhrystone_native
+
+# The segue AOT image is only run on linux.
+aot_images="dhrystone.aot"
+if [[ ${PLATFORM} == "linux" ]]; then
+    aot_images="${aot_images} dhrystone_segue.aot"
+fi
+
+for image in ${aot_images}; do
+    announce "${image}"
+    ${IWASM_CMD} "${image}"
+done

+ 485 - 0
tests/benchmarks/dhrystone/src/dhry_1.c

@@ -0,0 +1,485 @@
+/*
+ *************************************************************************
+ *
+ *                   "DHRYSTONE" Benchmark Program
+ *                   -----------------------------
+ *
+ *  Version:    C, Version 2.1
+ *
+ *  File:       dhry_1.c (part 2 of 3)
+ *
+ *  Date:       May 25, 1988
+ *
+ *  Author:     Reinhold P. Weicker
+ *
+ *************************************************************************
+ */
+
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "dhry.h"
+
+/* Global Variables: */
+
+Rec_Pointer Ptr_Glob, Next_Ptr_Glob;
+int Int_Glob;
+Boolean Bool_Glob;
+char Ch_1_Glob, Ch_2_Glob;
+int Arr_1_Glob[50];
+int Arr_2_Glob[50][50];
+
+Enumeration
+Func_1(Capital_Letter Ch_1_Par_Val, Capital_Letter Ch_2_Par_Val);
+/*
+forward declaration necessary since Enumeration may not simply be int
+*/
+
+#ifndef ROPT
+#define REG
+/* REG becomes defined as empty */
+/* i.e. no register variables   */
+#else
+#define REG register
+#endif
+
+void
+Proc_1(REG Rec_Pointer Ptr_Val_Par);
+void
+Proc_2(One_Fifty *Int_Par_Ref);
+void
+Proc_3(Rec_Pointer *Ptr_Ref_Par);
+void
+Proc_4();
+void
+Proc_5();
+void
+Proc_6(Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par);
+void
+Proc_7(One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val,
+       One_Fifty *Int_Par_Ref);
+void
+Proc_8(Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref, int Int_1_Par_Val,
+       int Int_2_Par_Val);
+
+Boolean
+Func_2(Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref);
+
+/* variables for time measurement: */
+
+#define Too_Small_Time 2
+/* Measurements should last at least 2 seconds */
+
+#define BILLION 1000000000L
+#define MILLION 1000000
+struct timespec Begin_Time, End_Time;
+double User_Time;
+
+double Microseconds, Dhrystones_Per_Second, Vax_Mips;
+
+/* end of variables for time measurement */
+
+/*
+ * Benchmark driver.
+ *
+ * Allocates and initializes the two global records, then repeats the
+ * timed Dhrystone loop for up to 10 passes. Number_Of_Runs starts at 5000
+ * and is doubled before every pass; it is additionally multiplied by 5
+ * after a pass that took less than 0.1 seconds. Calibration stops as soon
+ * as one pass takes more than 5.0 seconds.
+ *
+ * Afterwards the final values of the benchmark variables are printed and
+ * compared against their expected values ("O.K." / "WRONG"), followed by
+ * the timing results if the last pass lasted at least Too_Small_Time
+ * seconds.
+ *
+ * argc and argv are not used.
+ *
+ * Returns 0 when the benchmark ran to completion, 1 when the records
+ * could not be allocated.
+ */
+int
+main(int argc, char *argv[])
+/*****/
+
+/* main program, corresponds to procedures        */
+/* Main and Proc_0 in the Ada version             */
+{
+    One_Fifty Int_1_Loc;
+    REG One_Fifty Int_2_Loc;
+    One_Fifty Int_3_Loc;
+    REG char Ch_Index;
+    Enumeration Enum_Loc;
+    Str_30 Str_1_Loc;
+    Str_30 Str_2_Loc;
+    REG int Run_Index;
+    REG int Number_Of_Runs;
+    int count = 10;
+
+    /***********************************************************************
+     *         Change for compiler and optimisation used                   *
+     ***********************************************************************/
+
+    Next_Ptr_Glob = (Rec_Pointer)malloc(sizeof(Rec_Type));
+    Ptr_Glob = (Rec_Pointer)malloc(sizeof(Rec_Type));
+
+    /* Both records are dereferenced right below and throughout the run, */
+    /* so give up early instead of writing through a null pointer.       */
+    if (Next_Ptr_Glob == NULL || Ptr_Glob == NULL) {
+        printf("Failed to allocate memory for the benchmark records\n");
+        free(Next_Ptr_Glob);
+        free(Ptr_Glob);
+        return 1;
+    }
+
+    Ptr_Glob->Ptr_Comp = Next_Ptr_Glob;
+    Ptr_Glob->Discr = Ident_1;
+    Ptr_Glob->variant.var_1.Enum_Comp = Ident_3;
+    Ptr_Glob->variant.var_1.Int_Comp = 40;
+    strcpy(Ptr_Glob->variant.var_1.Str_Comp, "DHRYSTONE PROGRAM, SOME STRING");
+    strcpy(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING");
+
+    Arr_2_Glob[8][7] = 10;
+    /* Was missing in published program. Without this statement,   */
+    /* Arr_2_Glob [8][7] would have an undefined value.            */
+    /* Warning: With 16-Bit processors and Number_Of_Runs > 32000, */
+    /* overflow may occur for this array element.                  */
+
+    printf("\n");
+    printf("Dhrystone Benchmark, Version 2.1 (Language: C or C++)\n");
+    printf("\n");
+
+    Number_Of_Runs = 5000;
+
+    do {
+
+        Number_Of_Runs = Number_Of_Runs * 2;
+        count = count - 1;
+        /* Proc_8 increments this element once per run; reset it so the */
+        /* final check against Number_Of_Runs + 10 holds for this pass. */
+        Arr_2_Glob[8][7] = 10;
+
+        /***************/
+        /* Start timer */
+        /***************/
+
+        clock_gettime(CLOCK_MONOTONIC, &Begin_Time);
+
+        for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index) {
+
+            Proc_5();
+            Proc_4();
+            /* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */
+            Int_1_Loc = 2;
+            Int_2_Loc = 3;
+            strcpy(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
+            Enum_Loc = Ident_2;
+            Bool_Glob = !Func_2(Str_1_Loc, Str_2_Loc);
+            /* Bool_Glob == 1 */
+            while (Int_1_Loc < Int_2_Loc) /* loop body executed once */
+            {
+                Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc;
+                /* Int_3_Loc == 7 */
+                Proc_7(Int_1_Loc, Int_2_Loc, &Int_3_Loc);
+                /* Int_3_Loc == 7 */
+                Int_1_Loc += 1;
+            } /* while */
+              /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+            Proc_8(Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc);
+            /* Int_Glob == 5 */
+            Proc_1(Ptr_Glob);
+            for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index)
+            /* loop body executed twice */
+            {
+                if (Enum_Loc == Func_1(Ch_Index, 'C'))
+                /* then, not executed */
+                {
+                    Proc_6(Ident_1, &Enum_Loc);
+                    strcpy(Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING");
+                    Int_2_Loc = Run_Index;
+                    Int_Glob = Run_Index;
+                }
+            }
+            /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+            Int_2_Loc = Int_2_Loc * Int_1_Loc;
+            Int_1_Loc = Int_2_Loc / Int_3_Loc;
+            Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc;
+            /* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */
+            Proc_2(&Int_1_Loc);
+            /* Int_1_Loc == 5 */
+
+        } /* loop "for Run_Index" */
+
+        /**************/
+        /* Stop timer */
+        /**************/
+
+        clock_gettime(CLOCK_MONOTONIC, &End_Time);
+
+        /* elapsed time in microseconds ... */
+        User_Time = (End_Time.tv_sec - Begin_Time.tv_sec) * MILLION
+                    + (End_Time.tv_nsec - Begin_Time.tv_nsec) / 1000;
+        User_Time = User_Time / MILLION; /* convert to seconds */
+
+        printf("%ld runs %lf seconds \n", (long)Number_Of_Runs, User_Time);
+        if (User_Time > 5.0) {
+            /* long enough: make this the last pass */
+            count = 0;
+        }
+        else {
+            if (User_Time < 0.1) {
+                Number_Of_Runs = Number_Of_Runs * 5;
+            }
+        }
+    } /* calibrate/run do while */
+    while (count > 0);
+
+    printf("\n");
+    printf("Final values (* implementation-dependent):\n");
+    printf("\n");
+    printf("Int_Glob:      ");
+    if (Int_Glob == 5)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d  ", Int_Glob);
+
+    printf("Bool_Glob:     ");
+    if (Bool_Glob == 1)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d\n", Bool_Glob);
+
+    printf("Ch_1_Glob:     ");
+    if (Ch_1_Glob == 'A')
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%c  ", Ch_1_Glob);
+
+    printf("Ch_2_Glob:     ");
+    if (Ch_2_Glob == 'B')
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%c\n", Ch_2_Glob);
+
+    printf("Arr_1_Glob[8]: ");
+    if (Arr_1_Glob[8] == 7)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d  ", Arr_1_Glob[8]);
+
+    printf("Arr_2_Glob8/7: ");
+    if (Arr_2_Glob[8][7] == Number_Of_Runs + 10)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%10d\n", Arr_2_Glob[8][7]);
+
+    printf("Ptr_Glob->            ");
+    /* %p expects a pointer to void */
+    printf("  Ptr_Comp:       *    %p\n", (void *)Ptr_Glob->Ptr_Comp);
+
+    printf("  Discr:       ");
+    if (Ptr_Glob->Discr == 0)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d  ", Ptr_Glob->Discr);
+
+    printf("Enum_Comp:     ");
+    if (Ptr_Glob->variant.var_1.Enum_Comp == 2)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d\n", Ptr_Glob->variant.var_1.Enum_Comp);
+
+    printf("  Int_Comp:    ");
+    if (Ptr_Glob->variant.var_1.Int_Comp == 17)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d ", Ptr_Glob->variant.var_1.Int_Comp);
+
+    printf("Str_Comp:      ");
+    if (strcmp(Ptr_Glob->variant.var_1.Str_Comp,
+               "DHRYSTONE PROGRAM, SOME STRING")
+        == 0)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%s\n", Ptr_Glob->variant.var_1.Str_Comp);
+
+    printf("Next_Ptr_Glob->       ");
+    /* %p expects a pointer to void */
+    printf("  Ptr_Comp:       *    %p", (void *)Next_Ptr_Glob->Ptr_Comp);
+    printf(" same as above\n");
+
+    printf("  Discr:       ");
+    if (Next_Ptr_Glob->Discr == 0)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d  ", Next_Ptr_Glob->Discr);
+
+    printf("Enum_Comp:     ");
+    if (Next_Ptr_Glob->variant.var_1.Enum_Comp == 1)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
+
+    printf("  Int_Comp:    ");
+    if (Next_Ptr_Glob->variant.var_1.Int_Comp == 18)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d ", Next_Ptr_Glob->variant.var_1.Int_Comp);
+
+    printf("Str_Comp:      ");
+    if (strcmp(Next_Ptr_Glob->variant.var_1.Str_Comp,
+               "DHRYSTONE PROGRAM, SOME STRING")
+        == 0)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%s\n", Next_Ptr_Glob->variant.var_1.Str_Comp);
+
+    printf("Int_1_Loc:     ");
+    if (Int_1_Loc == 5)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d  ", Int_1_Loc);
+
+    printf("Int_2_Loc:     ");
+    if (Int_2_Loc == 13)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d\n", Int_2_Loc);
+
+    printf("Int_3_Loc:     ");
+    if (Int_3_Loc == 7)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d  ", Int_3_Loc);
+
+    printf("Enum_Loc:      ");
+    if (Enum_Loc == 1)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%d\n", Enum_Loc);
+
+    printf("Str_1_Loc:                             ");
+    if (strcmp(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING") == 0)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%s\n", Str_1_Loc);
+
+    printf("Str_2_Loc:                             ");
+    if (strcmp(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING") == 0)
+        printf("O.K.  ");
+    else
+        printf("WRONG ");
+    printf("%s\n", Str_2_Loc);
+
+    printf("\n");
+
+    if (User_Time < Too_Small_Time) {
+        printf("Measured time too small to obtain meaningful results\n");
+        printf("Please increase number of runs\n");
+        printf("\n");
+    }
+    else {
+        Microseconds = User_Time * Mic_secs_Per_Second / (double)Number_Of_Runs;
+        Dhrystones_Per_Second = (double)Number_Of_Runs / User_Time;
+        Vax_Mips = Dhrystones_Per_Second / 1757.0;
+
+        printf("Microseconds for one run through Dhrystone: ");
+        printf("%lf \n", Microseconds);
+        printf("Dhrystones per Second:                      ");
+        printf("%lf \n", Dhrystones_Per_Second);
+        printf("VAX  MIPS rating =                          ");
+        printf("%lf \n", Vax_Mips);
+        printf("\n");
+    }
+
+    free(Next_Ptr_Glob);
+    free(Ptr_Glob);
+    /* the benchmark completed: report success to the caller */
+    return 0;
+}
+
+/*
+ * Proc_1: copies *Ptr_Glob into the record that Ptr_Val_Par->Ptr_Comp
+ * points to, then updates the integer, pointer and enumeration components
+ * of that record with the help of Proc_3, Proc_6 and Proc_7.
+ * main() calls it with Ptr_Glob as argument.
+ */
+void
+Proc_1(REG Rec_Pointer Ptr_Val_Par)
+/******************/
+
+/* executed once */
+{
+    REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp;
+    /* == Ptr_Glob_Next */
+    /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp,    */
+    /* corresponds to "rename" in Ada, "with" in Pascal           */
+
+    structassign(*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob);
+    Ptr_Val_Par->variant.var_1.Int_Comp = 5;
+    Next_Record->variant.var_1.Int_Comp = Ptr_Val_Par->variant.var_1.Int_Comp;
+    Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp;
+    /* Proc_3 overwrites Next_Record->Ptr_Comp through the pointer and */
+    /* also updates Ptr_Glob->variant.var_1.Int_Comp                   */
+    Proc_3(&Next_Record->Ptr_Comp);
+    /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp
+                        == Ptr_Glob->Ptr_Comp */
+    if (Next_Record->Discr == Ident_1)
+    /* then, executed */
+    {
+        Next_Record->variant.var_1.Int_Comp = 6;
+        Proc_6(Ptr_Val_Par->variant.var_1.Enum_Comp,
+               &Next_Record->variant.var_1.Enum_Comp);
+        Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp;
+        /* Int_Comp = 6 + 2 + 10 == 18, the value main() checks for */
+        Proc_7(Next_Record->variant.var_1.Int_Comp, 10,
+               &Next_Record->variant.var_1.Int_Comp);
+    }
+    else { /* not executed */
+        structassign(*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp);
+    }
+} /* Proc_1 */
+
+/*
+ * Proc_2: when Ch_1_Glob == 'A', stores (*Int_Par_Ref + 10 - 1 - Int_Glob)
+ * back into *Int_Par_Ref.
+ */
+void
+Proc_2(One_Fifty *Int_Par_Ref)
+/******************/
+/* executed once */
+/* *Int_Par_Ref == 1, becomes 5 (1 + 10 - 1 - Int_Glob, with Int_Glob == 5; */
+/* main() checks the result against 5)                                      */
+
+{
+    One_Fifty Int_Loc;
+    Enumeration Enum_Loc;
+
+    Int_Loc = *Int_Par_Ref + 10;
+    /* NOTE(review): Enum_Loc is only assigned inside the "if" branch, so */
+    /* the loop condition would read it unset if Ch_1_Glob were not 'A'.  */
+    /* main() calls Proc_5 (which sets Ch_1_Glob = 'A') at the start of   */
+    /* every run.                                                         */
+    do /* executed once */
+        if (Ch_1_Glob == 'A')
+        /* then, executed */
+        {
+            Int_Loc -= 1;
+            *Int_Par_Ref = Int_Loc - Int_Glob;
+            Enum_Loc = Ident_1;
+        }                        /* if */
+    while (Enum_Loc != Ident_1); /* true */
+} /* Proc_2 */
+
+/*
+ * Proc_3: stores Ptr_Glob->Ptr_Comp into *Ptr_Ref_Par (when Ptr_Glob is
+ * not Null) and sets Ptr_Glob->variant.var_1.Int_Comp to
+ * 10 + 2 + Int_Glob through Proc_7.
+ */
+void
+Proc_3(Rec_Pointer *Ptr_Ref_Par)
+/******************/
+/* executed once */
+/* *Ptr_Ref_Par becomes Ptr_Glob->Ptr_Comp */
+
+{
+    if (Ptr_Glob != Null)
+        /* then, executed */
+        *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp;
+    /* note: Ptr_Glob is dereferenced here regardless of the check above */
+    Proc_7(10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp);
+} /* Proc_3 */
+
+/*
+ * Proc_4: ORs (Ch_1_Glob == 'A') into Bool_Glob and sets Ch_2_Glob to 'B'.
+ */
+void
+Proc_4() /* without parameters */
+/*******/
+/* executed once */
+{
+    Boolean Bool_Loc;
+
+    Bool_Loc = Ch_1_Glob == 'A';
+    Bool_Glob = Bool_Loc | Bool_Glob;
+    Ch_2_Glob = 'B';
+} /* Proc_4 */
+
+/*
+ * Proc_5: resets Ch_1_Glob to 'A' and Bool_Glob to false; main() calls it
+ * first in every run.
+ */
+void
+Proc_5() /* without parameters */
+/*******/
+/* executed once */
+{
+    Ch_1_Glob = 'A';
+    Bool_Glob = false;
+} /* Proc_5 */
+
+/* Procedure for the assignment of structures,          */
+/* if the C compiler doesn't support this feature       */
+/* Only compiled when NOSTRUCTASSIGN is defined; the structassign macro */
+/* in dhry.h then expands to a call of memcpy. Copies l bytes from s to */
+/* d one at a time. Written as an old-style (K&R) definition without a  */
+/* declared return type.                                                */
+/* NOTE(review): dhry.h includes <string.h>, which already declares     */
+/* memcpy; this definition presumably clashes with that declaration     */
+/* when NOSTRUCTASSIGN is defined -- confirm before enabling it.        */
+#ifdef NOSTRUCTASSIGN
+memcpy(d, s, l) register char *d;
+register char *s;
+register int l;
+{
+    while (l--)
+        *d++ = *s++;
+}
+#endif

+ 187 - 0
tests/benchmarks/dhrystone/src/dhry_2.c

@@ -0,0 +1,187 @@
+/*
+ *************************************************************************
+ *
+ *                   "DHRYSTONE" Benchmark Program
+ *                   -----------------------------
+ *
+ *  Version:    C, Version 2.1
+ *
+ *  File:       dhry_2.c (part 3 of 3)
+ *
+ *  Date:       May 25, 1988
+ *
+ *  Author:     Reinhold P. Weicker
+ *
+ *************************************************************************
+ */
+
+#include "dhry.h"
+
+/*
+ * REG expands to "register" when register variables are requested, i.e.
+ * when ROPT is defined (the switch dhry_1.c tests) or when REG itself was
+ * predefined by the build; otherwise it expands to nothing.
+ * A predefined REG is dropped first so that it is not redefined with a
+ * different replacement.
+ */
+#if defined(ROPT) || defined(REG)
+#undef REG
+#define REG register
+#else
+#define REG
+/* REG becomes defined as empty */
+/* i.e. no register variables   */
+#endif
+
+extern int Int_Glob;
+extern char Ch_1_Glob;
+
+Boolean
+Func_3(Enumeration Enum_Par_Val);
+
+/*
+ * Proc_6: maps Enum_Val_Par to a result stored in *Enum_Ref_Par:
+ *   Ident_1 -> Ident_1
+ *   Ident_2 -> Ident_1 if Int_Glob > 100, else Ident_4
+ *   Ident_3 -> Ident_2
+ *   Ident_4 -> Ident_4 (set before the switch, as Func_3 returns false)
+ *   Ident_5 -> Ident_3
+ */
+void
+Proc_6(Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par)
+/*********************************/
+/* executed once */
+/* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */
+
+{
+    *Enum_Ref_Par = Enum_Val_Par;
+    /* Func_3 returns true only for Ident_3 */
+    if (!Func_3(Enum_Val_Par))
+        /* then, not executed */
+        *Enum_Ref_Par = Ident_4;
+    switch (Enum_Val_Par) {
+        case Ident_1:
+            *Enum_Ref_Par = Ident_1;
+            break;
+        case Ident_2:
+            if (Int_Glob > 100)
+                /* then */
+                *Enum_Ref_Par = Ident_1;
+            else
+                *Enum_Ref_Par = Ident_4;
+            break;
+        case Ident_3: /* executed */
+            *Enum_Ref_Par = Ident_2;
+            break;
+        case Ident_4:
+            break;
+        case Ident_5:
+            *Enum_Ref_Par = Ident_3;
+            break;
+    } /* switch */
+} /* Proc_6 */
+
+/*
+ * Proc_7: stores Int_1_Par_Val + 2 + Int_2_Par_Val into *Int_Par_Ref.
+ */
+void
+Proc_7(One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val, One_Fifty *Int_Par_Ref)
+/**********************************************/
+/* executed three times                                      */
+/* first call:      Int_1_Par_Val == 2, Int_2_Par_Val == 3,  */
+/*                  Int_Par_Ref becomes 7                    */
+/* second call:     Int_1_Par_Val == 10, Int_2_Par_Val == 5, */
+/*                  Int_Par_Ref becomes 17                   */
+/* third call:      Int_1_Par_Val == 6, Int_2_Par_Val == 10, */
+/*                  Int_Par_Ref becomes 18                   */
+
+{
+    One_Fifty Int_Loc;
+
+    Int_Loc = Int_1_Par_Val + 2;
+    *Int_Par_Ref = Int_2_Par_Val + Int_Loc;
+} /* Proc_7 */
+
+/*
+ * Proc_8: writes a few elements of the one- and two-dimensional arrays
+ * at indices derived from Int_1_Par_Val + 5 and sets Int_Glob to 5.
+ * With the values passed by main() (3 and 7) the base index is 8, so
+ * Arr_1_Par_Ref[8] becomes 7 and Arr_2_Par_Ref[8][7] is incremented by
+ * one per call; main() checks both afterwards.
+ */
+void
+Proc_8(Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref, int Int_1_Par_Val,
+       int Int_2_Par_Val)
+/*********************************************************************/
+/* executed once      */
+/* Int_1_Par_Val == 3 */
+/* Int_2_Par_Val == 7 */
+
+{
+    REG One_Fifty Int_Index;
+    REG One_Fifty Int_Loc;
+
+    Int_Loc = Int_1_Par_Val + 5;
+    Arr_1_Par_Ref[Int_Loc] = Int_2_Par_Val;
+    Arr_1_Par_Ref[Int_Loc + 1] = Arr_1_Par_Ref[Int_Loc];
+    Arr_1_Par_Ref[Int_Loc + 30] = Int_Loc;
+    /* fills columns Int_Loc and Int_Loc + 1 of row Int_Loc */
+    for (Int_Index = Int_Loc; Int_Index <= Int_Loc + 1; ++Int_Index)
+        Arr_2_Par_Ref[Int_Loc][Int_Index] = Int_Loc;
+    Arr_2_Par_Ref[Int_Loc][Int_Loc - 1] += 1;
+    Arr_2_Par_Ref[Int_Loc + 20][Int_Loc] = Arr_1_Par_Ref[Int_Loc];
+    Int_Glob = 5;
+} /* Proc_8 */
+
+/*
+ * Func_1: returns Ident_1 when the two characters differ; when they are
+ * equal it stores the character in Ch_1_Glob and returns Ident_2.
+ */
+Enumeration
+Func_1(Capital_Letter Ch_1_Par_Val, Capital_Letter Ch_2_Par_Val)
+/*************************************************/
+/* executed three times                                         */
+/* first call:      Ch_1_Par_Val == 'R', Ch_2_Par_Val == 'Y'    */
+/*                  (Func_2 passes Str_1_Par_Ref[2] and         */
+/*                  Str_2_Par_Ref[3] of "DHRYSTONE ...")        */
+/* second call:     Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C'    */
+/* third call:      Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C'    */
+
+{
+    Capital_Letter Ch_1_Loc;
+    Capital_Letter Ch_2_Loc;
+
+    Ch_1_Loc = Ch_1_Par_Val;
+    Ch_2_Loc = Ch_1_Loc;
+    if (Ch_2_Loc != Ch_2_Par_Val)
+        /* then, executed */
+        return (Ident_1);
+    else /* not executed */
+    {
+        Ch_1_Glob = Ch_1_Loc;
+        return (Ident_2);
+    }
+} /* Func_1 */
+
+/*
+ * Func_2: compares the two strings. Returns true when Ch_Loc == 'R' or
+ * when strcmp() reports Str_1_Par_Ref greater than Str_2_Par_Ref (in that
+ * case Int_Glob is also overwritten); returns false otherwise.
+ */
+Boolean
+Func_2(Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref)
+/*************************************************/
+/* executed once */
+/* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */
+/* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */
+
+{
+    REG One_Thirty Int_Loc;
+    Capital_Letter Ch_Loc;
+
+    Int_Loc = 2;
+    /* Int_Loc is only incremented inside the "if" branch, so the loop */
+    /* can only be left after Ch_Loc has been assigned there.          */
+    while (Int_Loc <= 2) /* loop body executed once */
+        if (Func_1(Str_1_Par_Ref[Int_Loc], Str_2_Par_Ref[Int_Loc + 1])
+            == Ident_1)
+        /* then, executed */
+        {
+            Ch_Loc = 'A';
+            Int_Loc += 1;
+        } /* if, while */
+    if (Ch_Loc >= 'W' && Ch_Loc < 'Z')
+        /* then, not executed */
+        Int_Loc = 7;
+    if (Ch_Loc == 'R')
+        /* then, not executed */
+        return (true);
+    else /* executed */
+    {
+        if (strcmp(Str_1_Par_Ref, Str_2_Par_Ref) > 0)
+        /* then, not executed */
+        {
+            Int_Loc += 7;
+            Int_Glob = Int_Loc;
+            return (true);
+        }
+        else /* executed */
+            return (false);
+    } /* if Ch_Loc */
+} /* Func_2 */
+
+/*
+ * Func_3: returns true when Enum_Par_Val is Ident_3, false otherwise.
+ */
+Boolean
+Func_3(Enumeration Enum_Par_Val)
+/***************************/
+/* executed once        */
+/* Enum_Par_Val == Ident_3 */
+
+{
+    Enumeration Enum_Loc;
+
+    Enum_Loc = Enum_Par_Val;
+    if (Enum_Loc == Ident_3)
+        /* then, executed */
+        return (true);
+    else /* not executed */
+        return (false);
+} /* Func_3 */

+ 113 - 13
tests/benchmarks/jetstream/build.sh

@@ -3,27 +3,45 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+source /opt/emsdk/emsdk_env.sh
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+
 OUT_DIR=$PWD/out
 WAMRC_CMD=$PWD/../../../wamr-compiler/build/wamrc
 
 mkdir -p jetstream
+mkdir -p tsf-src
 mkdir -p ${OUT_DIR}
 
+if [[ $1 != "--no-simd" ]];then
+    NATIVE_SIMD_FLAGS="-msse2 -msse3 -msse4"
+    WASM_SIMD_FLAGS="-msimd128 -msse2 -msse3 -msse4"
+else
+    NATIVE_SIMD_FLAGS=""
+    WASM_SIMD_FLAGS=""
+fi
+
 cd jetstream
 
 echo "Download source files .."
-wget https://browserbench.org/JetStream/wasm/gcc-loops.cpp
-wget https://browserbench.org/JetStream/wasm/quicksort.c
-wget https://browserbench.org/JetStream/wasm/HashSet.cpp
-wget https://browserbench.org/JetStream/simple/float-mm.c
+# Stop with a failing status as soon as any download fails. Checking $?
+# once after all four commands only saw the last one, and a bare `exit`
+# after the `[[ ... ]]` test returned that test's status, i.e. 0.
+wget -N https://browserbench.org/JetStream/wasm/gcc-loops.cpp || exit 1
+wget -N https://browserbench.org/JetStream/wasm/quicksort.c || exit 1
+wget -N https://browserbench.org/JetStream/wasm/HashSet.cpp || exit 1
+wget -N https://browserbench.org/JetStream/simple/float-mm.c || exit 1
 
-patch -p1 < ../jetstream.patch
+echo "Patch source files .."
+patch -p1 -N < ../jetstream.patch
 
 echo "Build gcc-loops with g++ .."
-g++ -O3 -msse2 -msse3 -msse4 -o ${OUT_DIR}/gcc-loops_native gcc-loops.cpp
+g++ -O3 ${NATIVE_SIMD_FLAGS} -o ${OUT_DIR}/gcc-loops_native gcc-loops.cpp
 
 echo "Build gcc-loops with em++ .."
-em++ -O3 -s STANDALONE_WASM=1 -msimd128 \
+em++ -O3 -s STANDALONE_WASM=1 ${WASM_SIMD_FLAGS} \
          -s INITIAL_MEMORY=1048576 \
          -s TOTAL_STACK=32768 \
          -s "EXPORTED_FUNCTIONS=['_main']" \
@@ -33,11 +51,16 @@ em++ -O3 -s STANDALONE_WASM=1 -msimd128 \
 echo "Compile gcc-loops.wasm to gcc-loops.aot"
 ${WAMRC_CMD} -o ${OUT_DIR}/gcc-loops.aot ${OUT_DIR}/gcc-loops.wasm
 
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Compile gcc-loops.wasm to gcc-loops_segue.aot"
+    ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/gcc-loops_segue.aot ${OUT_DIR}/gcc-loops.wasm
+fi
+
 echo "Build quicksort with gcc .."
-gcc -O3 -msse2 -msse3 -msse4 -o ${OUT_DIR}/quicksort_native quicksort.c
+gcc -O3 ${NATIVE_SIMD_FLAGS} -o ${OUT_DIR}/quicksort_native quicksort.c
 
 echo "Build quicksort with emcc .."
-emcc -O3 -s STANDALONE_WASM=1 -msimd128 \
+emcc -O3 -s STANDALONE_WASM=1 ${WASM_SIMD_FLAGS} \
          -s INITIAL_MEMORY=1048576 \
          -s TOTAL_STACK=32768 \
          -s "EXPORTED_FUNCTIONS=['_main']" \
@@ -46,12 +69,17 @@ emcc -O3 -s STANDALONE_WASM=1 -msimd128 \
 echo "Compile quicksort.wasm to quicksort.aot"
 ${WAMRC_CMD} -o ${OUT_DIR}/quicksort.aot ${OUT_DIR}/quicksort.wasm
 
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Compile quicksort.wasm to quicksort_segue.aot"
+    ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/quicksort_segue.aot ${OUT_DIR}/quicksort.wasm
+fi
+
 echo "Build HashSet with g++ .."
-g++ -O3 -msse2 -msse3 -msse4 -o ${OUT_DIR}/HashSet_native HashSet.cpp \
+g++ -O3 ${NATIVE_SIMD_FLAGS} -o ${OUT_DIR}/HashSet_native HashSet.cpp \
         -lstdc++
 
 echo "Build HashSet with em++ .."
-em++ -O3 -s STANDALONE_WASM=1 -msimd128 \
+em++ -O3 -s STANDALONE_WASM=1 ${WASM_SIMD_FLAGS} \
          -s INITIAL_MEMORY=1048576 \
          -s TOTAL_STACK=32768 \
          -s "EXPORTED_FUNCTIONS=['_main']" \
@@ -60,11 +88,16 @@ em++ -O3 -s STANDALONE_WASM=1 -msimd128 \
 echo "Compile HashSet.wasm to HashSet.aot"
 ${WAMRC_CMD} -o ${OUT_DIR}/HashSet.aot ${OUT_DIR}/HashSet.wasm
 
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Compile HashSet.wasm to HashSet_segue.aot"
+    ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/HashSet_segue.aot ${OUT_DIR}/HashSet.wasm
+fi
+
 echo "Build float-mm with gcc .."
-gcc -O3 -msse2 -msse3 -msse4 -o ${OUT_DIR}/float-mm_native float-mm.c
+gcc -O3 ${NATIVE_SIMD_FLAGS} -o ${OUT_DIR}/float-mm_native float-mm.c
 
 echo "Build float-mm with emcc .."
-emcc -O3 -s STANDALONE_WASM=1 -msimd128 \
+emcc -O3 -s STANDALONE_WASM=1 ${WASM_SIMD_FLAGS} \
          -s INITIAL_MEMORY=1048576 \
          -s TOTAL_STACK=32768 \
          -s "EXPORTED_FUNCTIONS=['_main']" \
@@ -72,3 +105,70 @@ emcc -O3 -s STANDALONE_WASM=1 -msimd128 \
 
 echo "Compile float-mm.wasm to float-mm.aot"
 ${WAMRC_CMD} -o ${OUT_DIR}/float-mm.aot ${OUT_DIR}/float-mm.wasm
+
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Compile float-mm.wasm to float-mm_segue.aot"
+    ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/float-mm_segue.aot ${OUT_DIR}/float-mm.wasm
+fi
+
+cd ../tsf-src
+
+tsf_srcs="tsf_asprintf.c tsf_buffer.c tsf_error.c tsf_reflect.c tsf_st.c \
+          tsf_type.c tsf_io.c tsf_native.c tsf_generator.c tsf_st_typetable.c \
+          tsf_parser.c tsf_buf_writer.c tsf_buf_reader.c tsf_primitive.c \
+          tsf_type_table.c tsf_copier.c tsf_destructor.c tsf_gpc_code_gen.c \
+          gpc_code_gen_util.c gpc_threaded.c gpc_intable.c gpc_instruction.c \
+          gpc_program.c gpc_proto.c gpc_stack_height.c tsf_serial_in_man.c \
+          tsf_serial_out_man.c tsf_type_in_map.c tsf_type_out_map.c \
+          tsf_stream_file_input.c tsf_stream_file_output.c tsf_sort.c \
+          tsf_version.c tsf_named_type.c tsf_io_utils.c tsf_zip_attr.c \
+          tsf_zip_reader.c tsf_zip_writer.c tsf_zip_abstract.c tsf_limits.c \
+          tsf_ra_type_man.c tsf_adaptive_reader.c tsf_sha1.c tsf_sha1_writer.c \
+          tsf_fsdb.c tsf_fsdb_protocol.c tsf_define_helpers.c tsf_ir.c \
+          tsf_ir_different.c tsf_ir_speed.c"
+
+tsf_files="${tsf_srcs} config.h gpc_worklist.h \
+           tsf_config_stub.h tsf.h tsf_internal.h tsf_region.h tsf_types.h \
+           gpc.h tsf_atomics.h tsf_define_helpers.h tsf_indent.h tsf_inttypes.h \
+           tsf_serial_protocol.h tsf_util.h gpc_int_common.h tsf_build_defines.h \
+           tsf_format.h tsf_internal_config.h tsf_ir_different.h tsf_sha1.h \
+           tsf_zip_abstract.h gpc_internal.h tsf_config.h tsf_fsdb_protocol.h \
+           tsf_internal_config_stub.h tsf_ir.h tsf_st.h \
+           gpc_instruction_dispatch.gen gpc_instruction_stack_effects.gen \
+           gpc_instruction_to_string.gen gpc_instruction_size.gen \
+           gpc_instruction_static_size.gen gpc_interpreter.gen"
+
+echo "Download tsf source files .."
+for t in ${tsf_files}
+do
+    # Exit with a failing status on a failed download; a bare `exit` after
+    # the `[[ ... ]]` test would return that test's status, i.e. 0.
+    wget -N "https://browserbench.org/JetStream/wasm/TSF/${t}" || exit 1
+done
+
+patch -p1 -N < ../tsf.patch
+
+echo "Build tsf with gcc .."
+gcc \
+    -o ${OUT_DIR}/tsf_native -O3 ${NATIVE_SIMD_FLAGS} \
+    -I. -DTSF_BUILD_SYSTEM=1 \
+    ${tsf_srcs} -lm
+
+echo "Build tsf standalone with wasi-sdk .."
+/opt/wasi-sdk/bin/clang -O3 ${WASM_SIMD_FLAGS} -z stack-size=1048576 \
+    -Wl,--initial-memory=52428800 \
+    -Wl,--export=main \
+    -Wl,--export=__heap_base,--export=__data_end \
+    -I. -DTSF_BUILD_SYSTEM=1 \
+    -Wl,--allow-undefined \
+    -o ${OUT_DIR}/tsf.wasm \
+    ${tsf_srcs}
+
+echo "Compile tsf.wasm to tsf.aot"
+${WAMRC_CMD} -o ${OUT_DIR}/tsf.aot ${OUT_DIR}/tsf.wasm
+
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo "Compile tsf.wasm to tsf_segue.aot"
+    ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/tsf_segue.aot ${OUT_DIR}/tsf.wasm
+fi

+ 9 - 6
tests/benchmarks/jetstream/jetstream.patch

@@ -1,15 +1,18 @@
 diff -urN jetstream-org/HashSet.cpp jetstream/HashSet.cpp
---- jetstream-org/HashSet.cpp	2020-10-30 04:12:42.000000000 +0800
-+++ jetstream/HashSet.cpp	2022-01-24 17:11:08.619831711 +0800
-@@ -24,6 +24,7 @@
+--- jetstream-org/HashSet.cpp   2020-10-30 04:12:42.000000000 +0800
++++ jetstream/HashSet.cpp   2022-01-24 17:11:08.619831711 +0800
+@@ -22,8 +22,10 @@
+
+ #include <algorithm>
  #include <memory>
++#include <limits>
  #include <stdio.h>
  #include <stdlib.h>
 +#include <string.h>
  #include <sys/time.h>
- 
+
  // Compile with: xcrun clang++ -o HashSet HashSet.cpp -O2 -W -framework Foundation -licucore -std=c++11 -fvisibility=hidden -DNDEBUG=1
-@@ -76,7 +77,7 @@
+@@ -76,7 +78,7 @@
  inline ToType bitwise_cast(FromType from)
  {
      typename std::remove_const<ToType>::type to { };
@@ -17,4 +20,4 @@ diff -urN jetstream-org/HashSet.cpp jetstream/HashSet.cpp
 +    memcpy(&to, &from, sizeof(to));
      return to;
  }
- 
+

+ 15 - 3
tests/benchmarks/jetstream/run_aot.sh

@@ -3,6 +3,8 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+PLATFORM=$(uname -s | tr A-Z a-z)
+
 CUR_DIR=$PWD
 OUT_DIR=$CUR_DIR/out
 REPORT=$CUR_DIR/report.txt
@@ -13,7 +15,7 @@ IWASM_CMD=$CUR_DIR/../../../product-mini/platforms/${PLATFORM}/build/iwasm
 
 BENCH_NAME_MAX_LEN=20
 
-JETSTREAM_CASES="gcc-loops quicksort HashSet float-mm"
+JETSTREAM_CASES="gcc-loops HashSet tsf float-mm quicksort"
 
 rm -f $REPORT
 touch $REPORT
@@ -34,7 +36,11 @@ echo "Start to run cases, the result is written to report.txt"
 
 #run benchmarks
 cd $OUT_DIR
-echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo -en "\t\t\t\t\t  native\tiwasm-aot\tiwasm-aot-segue\n" >> $REPORT
+else
+    echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+fi
 
 for t in $JETSTREAM_CASES
 do
@@ -46,7 +52,13 @@ do
 
     echo "run $t with iwasm aot .."
     echo -en "\t" >> $REPORT
-    $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD --dir=. ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    if [[ ${PLATFORM} == "linux" ]]; then
+        echo "run $t with iwasm aot segue .."
+        echo -en "\t" >> $REPORT
+        $TIME -f "real-%e-time" $IWASM_CMD --dir=. ${t}_segue.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+    fi
 
     echo -en "\n" >> $REPORT
 done

+ 24 - 0
tests/benchmarks/jetstream/tsf.patch

@@ -0,0 +1,24 @@
+diff -urN tsf-src-org/tsf_internal.h tsf-src/tsf_internal.h
+--- tsf-src-org/tsf_internal.h  2023-03-31 10:49:45.000000000 +0800
++++ tsf-src/tsf_internal.h  2023-05-11 08:18:35.000000000 +0800
+@@ -429,6 +429,7 @@
+ #endif
+             tsf_fsdb_connection_t *connection;
+ #endif
++            uint32_t __padding;
+         } remote;
+     } u;
+     tsf_limits_t *limits;
+diff -urN tsf-src-org/tsf_ir_speed.c tsf-src/tsf_ir_speed.c
+--- tsf-src-org/tsf_ir_speed.c  2023-03-31 10:49:45.000000000 +0800
++++ tsf-src/tsf_ir_speed.c  2023-05-11 08:18:35.000000000 +0800
+@@ -63,6 +63,9 @@
+         Program_t *program;
+         unsigned elementIndex;
+
++        if (!(programIndex % 100))
++            printf("##programIndex: %u\n", programIndex);
++
+         CS(program = tsf_region_create(sizeof(Program_t)));
+
+         program->globals.len = numDecls + numDefns;

+ 10 - 1
tests/benchmarks/libsodium/build.sh

@@ -16,6 +16,8 @@ libsodium_CASES="aead_aes256gcm2 aead_aes256gcm aead_chacha20poly13052 aead_chac
                  sodium_utils3 sodium_utils sodium_version stream2 stream3 stream4 stream verify1 \
                  xchacha20"
 
+PLATFORM=$(uname -s | tr A-Z a-z)
+
 readonly WAMRC_CMD=$PWD/../../../wamr-compiler/build/wamrc
 readonly OUT_DIR=$PWD/libsodium/zig-out/bin
 
@@ -34,9 +36,16 @@ zig build -Drelease-fast -Denable_benchmarks=true -Dtarget=wasm32-wasi
 for case in ${libsodium_CASES}
 do
     ${WAMRC_CMD} -o ${OUT_DIR}/${case}.aot ${OUT_DIR}/${case}.wasm
-
     if [ "$?" != 0 ]; then
         echo -e "Error while compiling ${case}.wasm to ${case}.aot"
         exit
     fi
+
+    if [[ ${PLATFORM} == "linux" ]]; then
+        ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/${case}_segue.aot ${OUT_DIR}/${case}.wasm
+        if [ "$?" != 0 ]; then
+            echo -e "Error while compiling ${case}.wasm to ${case}_segue.aot"
+            exit
+        fi
+    fi
 done

+ 39 - 6
tests/benchmarks/libsodium/test_aot.sh → tests/benchmarks/libsodium/run_aot.sh

@@ -13,12 +13,14 @@ libsodium_CASES="aead_aes256gcm2 aead_aes256gcm aead_chacha20poly13052 aead_chac
                  scalarmult6 scalarmult7 scalarmult8 scalarmult_ed25519 scalarmult_ristretto255 \
                  scalarmult secretbox2 secretbox7 secretbox8 secretbox_easy2 secretbox_easy \
                  secretbox secretstream shorthash sign siphashx24 sodium_core sodium_utils2 \
-                 sodium_utils3 sodium_utils sodium_version stream2 stream3 stream4 stream verify1 \
-                 xchacha20"
+                 sodium_utils stream2 stream3 stream4 stream verify1 xchacha20"
+
+PLATFORM=$(uname -s | tr A-Z a-z)
 
 readonly OUT_DIR=$PWD/libsodium/zig-out/bin
 readonly REPORT=$PWD/report.txt
-readonly IWASM_CMD=$PWD/../../../product-mini/platforms/linux/build/iwasm
+readonly IWASM_CMD=$PWD/../../../product-mini/platforms/${PLATFORM}/build/iwasm
+readonly TIME=/usr/bin/time
 
 BENCH_NAME_MAX_LEN=20
 
@@ -40,7 +42,11 @@ function print_bench_name()
 # run benchmarks
 cd $OUT_DIR
 
-echo -en "\t\t\t\t\t\tnative\tiwasm-aot\n" >> $REPORT
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo -en "\t\t\t\t\t\tnative\tiwasm-aot\tiwasm-aot-segue\n" >> $REPORT
+else
+    echo -en "\t\t\t\t\t\tnative\tiwasm-aot\n" >> $REPORT
+fi
 
 for t in $libsodium_CASES
 do
@@ -48,11 +54,38 @@ do
 
     echo "run $t with native..."
     echo -en "\t" >> $REPORT
-    ./${t} | awk -F '-' 'BEGIN{FIELDWIDTHS="10"}{ORS=""; print $1 / 1000000.0}' >> $REPORT
+    if [[ $t != "sodium_utils2" ]]; then
+        ./${t} | awk '{printf "%-10.2f", $0/1000000.0}' >> $REPORT
+    else
+        # sodium_utils2 doesn't print the result,
+        # use time command to get result instead
+        $TIME -f "real-%e-time" ./${t} 2>&1 | grep "real-.*-time" |
+            awk -F '-' '{printf "%-10.2f", $2}' >> $REPORT
+    fi
 
     echo "run $t with iwasm aot..."
     echo -en "\t  \t" >> $REPORT
-    $IWASM_CMD ${t}.aot | awk -F '-' 'BEGIN{FIELDWIDTHS="10"}{ORS=""; print $1 / 1000000.0}' >> $REPORT
+    if [[ $t != "sodium_utils2" ]]; then
+        $IWASM_CMD ${t}.aot | awk '{printf "%-10.2f", $0/1000000.0}' >> $REPORT
+    else
+        # sodium_utils2 doesn't print the result,
+        # use time command to get result instead
+        $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" |
+            awk -F '-' '{printf "%-10.2f", $2}' >> $REPORT
+    fi
+
+    if [[ ${PLATFORM} == "linux" ]]; then
+        echo "run $t with iwasm aot segue..."
+        echo -en "\t  \t" >> $REPORT
+        if [[ $t != "sodium_utils2" ]]; then
+            $IWASM_CMD ${t}_segue.aot | awk '{printf "%.2f", $0/1000000.0}' >> $REPORT
+        else
+            # sodium_utils2 doesn't print the result,
+            # use time command to get result instead
+            $TIME -f "real-%e-time" $IWASM_CMD ${t}_segue.aot 2>&1 | grep "real-.*-time" |
+                awk -F '-' '{printf "%.2f", $2}' >> $REPORT
+        fi
+    fi
 
     echo -en "\n" >> $REPORT
 done

+ 8 - 0
tests/benchmarks/polybench/build.sh

@@ -3,6 +3,8 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+PLATFORM=$(uname -s | tr A-Z a-z)
+
 OUT_DIR=$PWD/out
 WAMRC_CMD=$PWD/../../../wamr-compiler/build/wamrc
 POLYBENCH_CASES="datamining linear-algebra medley stencils"
@@ -40,6 +42,12 @@ do
         echo "Compile ${file_name%.*}.wasm into ${file_name%.*}.aot"
         ${WAMRC_CMD} -o ${OUT_DIR}/${file_name%.*}.aot \
                 ${OUT_DIR}/${file_name%.*}.wasm
+
+        if [[ ${PLATFORM} == "linux" ]]; then
+            echo "Compile ${file_name%.*}.wasm into ${file_name%.*}_segue.aot"
+            ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/${file_name%.*}_segue.aot \
+                    ${OUT_DIR}/${file_name%.*}.wasm
+        fi
     done
 done
 

+ 11 - 1
tests/benchmarks/polybench/run_aot.sh

@@ -37,7 +37,11 @@ echo "Start to run cases, the result is written to report.txt"
 
 #run benchmarks
 cd $OUT_DIR
-echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo -en "\t\t\t\t\t  native\tiwasm-aot\tiwasm-aot-segue\n" >> $REPORT
+else
+    echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+fi
 
 for t in $POLYBENCH_CASES
 do
@@ -51,5 +55,11 @@ do
     echo -en "\t" >> $REPORT
     $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
 
+    if [[ ${PLATFORM} == "linux" ]]; then
+        echo "run $t with iwasm aot segue .."
+        echo -en "\t" >> $REPORT
+        $TIME -f "real-%e-time" $IWASM_CMD ${t}_segue.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+    fi
+
     echo -en "\n" >> $REPORT
 done

+ 1 - 1
tests/benchmarks/polybench/run_interp.sh

@@ -37,7 +37,7 @@ echo "Start to run cases, the result is written to report.txt"
 
 #run benchmarks
 cd $OUT_DIR
-echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+echo -en "\t\t\t\t\t  native\tiwasm-interp\n" >> $REPORT
 
 for t in $POLYBENCH_CASES
 do

+ 6 - 1
tests/benchmarks/sightglass/build.sh

@@ -3,6 +3,8 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+PLATFORM=$(uname -s | tr A-Z a-z)
+
 OUT_DIR=$PWD/out
 WAMRC_CMD=$PWD/../../../wamr-compiler/build/wamrc
 SHOOTOUT_CASES="base64 fib2 gimli heapsort matrix memmove nestedloop \
@@ -34,9 +36,12 @@ do
         -Wl,--export=app_main -Wl,--export=_start \
         ${bench}.c main/main_${bench}.c main/my_libc.c
 
-
     echo "Compile ${bench}.wasm into ${bench}.aot"
     ${WAMRC_CMD} -o ${OUT_DIR}/${bench}.aot ${OUT_DIR}/${bench}.wasm
+    if [[ ${PLATFORM} == "linux" ]]; then
+        echo "Compile ${bench}.wasm into ${bench}_segue.aot"
+        ${WAMRC_CMD} --enable-segue -o ${OUT_DIR}/${bench}_segue.aot ${OUT_DIR}/${bench}.wasm
+    fi
 done
 
 cd ..

+ 11 - 1
tests/benchmarks/sightglass/run_aot.sh

@@ -36,7 +36,11 @@ echo "Start to run cases, the result is written to report.txt"
 
 #run benchmarks
 cd $OUT_DIR
-echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+if [[ ${PLATFORM} == "linux" ]]; then
+    echo -en "\t\t\t\t\t  native\tiwasm-aot\tiwasm-aot-segue\n" >> $REPORT
+else
+    echo -en "\t\t\t\t\t  native\tiwasm-aot\n" >> $REPORT
+fi
 
 for t in $SHOOTOUT_CASES
 do
@@ -50,5 +54,11 @@ do
     echo -en "\t" >> $REPORT
     $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
 
+    if [[ ${PLATFORM} == "linux" ]]; then
+        echo "run $t with iwasm aot segue .."
+        echo -en "\t" >> $REPORT
+        $TIME -f "real-%e-time" $IWASM_CMD ${t}_segue.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+    fi
+
     echo -en "\n" >> $REPORT
 done

+ 2 - 2
tests/benchmarks/sightglass/run_interp.sh

@@ -46,9 +46,9 @@ do
     echo -en "\t" >> $REPORT
     $TIME -f "real-%e-time" ./${t}_native 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
 
-    echo "run $t with iwasm aot .."
+    echo "run $t with iwasm interp .."
     echo -en "\t" >> $REPORT
-    $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD ${t}.wasm 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
 
     echo -en "\n" >> $REPORT
 done

+ 67 - 1
wamr-compiler/main.c

@@ -65,6 +65,12 @@ print_help()
     printf("  --enable-indirect-mode    Enalbe call function through symbol table but not direct call\n");
     printf("  --disable-llvm-intrinsics Disable the LLVM built-in intrinsics\n");
     printf("  --disable-llvm-lto        Disable the LLVM link time optimization\n");
+    printf("  --enable-segue[=<flags>]  Enable using segment register GS as the base address of linear memory,\n");
+    printf("                            only available on linux/linux-sgx x86-64, which may improve performance,\n");
+    printf("                            flags can be: i32.load, i64.load, f32.load, f64.load, v128.load,\n");
+    printf("                                          i32.store, i64.store, f32.store, f64.store, v128.store\n");
+    printf("                            Use comma to separate, e.g. --enable-segue=i32.load,i64.store\n");
+    printf("                            and --enable-segue means all flags are added.\n");
     printf("  --emit-custom-sections=<section names>\n");
     printf("                            Emit the specified custom sections to AoT file, using comma to separate\n");
     printf("                            multiple names, e.g.\n");
@@ -84,7 +90,7 @@ print_help()
     } while (0)
 
 /**
- * Split a strings into an array of strings
+ * Split a string into an array of strings
  * Returns NULL on failure
  * Memory must be freed by caller
  * Based on: http://stackoverflow.com/a/11198630/471795
@@ -126,6 +132,57 @@ split_string(char *str, int *count, const char *delimer)
     return res;
 }
 
+/**
+ * Translate a comma separated list of segue flag names into a bit mask.
+ *
+ * Load flags occupy bits 0-4 and store flags occupy bits 8-12, each group
+ * in the order i32, i64, f32, f64, v128.
+ *
+ * @param str_flags the flag list, e.g. "i32.load,i64.store"; it is handed
+ *                  to split_string() for tokenizing
+ *
+ * @return the combined bit mask, 0 when split_string() returns NULL,
+ *         or (uint32)-1 when any name in the list is not recognized
+ */
+static uint32
+resolve_segue_flags(char *str_flags)
+{
+    static const struct {
+        const char *name;
+        uint32 mask;
+    } known_flags[] = {
+        { "i32.load", 1 << 0 },   { "i64.load", 1 << 1 },
+        { "f32.load", 1 << 2 },   { "f64.load", 1 << 3 },
+        { "v128.load", 1 << 4 },  { "i32.store", 1 << 8 },
+        { "i64.store", 1 << 9 },  { "f32.store", 1 << 10 },
+        { "f64.store", 1 << 11 }, { "v128.store", 1 << 12 },
+    };
+    const uint32 known_count = sizeof(known_flags) / sizeof(known_flags[0]);
+    uint32 result = 0, k;
+    int32 n_tokens, t;
+    char **tokens = split_string(str_flags, &n_tokens, ",");
+
+    if (!tokens)
+        return result;
+
+    for (t = 0; t < n_tokens; t++) {
+        /* look the token up in the table of recognized names */
+        for (k = 0; k < known_count; k++) {
+            if (!strcmp(tokens[t], known_flags[k].name))
+                break;
+        }
+
+        if (k == known_count) {
+            /* invalid flag: the whole list is reported as invalid */
+            result = (uint32)-1;
+            break;
+        }
+
+        result |= known_flags[k].mask;
+    }
+
+    free(tokens);
+    return result;
+}
+
 int
 main(int argc, char *argv[])
 {
@@ -272,6 +329,15 @@ main(int argc, char *argv[])
         else if (!strcmp(argv[0], "--disable-llvm-lto")) {
             option.disable_llvm_lto = true;
         }
+        else if (!strcmp(argv[0], "--enable-segue")) {
+            /* all flags are enabled */
+            option.segue_flags = 0x1F1F;
+        }
+        else if (!strncmp(argv[0], "--enable-segue=", 15)) {
+            option.segue_flags = resolve_segue_flags(argv[0] + 15);
+            if (option.segue_flags == (uint32)-1)
+                PRINT_HELP_AND_EXIT();
+        }
         else if (!strncmp(argv[0], "--emit-custom-sections=", 23)) {
             int len = 0;
             if (option.custom_sections) {