Просмотр исходного кода

Import SIMD feature and add some workload samples (#438)

Wenyong Huang 5 лет назад
Родитель
Сommit
a3074df21b
84 измененных файлов с 7780 добавлено и 318 удалено
  1. 1 0
      README.md
  2. 4 0
      build-scripts/config_common.cmake
  3. 5 0
      core/config.h
  4. 1 1
      core/deps/download.sh
  5. 19 3
      core/iwasm/aot/aot_loader.c
  6. 6 0
      core/iwasm/aot/aot_runtime.c
  7. 1 0
      core/iwasm/aot/aot_runtime.h
  8. 64 0
      core/iwasm/common/arch/invokeNative_em64_simd.s
  9. 11 3
      core/iwasm/common/iwasm_common.cmake
  10. 153 8
      core/iwasm/common/wasm_runtime_common.c
  11. 3 0
      core/iwasm/common/wasm_runtime_common.h
  12. 9 0
      core/iwasm/compilation/aot.c
  13. 14 0
      core/iwasm/compilation/aot.h
  14. 735 6
      core/iwasm/compilation/aot_compiler.c
  15. 55 0
      core/iwasm/compilation/aot_compiler.h
  16. 41 4
      core/iwasm/compilation/aot_emit_aot_file.c
  17. 13 23
      core/iwasm/compilation/aot_emit_control.c
  18. 4 22
      core/iwasm/compilation/aot_emit_exception.c
  19. 2 18
      core/iwasm/compilation/aot_emit_function.c
  20. 22 51
      core/iwasm/compilation/aot_emit_memory.c
  21. 4 0
      core/iwasm/compilation/aot_emit_memory.h
  22. 29 150
      core/iwasm/compilation/aot_emit_numberic.c
  23. 2 1
      core/iwasm/compilation/aot_emit_parametric.c
  24. 10 2
      core/iwasm/compilation/aot_emit_variable.c
  25. 240 2
      core/iwasm/compilation/aot_llvm.c
  26. 44 0
      core/iwasm/compilation/aot_llvm.h
  27. 381 0
      core/iwasm/compilation/simd/simd_access_lanes.c
  28. 89 0
      core/iwasm/compilation/simd/simd_access_lanes.h
  29. 164 0
      core/iwasm/compilation/simd/simd_bit_shifts.c
  30. 39 0
      core/iwasm/compilation/simd/simd_bit_shifts.h
  31. 109 0
      core/iwasm/compilation/simd/simd_bitmask_extracts.c
  32. 29 0
      core/iwasm/compilation/simd/simd_bitmask_extracts.h
  33. 146 0
      core/iwasm/compilation/simd/simd_bitwise_ops.c
  34. 24 0
      core/iwasm/compilation/simd/simd_bitwise_ops.h
  35. 183 0
      core/iwasm/compilation/simd/simd_bool_reductions.c
  36. 43 0
      core/iwasm/compilation/simd/simd_bool_reductions.h
  37. 47 0
      core/iwasm/compilation/simd/simd_common.c
  38. 23 0
      core/iwasm/compilation/simd/simd_common.h
  39. 231 0
      core/iwasm/compilation/simd/simd_comparisons.c
  40. 44 0
      core/iwasm/compilation/simd/simd_comparisons.h
  41. 190 0
      core/iwasm/compilation/simd/simd_construct_values.c
  42. 29 0
      core/iwasm/compilation/simd/simd_construct_values.h
  43. 422 0
      core/iwasm/compilation/simd/simd_conversions.c
  44. 51 0
      core/iwasm/compilation/simd/simd_conversions.h
  45. 273 0
      core/iwasm/compilation/simd/simd_floating_point.c
  46. 49 0
      core/iwasm/compilation/simd/simd_floating_point.h
  47. 207 0
      core/iwasm/compilation/simd/simd_int_arith.c
  48. 51 0
      core/iwasm/compilation/simd/simd_int_arith.h
  49. 301 0
      core/iwasm/compilation/simd/simd_load_store.c
  50. 45 0
      core/iwasm/compilation/simd/simd_load_store.h
  51. 367 0
      core/iwasm/compilation/simd/simd_sat_int_arith.c
  52. 66 0
      core/iwasm/compilation/simd/simd_sat_int_arith.h
  53. 1 0
      core/iwasm/include/aot_export.h
  54. 21 0
      core/iwasm/interpreter/wasm.h
  55. 674 12
      core/iwasm/interpreter/wasm_loader.c
  56. 215 0
      core/iwasm/interpreter/wasm_opcode.h
  57. 192 0
      core/iwasm/libraries/libc-emcc/libc_emcc_wrapper.c
  58. 4 0
      doc/build_wamr.md
  59. 5 0
      product-mini/platforms/linux/CMakeLists.txt
  60. 5 0
      samples/basic/build.sh
  61. 5 0
      samples/gui/build.sh
  62. 5 0
      samples/littlevgl/build.sh
  63. 5 0
      samples/simple/build.sh
  64. 34 0
      samples/workload/README.md
  65. 4 0
      samples/workload/bwa/.gitignore
  66. 134 0
      samples/workload/bwa/CMakeLists.bwa_wasm.txt
  67. 91 0
      samples/workload/bwa/CMakeLists.txt
  68. 47 0
      samples/workload/bwa/README.md
  69. 100 0
      samples/workload/cmake/toolchain.cmake
  70. 1 0
      samples/workload/docker/.gitignore
  71. 77 0
      samples/workload/docker/Dockerfile
  72. 48 0
      samples/workload/docker/build.sh
  73. 10 0
      samples/workload/docker/run.sh
  74. 2 0
      samples/workload/meshoptimizer/.gitignore
  75. 39 0
      samples/workload/meshoptimizer/CMakeLists.txt
  76. 59 0
      samples/workload/meshoptimizer/README.md
  77. 47 0
      samples/workload/meshoptimizer/codecbench.patch
  78. 25 7
      samples/workload/tensorflow/build.sh
  79. 9 3
      samples/workload/tensorflow/tf_lite.patch
  80. 22 0
      samples/workload/wasm-av1/README.md
  81. 100 0
      samples/workload/wasm-av1/build.sh
  82. 696 0
      samples/workload/wasm-av1/wasm-av1.patch
  83. 1 0
      wamr-compiler/CMakeLists.txt
  84. 7 2
      wamr-compiler/main.c

+ 1 - 0
README.md

@@ -36,6 +36,7 @@ iwasm VM core
 - [Multi-value](https://github.com/WebAssembly/multi-value)
 - [wasm-c-api](https://github.com/WebAssembly/wasm-c-api)
 - [Tail-call](https://github.com/WebAssembly/tail-call)
+- [128-bit SIMD](https://github.com/WebAssembly/simd)
 
 ### Supported architectures and platforms
 

+ 4 - 0
build-scripts/config_common.cmake

@@ -165,6 +165,10 @@ if (WAMR_DISABLE_HW_BOUND_CHECK EQUAL 1)
   add_definitions (-DWASM_DISABLE_HW_BOUND_CHECK=1)
   message ("     Hardware boundary check disabled")
 endif ()
+if (WAMR_BUILD_SIMD EQUAL 1)
+  add_definitions (-DWASM_ENABLE_SIMD=1)
+  message ("     SIMD enabled")
+endif ()
 if (WAMR_BUILD_MEMORY_PROFILING EQUAL 1)
   add_definitions (-DWASM_ENABLE_MEMORY_PROFILING=1)
   message ("     Memory profiling enabled")

+ 5 - 0
core/config.h

@@ -165,6 +165,11 @@
 #define WASM_DISABLE_HW_BOUND_CHECK 0
 #endif
 
+/* Disable SIMD unless it is manually enabled somewhere */
+#ifndef WASM_ENABLE_SIMD
+#define WASM_ENABLE_SIMD 0
+#endif
+
 /* Memory profiling */
 #ifndef WASM_ENABLE_MEMORY_PROFILING
 #define WASM_ENABLE_MEMORY_PROFILING 0

+ 1 - 1
core/deps/download.sh

@@ -13,7 +13,7 @@ if [ ! -d "lvgl" ]; then
 fi
 if [ ! -d "lv_drivers" ]; then
         echo "git pull lv_drivers..."
-        git clone https://github.com/littlevgl/lv_drivers.git
+        git clone https://github.com/littlevgl/lv_drivers.git --branch v6.0.1
         [ $? -eq 0 ] || exit $?
 fi
 

+ 19 - 3
core/iwasm/aot/aot_loader.c

@@ -66,6 +66,11 @@ exchange_uint32(uint8 *p_data)
 static void
 exchange_uint64(uint8 *pData)
 {
+    uint8 value;
+    uint32 i;
+
+    /* Swap the two 4-byte halves one byte at a time: pData is a plain
+       byte pointer, so it must not be dereferenced as uint32 (no
+       alignment guarantee, and it would break strict aliasing). */
+    for (i = 0; i < 4; i++) {
+        value = pData[i];
+        pData[i] = pData[i + 4];
+        pData[i + 4] = value;
+    }
+
+    /* Then let exchange_uint32 process each half in place (presumably a
+       4-byte reversal, so that all 8 bytes end up reversed - confirm
+       against its definition). */
     exchange_uint32(pData);
     exchange_uint32(pData + 4);
 }
@@ -801,14 +806,22 @@ load_globals(const uint8 **p_buf, const uint8 *buf_end,
     /* Create each global */
     for (i = 0; i < module->global_count; i++) {
         uint16 init_expr_type;
-        uint64 init_expr_value;
 
         read_uint8(buf, buf_end, globals[i].type);
         read_uint8(buf, buf_end, globals[i].is_mutable);
         read_uint16(buf, buf_end, init_expr_type);
-        read_uint64(buf, buf_end, init_expr_value);
+
+        if (init_expr_type != INIT_EXPR_TYPE_V128_CONST) {
+            read_uint64(buf, buf_end, globals[i].init_expr.u.i64);
+        }
+        else {
+            uint64 *i64x2 = (uint64 *)globals[i].init_expr.u.v128.i64x2;
+            CHECK_BUF(buf, buf_end, sizeof(uint64) * 2);
+            wasm_runtime_read_v128(buf, &i64x2[0], &i64x2[1]);
+            buf += sizeof(uint64) * 2;
+        }
+
         globals[i].init_expr.init_expr_type = (uint8)init_expr_type;
-        globals[i].init_expr.u.i64 = (int64)init_expr_value;
 
         globals[i].size = wasm_value_type_size(globals[i].type);
         globals[i].data_offset = data_offset;
@@ -2101,6 +2114,9 @@ aot_convert_wasm_module(WASMModule *wasm_module,
 #endif
 #if WASM_ENABLE_TAIL_CALL != 0
     option.enable_tail_call = true;
+#endif
+#if WASM_ENABLE_SIMD != 0
+    option.enable_simd = true;
 #endif
     comp_ctx = aot_create_comp_context(comp_data, &option);
     if (!comp_ctx) {

+ 6 - 0
core/iwasm/aot/aot_runtime.c

@@ -385,12 +385,14 @@ memory_instantiate(AOTModuleInstance *module_inst, AOTModule *module,
            memory_inst->mem_bound_check_2bytes.u64 = total_size - 2;
            memory_inst->mem_bound_check_4bytes.u64 = total_size - 4;
            memory_inst->mem_bound_check_8bytes.u64 = total_size - 8;
+           memory_inst->mem_bound_check_16bytes.u64 = total_size - 16;
        }
        else {
            memory_inst->mem_bound_check_1byte.u32[0] = (uint32)total_size - 1;
            memory_inst->mem_bound_check_2bytes.u32[0] = (uint32)total_size - 2;
            memory_inst->mem_bound_check_4bytes.u32[0] = (uint32)total_size - 4;
            memory_inst->mem_bound_check_8bytes.u32[0] = (uint32)total_size - 8;
+           memory_inst->mem_bound_check_16bytes.u32[0] = (uint32)total_size - 16;
        }
     }
 
@@ -1545,12 +1547,14 @@ aot_enlarge_memory(AOTModuleInstance *module_inst, uint32 inc_page_count)
         memory_inst->mem_bound_check_2bytes.u64 = total_size - 2;
         memory_inst->mem_bound_check_4bytes.u64 = total_size - 4;
         memory_inst->mem_bound_check_8bytes.u64 = total_size - 8;
+        memory_inst->mem_bound_check_16bytes.u64 = total_size - 16;
     }
     else {
         memory_inst->mem_bound_check_1byte.u32[0] = (uint32)total_size - 1;
         memory_inst->mem_bound_check_2bytes.u32[0] = (uint32)total_size - 2;
         memory_inst->mem_bound_check_4bytes.u32[0] = (uint32)total_size - 4;
         memory_inst->mem_bound_check_8bytes.u32[0] = (uint32)total_size - 8;
+        memory_inst->mem_bound_check_16bytes.u32[0] = (uint32)total_size - 16;
     }
     return true;
 }
@@ -1593,12 +1597,14 @@ aot_enlarge_memory(AOTModuleInstance *module_inst, uint32 inc_page_count)
         memory_inst->mem_bound_check_2bytes.u64 = total_size - 2;
         memory_inst->mem_bound_check_4bytes.u64 = total_size - 4;
         memory_inst->mem_bound_check_8bytes.u64 = total_size - 8;
+        memory_inst->mem_bound_check_16bytes.u64 = total_size - 16;
     }
     else {
         memory_inst->mem_bound_check_1byte.u32[0] = (uint32)total_size - 1;
         memory_inst->mem_bound_check_2bytes.u32[0] = (uint32)total_size - 2;
         memory_inst->mem_bound_check_4bytes.u32[0] = (uint32)total_size - 4;
         memory_inst->mem_bound_check_8bytes.u32[0] = (uint32)total_size - 8;
+        memory_inst->mem_bound_check_16bytes.u32[0] = (uint32)total_size - 16;
     }
     return true;
 }

+ 1 - 0
core/iwasm/aot/aot_runtime.h

@@ -234,6 +234,7 @@ typedef struct AOTMemoryInstance {
     MemBound mem_bound_check_2bytes;
     MemBound mem_bound_check_4bytes;
     MemBound mem_bound_check_8bytes;
+    MemBound mem_bound_check_16bytes;
 } AOTMemoryInstance;
 
 typedef struct AOTModuleInstance {

+ 64 - 0
core/iwasm/common/arch/invokeNative_em64_simd.s

@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2019 Intel Corporation.  All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+/*
+ * invokeNative(func_ptr, argv, n_stacks)
+ *
+ * Calls func_ptr with arguments taken from the argv buffer:
+ *   argv + 0x00 .. 0x7f : eight 16-byte slots, loaded into xmm0 - xmm7
+ *   argv + 0x80 .. 0xaf : six 8-byte slots, loaded into
+ *                         rdi, rsi, rdx, rcx, r8, r9
+ *   argv + 0xb0 ..      : n_stacks 8-byte slots, pushed onto the stack
+ *                         (last slot first)
+ * Nothing but leave/ret runs after the call, so the callee's return
+ * registers reach the caller unchanged.
+ */
+    .text
+    .align 2
+#ifndef BH_PLATFORM_DARWIN
+.globl invokeNative
+    .type    invokeNative, @function
+invokeNative:
+#else
+.globl _invokeNative
+_invokeNative:
+#endif /* end of BH_PLATFORM_DARWIN */
+    /*  rdi - function ptr */
+    /*  rsi - argv */
+    /*  rdx - n_stacks */
+
+    push %rbp
+    mov %rsp, %rbp
+
+    mov %rdx, %r10
+    mov %rsp, %r11      /* Check that stack is aligned on */
+    and $8, %r11        /* 16 bytes. This code may be removed */
+    je check_stack_succ /* when we are sure that compiler always */
+    int3                /* calls us with aligned stack */
+check_stack_succ:
+    mov %r10, %r11      /* Align stack on 16 bytes before pushing */
+    and $1, %r11        /* stack arguments in case we have an odd */
+    shl $3, %r11        /* number of stack arguments */
+    sub %r11, %rsp
+    /* store memory args */
+    movq %rdi, %r11     /* func ptr */
+    movq %r10, %rcx     /* counter */
+    /* r10 = address of the last stack slot:
+       argv + 128 (xmm area) + 48 (int area) + (n_stacks - 1) * 8 */
+    lea 128+48-8(%rsi,%rcx,8), %r10
+    /* keep it as an offset from rsp; every push lowers rsp by 8, so
+       0(%rsp,%r10) steps back one slot per iteration */
+    sub %rsp, %r10
+    cmpq $0, %rcx
+    je push_args_end
+push_args:
+    push 0(%rsp,%r10)
+    loop push_args
+push_args_end:
+    /* fill all fp args */
+    /* Use unaligned loads: the caller passes a uint64 buffer, which is
+       not guaranteed to be 16-byte aligned, and movdqa would fault on
+       a misaligned address */
+    movdqu 0x00(%rsi), %xmm0
+    movdqu 0x10(%rsi), %xmm1
+    movdqu 0x20(%rsi), %xmm2
+    movdqu 0x30(%rsi), %xmm3
+    movdqu 0x40(%rsi), %xmm4
+    movdqu 0x50(%rsi), %xmm5
+    movdqu 0x60(%rsi), %xmm6
+    movdqu 0x70(%rsi), %xmm7
+
+    /* fill all int args */
+    movq 0x80(%rsi), %rdi
+    movq 0x90(%rsi), %rdx
+    movq 0x98(%rsi), %rcx
+    movq 0xa0(%rsi), %r8
+    movq 0xa8(%rsi), %r9
+    movq 0x88(%rsi), %rsi   /* loaded last: rsi is the argv base until here */
+
+    call *%r11
+    leave
+    ret
+

+ 11 - 3
core/iwasm/common/iwasm_common.cmake

@@ -11,10 +11,18 @@ add_definitions(-DBH_FREE=wasm_runtime_free)
 file (GLOB c_source_all ${IWASM_COMMON_DIR}/*.c)
 
 if (WAMR_BUILD_TARGET STREQUAL "X86_64" OR WAMR_BUILD_TARGET STREQUAL "AMD_64")
-  if (WAMR_BUILD_PLATFORM STREQUAL "windows")
-    set (source_all ${c_source_all} ${IWASM_COMMON_DIR}/arch/invokeNative_em64.asm)
+  if (NOT WAMR_BUILD_SIMD EQUAL 1)
+    if (WAMR_BUILD_PLATFORM STREQUAL "windows")
+      set (source_all ${c_source_all} ${IWASM_COMMON_DIR}/arch/invokeNative_em64.asm)
+    else ()
+      set (source_all ${c_source_all} ${IWASM_COMMON_DIR}/arch/invokeNative_em64.s)
+    endif ()
   else ()
-    set (source_all ${c_source_all} ${IWASM_COMMON_DIR}/arch/invokeNative_em64.s)
+    if (WAMR_BUILD_PLATFORM STREQUAL "windows")
+      message(FATAL_ERROR "need an implementation of SIMD on windows")
+    else()
+      set (source_all ${c_source_all} ${IWASM_COMMON_DIR}/arch/invokeNative_em64_simd.s)
+    endif()
   endif ()
 elseif (WAMR_BUILD_TARGET STREQUAL "X86_32")
   if (WAMR_BUILD_PLATFORM STREQUAL "windows")

+ 153 - 8
core/iwasm/common/wasm_runtime_common.c

@@ -2457,6 +2457,23 @@ wasm_application_execute_func(WASMModuleInstanceCommon *module_inst,
                 argv1[p++] = u.parts[1];
                 break;
             }
+#if WASM_ENABLE_SIMD != 0
+            case VALUE_TYPE_V128:
+            {
+                /* expected format: 0x123\0x234 or 123\234 */
+                /* retrieve first i64 */
+                *(uint64*)(argv1 + p) = strtoull(argv[i], &endptr, 0);
+                /* skip \ */
+                endptr++;
+                /* retrieve second i64 */
+                *(uint64*)(argv1 + p + 2) = strtoull(endptr, &endptr, 0);
+                p += 4;
+                break;
+            }
+#endif /* WASM_ENABLE_SIMD != 0 */
+            default:
+                bh_assert(0);
+                break;
         }
         if (endptr && *endptr != '\0' && *endptr != '_') {
             snprintf(buf, sizeof(buf), "invalid input argument %d: %s",
@@ -2477,9 +2494,11 @@ wasm_application_execute_func(WASMModuleInstanceCommon *module_inst,
     for (j = 0; j < type->result_count; j++) {
         switch (type->types[type->param_count + j]) {
             case VALUE_TYPE_I32:
+            {
                 os_printf("0x%x:i32", argv1[k]);
                 k++;
                 break;
+            }
             case VALUE_TYPE_I64:
             {
                 union { uint64 val; uint32 parts[2]; } u;
@@ -2511,6 +2530,27 @@ wasm_application_execute_func(WASMModuleInstanceCommon *module_inst,
                 os_printf("%.7g:f64", u.val);
                 break;
             }
+#if WASM_ENABLE_SIMD != 0
+            case VALUE_TYPE_V128:
+            {
+                uint64 *v = (uint64*)(argv1 + k);
+#if defined(PRIx64)
+                os_printf("<0x%016"PRIx64" 0x%016"PRIx64">:v128", *v, *(v + 1));
+#else
+                if (4 == sizeof(long)) {
+                    os_printf("<0x%016llx 0x%016llx>:v128", *v, *(v + 1));
+                }
+                else {
+                    os_printf("<0x%016lx 0x%016lx>:v128", *v, *(v + 1));
+                }
+#endif /* PRIx64 */
+                k += 4;
+                break;
+            }
+#endif /*  WASM_ENABLE_SIMD != 0 */
+            default:
+                bh_assert(0);
+                break;
         }
         if (j < (uint32)(type->result_count - 1))
             os_printf(",");
@@ -3067,12 +3107,31 @@ fail:
 #if defined(BUILD_TARGET_X86_64) \
    || defined(BUILD_TARGET_AMD_64) \
    || defined(BUILD_TARGET_AARCH64)
+
+#if WASM_ENABLE_SIMD != 0
+#ifdef v128
+#undef v128
+#endif
+
+#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)
+#include <emmintrin.h>
+/* unaligned */
+#define v128 __m128i_u
+#else
+#warning "Include header files for v128 to support SIMD feature"
+#endif
+
+#ifndef v128
+#error "v128 type isn't defined"
+#endif
+#endif /* end of WASM_ENABLE_SIMD != 0 */
+
 typedef void (*GenericFunctionPointer)();
 int64 invokeNative(GenericFunctionPointer f, uint64 *args, uint64 n_stacks);
 
 typedef float64 (*Float64FuncPtr)(GenericFunctionPointer, uint64*, uint64);
 typedef float32 (*Float32FuncPtr)(GenericFunctionPointer, uint64*, uint64);
-typedef int64 (*Int64FuncPtr)(GenericFunctionPointer, uint64*,uint64);
+typedef int64 (*Int64FuncPtr)(GenericFunctionPointer, uint64*, uint64);
 typedef int32 (*Int32FuncPtr)(GenericFunctionPointer, uint64*, uint64);
 typedef void (*VoidFuncPtr)(GenericFunctionPointer, uint64*, uint64);
 
@@ -3082,10 +3141,15 @@ static Int64FuncPtr invokeNative_Int64 = (Int64FuncPtr)(uintptr_t)invokeNative;
 static Int32FuncPtr invokeNative_Int32 = (Int32FuncPtr)(uintptr_t)invokeNative;
 static VoidFuncPtr invokeNative_Void = (VoidFuncPtr)(uintptr_t)invokeNative;
 
+#if WASM_ENABLE_SIMD != 0
+typedef v128 (*V128FuncPtr)(GenericFunctionPointer, uint64*, uint64);
+static V128FuncPtr invokeNative_V128 = (V128FuncPtr)(uintptr_t)invokeNative;
+#endif
+
 #if defined(_WIN32) || defined(_WIN32_)
 #define MAX_REG_FLOATS  4
 #define MAX_REG_INTS  4
-#else
+#else /* else of defined(_WIN32) || defined(_WIN32_) */
 #define MAX_REG_FLOATS  8
 #if defined(BUILD_TARGET_AARCH64)
 #define MAX_REG_INTS  8
@@ -3101,12 +3165,17 @@ wasm_runtime_invoke_native(WASMExecEnv *exec_env, void *func_ptr,
                            uint32 *argv, uint32 argc, uint32 *argv_ret)
 {
     WASMModuleInstanceCommon *module = wasm_runtime_get_module_inst(exec_env);
-    uint64 argv_buf[32], *argv1 = argv_buf, *fps, *ints, *stacks, size, arg_i64;
+    uint64 argv_buf[32], *argv1 = argv_buf, *ints, *stacks, size, arg_i64;
     uint32 *argv_src = argv, i, argc1, n_ints = 0, n_stacks = 0;
     uint32 arg_i32, ptr_len;
     uint32 result_count = func_type->result_count;
     uint32 ext_ret_count = result_count > 1 ? result_count - 1 : 0;
     bool ret = false;
+#if WASM_ENABLE_SIMD == 0
+    uint64 *fps;
+#else
+    v128 *fps;
+#endif
 
 #if defined(_WIN32) || defined(_WIN32_)
     /* important difference in calling conventions */
@@ -3115,7 +3184,13 @@ wasm_runtime_invoke_native(WASMExecEnv *exec_env, void *func_ptr,
     int n_fps = 0;
 #endif
 
-    argc1 = 1 + MAX_REG_FLOATS + (uint32)func_type->param_count + ext_ret_count;
+#if WASM_ENABLE_SIMD == 0
+    argc1 = 1 + MAX_REG_FLOATS + (uint32)func_type->param_count
+              + ext_ret_count;
+#else
+    argc1 = 1 + MAX_REG_FLOATS * 2 + (uint32)func_type->param_count * 2
+              + ext_ret_count;
+#endif
     if (argc1 > sizeof(argv_buf) / sizeof(uint64)) {
         size = sizeof(uint64) * (uint64)argc1;
         if (!(argv1 = runtime_malloc((uint32)size, exec_env->module_inst,
@@ -3124,8 +3199,13 @@ wasm_runtime_invoke_native(WASMExecEnv *exec_env, void *func_ptr,
         }
     }
 
+#if WASM_ENABLE_SIMD == 0
     fps = argv1;
     ints = fps + MAX_REG_FLOATS;
+#else
+    fps = (v128 *)argv1;
+    ints = (uint64 *)(fps + MAX_REG_FLOATS);
+#endif
     stacks = ints + MAX_REG_INTS;
 
     ints[n_ints++] = (uint64)(uintptr_t)exec_env;
@@ -3175,18 +3255,34 @@ wasm_runtime_invoke_native(WASMExecEnv *exec_env, void *func_ptr,
                 argv_src += 2;
                 break;
             case VALUE_TYPE_F32:
-                if (n_fps < MAX_REG_FLOATS)
+                if (n_fps < MAX_REG_FLOATS) {
                     *(float32*)&fps[n_fps++] = *(float32*)argv_src++;
-                else
+                }
+                else {
                     *(float32*)&stacks[n_stacks++] = *(float32*)argv_src++;
+                }
                 break;
             case VALUE_TYPE_F64:
-                if (n_fps < MAX_REG_FLOATS)
+                if (n_fps < MAX_REG_FLOATS) {
                     *(float64*)&fps[n_fps++] = *(float64*)argv_src;
-                else
+                }
+                else {
                     *(float64*)&stacks[n_stacks++] = *(float64*)argv_src;
+                }
                 argv_src += 2;
                 break;
+#if WASM_ENABLE_SIMD != 0
+            case VALUE_TYPE_V128:
+                if (n_fps < MAX_REG_FLOATS) {
+                    *(v128*)&fps[n_fps++] = *(v128*)argv_src;
+                }
+                else {
+                    *(v128*)&stacks[n_stacks++] = *(v128*)argv_src;
+                    n_stacks++;
+                }
+                argv_src += 4;
+                break;
+#endif
             default:
                 bh_assert(0);
                 break;
@@ -3221,6 +3317,11 @@ wasm_runtime_invoke_native(WASMExecEnv *exec_env, void *func_ptr,
             case VALUE_TYPE_F64:
                 PUT_F64_TO_ADDR(argv_ret, invokeNative_Float64(func_ptr, argv1, n_stacks));
                 break;
+#if WASM_ENABLE_SIMD != 0
+            case VALUE_TYPE_V128:
+                *(v128*)argv_ret = invokeNative_V128(func_ptr, argv1, n_stacks);
+                break;
+#endif
             default:
                 bh_assert(0);
                 break;
@@ -3268,6 +3369,50 @@ wasm_runtime_call_indirect(WASMExecEnv *exec_env,
     return false;
 }
 
+/* Reverse the order of the four bytes at p_data, in place. */
+static void
+exchange_uint32(uint8 *p_data)
+{
+    uint8 outer = p_data[0];
+    uint8 inner = p_data[1];
+
+    /* outer pair: bytes 0 and 3; inner pair: bytes 1 and 2 */
+    p_data[0] = p_data[3];
+    p_data[3] = outer;
+    p_data[1] = p_data[2];
+    p_data[2] = inner;
+}
+
+/*
+ * Reverse the order of the eight bytes at p_data, in place.
+ *
+ * The two 4-byte halves are swapped first, then each half is reversed
+ * by exchange_uint32.
+ */
+static void
+exchange_uint64(uint8 *p_data)
+{
+    uint8 value;
+    uint32 i;
+
+    /* Swap the halves one byte at a time: callers pass a byte pointer
+       to a uint64 object, so reading it through a uint32 lvalue would
+       break strict aliasing and assume alignment. */
+    for (i = 0; i < 4; i++) {
+        value = p_data[i];
+        p_data[i] = p_data[i + 4];
+        p_data[i + 4] = value;
+    }
+
+    exchange_uint32(p_data);
+    exchange_uint32(p_data + 4);
+}
+
+/*
+ * Read 16 bytes from `bytes` as two 64-bit values.
+ *
+ * On a little-endian host, *ret1 receives bytes 0..7 and *ret2 bytes
+ * 8..15, unchanged. Otherwise both values are byte-reversed and stored
+ * in swapped order (*ret1 from bytes 8..15, *ret2 from bytes 0..7).
+ */
+void
+wasm_runtime_read_v128(const uint8 *bytes, uint64 *ret1, uint64 *ret2)
+{
+    uint64 first, second;
+
+    bh_memcpy_s(&first, 8, bytes, 8);
+    bh_memcpy_s(&second, 8, bytes + 8, 8);
+
+    if (is_little_endian()) {
+        *ret1 = first;
+        *ret2 = second;
+        return;
+    }
+
+    exchange_uint64((uint8*)&first);
+    exchange_uint64((uint8*)&second);
+    *ret1 = second;
+    *ret2 = first;
+}
+
 #if WASM_ENABLE_THREAD_MGR != 0
 typedef struct WASMThreadArg {
     WASMExecEnv *new_exec_env;

+ 3 - 0
core/iwasm/common/wasm_runtime_common.h

@@ -464,6 +464,9 @@ wasm_runtime_invoke_native_raw(WASMExecEnv *exec_env, void *func_ptr,
                                void *attachment,
                                uint32 *argv, uint32 argc, uint32 *ret);
 
+void
+wasm_runtime_read_v128(const uint8 *bytes, uint64 *ret1, uint64 *ret2);
+
 void
 wasm_runtime_dump_module_mem_consumption(const WASMModuleCommon *module);
 

+ 9 - 0
core/iwasm/compilation/aot.c

@@ -14,6 +14,15 @@ aot_get_last_error()
   return aot_error[0] == '\0' ? "" : aot_error;
 }
 
+/*
+ * printf-style variant of aot_set_last_error: formats the message into
+ * aot_error, bounded by sizeof(aot_error).
+ */
+void
+aot_set_last_error_v(const char *format, ...)
+{
+    va_list ap;
+
+    va_start(ap, format);
+    vsnprintf(aot_error, sizeof(aot_error), format, ap);
+    va_end(ap);
+}
+
 void
 aot_set_last_error(const char *error)
 {

+ 14 - 0
core/iwasm/compilation/aot.h

@@ -230,6 +230,20 @@ aot_get_last_error();
 void
 aot_set_last_error(const char *error);
 
+void
+aot_set_last_error_v(const char *format, ...);
+
+#if BH_DEBUG == 1
+#define HANDLE_FAILURE(callee) do {                          \
+    aot_set_last_error_v("call %s failed in %s:%d", (callee),\
+                         __FUNCTION__, __LINE__);            \
+  } while (0)
+#else
+#define HANDLE_FAILURE(callee) do {                          \
+    aot_set_last_error_v("call %s failed", (callee));        \
+  } while (0)
+#endif
+
 #ifdef __cplusplus
 } /* end of extern "C" */
 #endif

+ 735 - 6
core/iwasm/compilation/aot_compiler.c

@@ -14,6 +14,18 @@
 #include "aot_emit_control.h"
 #include "aot_emit_function.h"
 #include "aot_emit_parametric.h"
+#include "simd/simd_access_lanes.h"
+#include "simd/simd_bitmask_extracts.h"
+#include "simd/simd_bit_shifts.h"
+#include "simd/simd_bitwise_ops.h"
+#include "simd/simd_bool_reductions.h"
+#include "simd/simd_comparisons.h"
+#include "simd/simd_construct_values.h"
+#include "simd/simd_conversions.h"
+#include "simd/simd_floating_point.h"
+#include "simd/simd_int_arith.h"
+#include "simd/simd_load_store.h"
+#include "simd/simd_sat_int_arith.h"
 #include "../aot/aot_runtime.h"
 #include "../interpreter/wasm_opcode.h"
 #include <errno.h>
@@ -163,6 +175,7 @@ aot_compile_func(AOTCompContext *comp_ctx, uint32 func_index)
             || value_type == VALUE_TYPE_I64
             || value_type == VALUE_TYPE_F32
             || value_type == VALUE_TYPE_F64
+            || value_type == VALUE_TYPE_V128
             || value_type == VALUE_TYPE_VOID) {
           param_count = 0;
           param_types = NULL;
@@ -280,12 +293,12 @@ aot_compile_func(AOTCompContext *comp_ctx, uint32 func_index)
 
       case WASM_OP_DROP:
         if (!aot_compile_op_drop(comp_ctx, func_ctx, true))
-            return false;
+          return false;
         break;
 
       case WASM_OP_DROP_64:
         if (!aot_compile_op_drop(comp_ctx, func_ctx, false))
-            return false;
+          return false;
         break;
 
       case WASM_OP_SELECT:
@@ -761,22 +774,22 @@ aot_compile_func(AOTCompContext *comp_ctx, uint32 func_index)
 
       case WASM_OP_I32_REINTERPRET_F32:
         if (!aot_compile_op_i32_reinterpret_f32(comp_ctx, func_ctx))
-            return false;
+          return false;
         break;
 
       case WASM_OP_I64_REINTERPRET_F64:
         if (!aot_compile_op_i64_reinterpret_f64(comp_ctx, func_ctx))
-            return false;
+          return false;
         break;
 
       case WASM_OP_F32_REINTERPRET_I32:
         if (!aot_compile_op_f32_reinterpret_i32(comp_ctx, func_ctx))
-            return false;
+          return false;
         break;
 
       case WASM_OP_F64_REINTERPRET_I64:
         if (!aot_compile_op_f64_reinterpret_i64(comp_ctx, func_ctx))
-            return false;
+          return false;
         break;
 
       case WASM_OP_I32_EXTEND8_S:
@@ -1019,6 +1032,722 @@ build_atomic_rmw:
       }
 #endif /* end of WASM_ENABLE_SHARED_MEMORY */
 
+#if WASM_ENABLE_SIMD != 0
+      case WASM_OP_SIMD_PREFIX:
+      {
+        if (!comp_ctx->enable_simd) {
+            aot_set_last_error(
+              "current building does not support SIMD instructions");
+            return false;
+        }
+
+        opcode = *frame_ip++;
+        switch (opcode) {
+          case SIMD_v128_load:
+          {
+            read_leb_uint32(frame_ip, frame_ip_end, align);
+            read_leb_uint32(frame_ip, frame_ip_end, offset);
+            if (!aot_compile_simd_v128_load(comp_ctx, func_ctx, align, offset))
+              return false;
+            break;
+          }
+
+          case SIMD_i16x8_load8x8_s:
+          case SIMD_i16x8_load8x8_u:
+          case SIMD_i32x4_load16x4_s:
+          case SIMD_i32x4_load16x4_u:
+          case SIMD_i64x2_load32x2_s:
+          case SIMD_i64x2_load32x2_u:
+          {
+            read_leb_uint32(frame_ip, frame_ip_end, align);
+            read_leb_uint32(frame_ip, frame_ip_end, offset);
+            if (!aot_compile_simd_load_extend(comp_ctx, func_ctx,
+                                              opcode, align, offset))
+              return false;
+            break;
+          }
+
+          case SIMD_v8x16_load_splat:
+          case SIMD_v16x8_load_splat:
+          case SIMD_v32x4_load_splat:
+          case SIMD_v64x2_load_splat:
+          {
+            read_leb_uint32(frame_ip, frame_ip_end, align);
+            read_leb_uint32(frame_ip, frame_ip_end, offset);
+            if (!aot_compile_simd_load_splat(comp_ctx, func_ctx,
+                                             opcode, align, offset))
+              return false;
+            break;
+          }
+
+          case SIMD_v128_store:
+          {
+            read_leb_uint32(frame_ip, frame_ip_end, align);
+            read_leb_uint32(frame_ip, frame_ip_end, offset);
+            if (!aot_compile_simd_v128_store(comp_ctx, func_ctx, align, offset))
+              return false;
+            break;
+          }
+
+          case SIMD_v128_const:
+          {
+            if (!aot_compile_simd_v128_const(comp_ctx, func_ctx, frame_ip))
+              return false;
+            frame_ip += 16;
+            break;
+          }
+
+          case SIMD_v8x16_shuffle:
+          {
+            if (!aot_compile_simd_shuffle(comp_ctx, func_ctx, frame_ip))
+              return false;
+            frame_ip += 16;
+            break;
+          }
+
+          case SIMD_v8x16_swizzle:
+          {
+            if (!aot_compile_simd_swizzle(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+
+          case SIMD_i8x16_splat:
+          case SIMD_i16x8_splat:
+          case SIMD_i32x4_splat:
+          case SIMD_i64x2_splat:
+          case SIMD_f32x4_splat:
+          case SIMD_f64x2_splat:
+          {
+            if (!aot_compile_simd_splat(comp_ctx, func_ctx, opcode))
+              return false;
+            break;
+          }
+
+          case SIMD_i8x16_extract_lane_s:
+          {
+            if (!aot_compile_simd_extract_i8x16(comp_ctx, func_ctx, *frame_ip++,
+                                                true))
+              return false;
+            break;
+          }
+          case SIMD_i8x16_extract_lane_u:
+          {
+            if (!aot_compile_simd_extract_i8x16(comp_ctx, func_ctx, *frame_ip++,
+                                                false))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_extract_lane_s:
+          {
+            if (!aot_compile_simd_extract_i16x8(comp_ctx, func_ctx, *frame_ip++,
+                                                true))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_extract_lane_u:
+          {
+            if (!aot_compile_simd_extract_i16x8(comp_ctx, func_ctx, *frame_ip++,
+                                                false))
+              return false;
+            break;
+          }
+          case SIMD_i32x4_extract_lane:
+          {
+            if (!aot_compile_simd_extract_i32x4(comp_ctx, func_ctx, *frame_ip++))
+              return false;
+            break;
+          }
+          case SIMD_i64x2_extract_lane:
+          {
+            if (!aot_compile_simd_extract_i64x2(comp_ctx, func_ctx, *frame_ip++))
+              return false;
+            break;
+          }
+          case SIMD_f32x4_extract_lane:
+          {
+            if (!aot_compile_simd_extract_f32x4(comp_ctx, func_ctx, *frame_ip++))
+              return false;
+            break;
+          }
+          case SIMD_f64x2_extract_lane:
+          {
+            if (!aot_compile_simd_extract_f64x2(comp_ctx, func_ctx, *frame_ip++))
+              return false;
+            break;
+          }
+
+          case SIMD_i8x16_replace_lane:
+          {
+            if (!aot_compile_simd_replace_i8x16(comp_ctx, func_ctx, *frame_ip++))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_replace_lane:
+          {
+            if (!aot_compile_simd_replace_i16x8(comp_ctx, func_ctx, *frame_ip++))
+              return false;
+            break;
+          }
+          case SIMD_i32x4_replace_lane:
+          {
+            if (!aot_compile_simd_replace_i32x4(comp_ctx, func_ctx, *frame_ip++))
+              return false;
+            break;
+          }
+          case SIMD_i64x2_replace_lane:
+          {
+            if (!aot_compile_simd_replace_i64x2(comp_ctx, func_ctx, *frame_ip++))
+              return false;
+            break;
+          }
+          case SIMD_f32x4_replace_lane:
+          {
+            if (!aot_compile_simd_replace_f32x4(comp_ctx, func_ctx, *frame_ip++))
+              return false;
+            break;
+          }
+          case SIMD_f64x2_replace_lane:
+          {
+            if (!aot_compile_simd_replace_f64x2(comp_ctx, func_ctx, *frame_ip++))
+              return false;
+            break;
+          }
+
+          case SIMD_i8x16_eq:
+          case SIMD_i8x16_ne:
+          case SIMD_i8x16_lt_s:
+          case SIMD_i8x16_lt_u:
+          case SIMD_i8x16_gt_s:
+          case SIMD_i8x16_gt_u:
+          case SIMD_i8x16_le_s:
+          case SIMD_i8x16_le_u:
+          case SIMD_i8x16_ge_s:
+          case SIMD_i8x16_ge_u:
+          {
+            if (!aot_compile_simd_i8x16_compare(comp_ctx, func_ctx,
+                                                INT_EQ + opcode - SIMD_i8x16_eq))
+              return false;
+            break;
+          }
+
+          case SIMD_i16x8_eq:
+          case SIMD_i16x8_ne:
+          case SIMD_i16x8_lt_s:
+          case SIMD_i16x8_lt_u:
+          case SIMD_i16x8_gt_s:
+          case SIMD_i16x8_gt_u:
+          case SIMD_i16x8_le_s:
+          case SIMD_i16x8_le_u:
+          case SIMD_i16x8_ge_s:
+          case SIMD_i16x8_ge_u:
+          {
+            if (!aot_compile_simd_i16x8_compare(comp_ctx, func_ctx,
+                                                INT_EQ + opcode - SIMD_i16x8_eq))
+              return false;
+            break;
+          }
+
+          case SIMD_i32x4_eq:
+          case SIMD_i32x4_ne:
+          case SIMD_i32x4_lt_s:
+          case SIMD_i32x4_lt_u:
+          case SIMD_i32x4_gt_s:
+          case SIMD_i32x4_gt_u:
+          case SIMD_i32x4_le_s:
+          case SIMD_i32x4_le_u:
+          case SIMD_i32x4_ge_s:
+          case SIMD_i32x4_ge_u:
+          {
+            if (!aot_compile_simd_i32x4_compare(comp_ctx, func_ctx,
+                                                INT_EQ + opcode - SIMD_i32x4_eq))
+              return false;
+            break;
+          }
+
+          case SIMD_f32x4_eq:
+          case SIMD_f32x4_ne:
+          case SIMD_f32x4_lt:
+          case SIMD_f32x4_gt:
+          case SIMD_f32x4_le:
+          case SIMD_f32x4_ge:
+          {
+            if (!aot_compile_simd_f32x4_compare(comp_ctx, func_ctx,
+                                                FLOAT_EQ + opcode - SIMD_f32x4_eq))
+              return false;
+            break;
+          }
+
+          case SIMD_f64x2_eq:
+          case SIMD_f64x2_ne:
+          case SIMD_f64x2_lt:
+          case SIMD_f64x2_gt:
+          case SIMD_f64x2_le:
+          case SIMD_f64x2_ge:
+          {
+            if (!aot_compile_simd_f64x2_compare(comp_ctx, func_ctx,
+                                                FLOAT_EQ + opcode - SIMD_f64x2_eq))
+              return false;
+            break;
+          }
+
+          case SIMD_v128_not:
+          case SIMD_v128_and:
+          case SIMD_v128_andnot:
+          case SIMD_v128_or:
+          case SIMD_v128_xor:
+          case SIMD_v128_bitselect:
+          {
+            if (!aot_compile_simd_v128_bitwise(comp_ctx, func_ctx,
+                                               V128_NOT + opcode - SIMD_v128_not))
+              return false;
+            break;
+          }
+
+          case SIMD_i8x16_add:
+          case SIMD_i8x16_sub:
+          {
+            V128Arithmetic arith_op = (opcode == SIMD_i8x16_add)
+                                      ? V128_ADD : V128_SUB;
+            if (!aot_compile_simd_i8x16_arith(comp_ctx, func_ctx, arith_op))
+              return false;
+            break;
+          }
+
+          case SIMD_i16x8_add:
+          case SIMD_i16x8_sub:
+          case SIMD_i16x8_mul:
+          {
+            V128Arithmetic arith_op = V128_ADD;
+            if (opcode == SIMD_i16x8_sub)
+              arith_op = V128_SUB;
+            else if (opcode == SIMD_i16x8_mul)
+              arith_op = V128_MUL;
+            if (!aot_compile_simd_i16x8_arith(comp_ctx, func_ctx, arith_op))
+              return false;
+            break;
+          }
+
+          case SIMD_i32x4_add:
+          case SIMD_i32x4_sub:
+          case SIMD_i32x4_mul:
+          {
+            V128Arithmetic arith_op = V128_ADD;
+            if (opcode == SIMD_i32x4_sub)
+              arith_op = V128_SUB;
+            else if (opcode == SIMD_i32x4_mul)
+              arith_op = V128_MUL;
+            if (!aot_compile_simd_i32x4_arith(comp_ctx, func_ctx, arith_op))
+              return false;
+            break;
+          }
+
+          case SIMD_i64x2_add:
+          case SIMD_i64x2_sub:
+          case SIMD_i64x2_mul:
+          {
+            V128Arithmetic arith_op = V128_ADD;
+            if (opcode == SIMD_i64x2_sub)
+              arith_op = V128_SUB;
+            else if (opcode == SIMD_i64x2_mul)
+              arith_op = V128_MUL;
+            if (!aot_compile_simd_i64x2_arith(comp_ctx, func_ctx, arith_op))
+              return false;
+            break;
+          }
+
+          case SIMD_i8x16_neg:
+          {
+            if (!aot_compile_simd_i8x16_neg(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_neg:
+          {
+            if (!aot_compile_simd_i16x8_neg(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i32x4_neg:
+          {
+            if (!aot_compile_simd_i32x4_neg(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i64x2_neg:
+          {
+            if (!aot_compile_simd_i64x2_neg(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+
+          case SIMD_i8x16_add_saturate_s:
+          case SIMD_i8x16_add_saturate_u:
+          {
+            if (!aot_compile_simd_i8x16_saturate(comp_ctx, func_ctx, V128_ADD,
+                                                 opcode == SIMD_i8x16_add_saturate_s
+                                                 ? true : false))
+              return false;
+            break;
+          }
+          case SIMD_i8x16_sub_saturate_s:
+          case SIMD_i8x16_sub_saturate_u:
+          {
+            if (!aot_compile_simd_i8x16_saturate(comp_ctx, func_ctx, V128_SUB,
+                                                 opcode == SIMD_i8x16_sub_saturate_s
+                                                 ? true : false))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_add_saturate_s:
+          case SIMD_i16x8_add_saturate_u:
+          {
+            if (!aot_compile_simd_i16x8_saturate(comp_ctx, func_ctx, V128_ADD,
+                                                 opcode == SIMD_i16x8_add_saturate_s
+                                                 ? true : false))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_sub_saturate_s:
+          case SIMD_i16x8_sub_saturate_u:
+          {
+            if (!aot_compile_simd_i16x8_saturate(comp_ctx, func_ctx, V128_SUB,
+                                                 opcode == SIMD_i16x8_sub_saturate_s
+                                                 ? true : false))
+              return false;
+            break;
+          }
+
+          case SIMD_i8x16_min_s:
+          case SIMD_i8x16_min_u:
+          {
+            if (!aot_compile_simd_i8x16_cmp(comp_ctx, func_ctx, V128_MIN,
+                                            opcode == SIMD_i8x16_min_s
+                                            ? true : false))
+              return false;
+            break;
+          }
+          case SIMD_i8x16_max_s:
+          case SIMD_i8x16_max_u:
+          {
+            if (!aot_compile_simd_i8x16_cmp(comp_ctx, func_ctx, V128_MAX,
+                                            opcode == SIMD_i8x16_max_s
+                                            ? true : false))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_min_s:
+          case SIMD_i16x8_min_u:
+          {
+            if (!aot_compile_simd_i16x8_cmp(comp_ctx, func_ctx, V128_MIN,
+                                            opcode == SIMD_i16x8_min_s
+                                            ? true : false))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_max_s:
+          case SIMD_i16x8_max_u:
+          {
+            if (!aot_compile_simd_i16x8_cmp(comp_ctx, func_ctx, V128_MAX,
+                                            opcode == SIMD_i16x8_max_s
+                                            ? true : false))
+              return false;
+            break;
+          }
+          case SIMD_i32x4_min_s:
+          case SIMD_i32x4_min_u:
+          {
+            if (!aot_compile_simd_i32x4_cmp(comp_ctx, func_ctx, V128_MIN,
+                                            opcode == SIMD_i32x4_min_s
+                                            ? true : false))
+              return false;
+            break;
+          }
+          case SIMD_i32x4_max_s:
+          case SIMD_i32x4_max_u:
+          {
+            if (!aot_compile_simd_i32x4_cmp(comp_ctx, func_ctx, V128_MAX,
+                                            opcode == SIMD_i32x4_max_s
+                                            ? true : false))
+              return false;
+            break;
+          }
+
+          case SIMD_i8x16_abs:
+          {
+            if (!aot_compile_simd_i8x16_abs(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_abs:
+          {
+            if (!aot_compile_simd_i16x8_abs(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i32x4_abs:
+          {
+            if (!aot_compile_simd_i32x4_abs(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+
+          case SIMD_i8x16_avgr_u:
+          {
+            if (!aot_compile_simd_i8x16_avgr_u(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_avgr_u:
+          {
+            if (!aot_compile_simd_i16x8_avgr_u(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+
+          case SIMD_i8x16_any_true:
+          {
+            if (!aot_compile_simd_i8x16_any_true(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_any_true:
+          {
+            if (!aot_compile_simd_i16x8_any_true(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i32x4_any_true:
+          {
+            if (!aot_compile_simd_i32x4_any_true(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i8x16_all_true:
+          {
+            if (!aot_compile_simd_i8x16_all_true(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_all_true:
+          {
+            if (!aot_compile_simd_i16x8_all_true(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i32x4_all_true:
+          {
+            if (!aot_compile_simd_i32x4_all_true(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i8x16_bitmask:
+          {
+            if (!aot_compile_simd_i8x16_bitmask(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_bitmask:
+          {
+            if (!aot_compile_simd_i16x8_bitmask(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_i32x4_bitmask:
+          {
+            if (!aot_compile_simd_i32x4_bitmask(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+
+          case SIMD_i8x16_shl:
+          case SIMD_i8x16_shr_s:
+          case SIMD_i8x16_shr_u:
+          {
+            if (!aot_compile_simd_i8x16_shift(comp_ctx, func_ctx,
+                                              INT_SHL + opcode - SIMD_i8x16_shl))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_shl:
+          case SIMD_i16x8_shr_s:
+          case SIMD_i16x8_shr_u:
+          {
+            if (!aot_compile_simd_i16x8_shift(comp_ctx, func_ctx,
+                                              INT_SHL + opcode - SIMD_i16x8_shl))
+              return false;
+            break;
+          }
+          case SIMD_i32x4_shl:
+          case SIMD_i32x4_shr_s:
+          case SIMD_i32x4_shr_u:
+          {
+            if (!aot_compile_simd_i32x4_shift(comp_ctx, func_ctx,
+                                              INT_SHL + opcode - SIMD_i32x4_shl))
+              return false;
+            break;
+          }
+          case SIMD_i64x2_shl:
+          case SIMD_i64x2_shr_s:
+          case SIMD_i64x2_shr_u:
+          {
+            if (!aot_compile_simd_i64x2_shift(comp_ctx, func_ctx,
+                                              INT_SHL + opcode - SIMD_i64x2_shl))
+              return false;
+            break;
+          }
+
+          case SIMD_i8x16_narrow_i16x8_s:
+          case SIMD_i8x16_narrow_i16x8_u:
+          {
+            bool is_signed = (opcode == SIMD_i8x16_narrow_i16x8_s)
+                             ? true : false;
+            if (!aot_compile_simd_i8x16_narrow_i16x8(comp_ctx, func_ctx,
+                                                     is_signed))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_narrow_i32x4_s:
+          case SIMD_i16x8_narrow_i32x4_u:
+          {
+            bool is_signed = (opcode == SIMD_i16x8_narrow_i32x4_s)
+                             ? true : false;
+            if (!aot_compile_simd_i16x8_narrow_i32x4(comp_ctx, func_ctx,
+                                                     is_signed))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_widen_low_i8x16_s:
+          case SIMD_i16x8_widen_high_i8x16_s:
+          {
+            bool is_low = (opcode == SIMD_i16x8_widen_low_i8x16_s)
+                          ? true : false;
+            if (!aot_compile_simd_i16x8_widen_i8x16(comp_ctx, func_ctx,
+                                                    is_low, true))
+              return false;
+            break;
+          }
+          case SIMD_i16x8_widen_low_i8x16_u:
+          case SIMD_i16x8_widen_high_i8x16_u:
+          {
+            bool is_low = (opcode == SIMD_i16x8_widen_low_i8x16_u)
+                          ? true : false;
+            if (!aot_compile_simd_i16x8_widen_i8x16(comp_ctx, func_ctx,
+                                                    is_low, false))
+              return false;
+            break;
+          }
+          case SIMD_i32x4_widen_low_i16x8_s:
+          case SIMD_i32x4_widen_high_i16x8_s:
+          {
+            bool is_low = (opcode == SIMD_i32x4_widen_low_i16x8_s)
+                          ? true : false;
+            if (!aot_compile_simd_i32x4_widen_i16x8(comp_ctx, func_ctx,
+                                                    is_low, true))
+              return false;
+            break;
+          }
+          case SIMD_i32x4_widen_low_i16x8_u:
+          case SIMD_i32x4_widen_high_i16x8_u:
+          {
+            bool is_low = (opcode == SIMD_i32x4_widen_low_i16x8_u)
+                          ? true : false;
+            if (!aot_compile_simd_i32x4_widen_i16x8(comp_ctx, func_ctx,
+                                                    is_low, false))
+              return false;
+            break;
+          }
+
+          case SIMD_i32x4_trunc_sat_f32x4_s:
+          case SIMD_i32x4_trunc_sat_f32x4_u:
+          {
+            bool is_signed = (opcode == SIMD_i32x4_trunc_sat_f32x4_s)
+                             ? true : false;
+            if (!aot_compile_simd_i32x4_trunc_sat_f32x4(comp_ctx, func_ctx,
+                                                        is_signed))
+              return false;
+            break;
+          }
+          case SIMD_f32x4_convert_i32x4_s:
+          case SIMD_f32x4_convert_i32x4_u:
+          {
+            bool is_signed = (opcode == SIMD_f32x4_convert_i32x4_s)
+                             ? true : false;
+            if (!aot_compile_simd_f32x4_convert_i32x4(comp_ctx, func_ctx,
+                                                      is_signed))
+              return false;
+            break;
+          }
+
+          case SIMD_f32x4_add:
+          case SIMD_f32x4_sub:
+          case SIMD_f32x4_mul:
+          case SIMD_f32x4_div:
+          case SIMD_f32x4_min:
+          case SIMD_f32x4_max:
+          {
+            if (!aot_compile_simd_f32x4_arith(comp_ctx, func_ctx,
+                                              FLOAT_ADD + opcode - SIMD_f32x4_add))
+              return false;
+            break;
+          }
+          case SIMD_f64x2_add:
+          case SIMD_f64x2_sub:
+          case SIMD_f64x2_mul:
+          case SIMD_f64x2_div:
+          case SIMD_f64x2_min:
+          case SIMD_f64x2_max:
+          {
+            if (!aot_compile_simd_f64x2_arith(comp_ctx, func_ctx,
+                                              FLOAT_ADD + opcode - SIMD_f64x2_add))
+              return false;
+            break;
+          }
+
+          case SIMD_f32x4_neg:
+          {
+            if (!aot_compile_simd_f32x4_neg(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_f64x2_neg:
+          {
+            if (!aot_compile_simd_f64x2_neg(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_f32x4_abs:
+          {
+            if (!aot_compile_simd_f32x4_abs(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_f64x2_abs:
+          {
+            if (!aot_compile_simd_f64x2_abs(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_f32x4_sqrt:
+          {
+            if (!aot_compile_simd_f32x4_sqrt(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+          case SIMD_f64x2_sqrt:
+          {
+            if (!aot_compile_simd_f64x2_sqrt(comp_ctx, func_ctx))
+              return false;
+            break;
+          }
+
+          default:
+            break;
+        }
+        break;
+      }
+#endif /* end of WASM_ENABLE_SIMD */
+
       default:
         aot_set_last_error("unsupported opcode");
         break;

+ 55 - 0
core/iwasm/compilation/aot_compiler.h

@@ -46,12 +46,35 @@ typedef enum IntArithmetic {
   INT_REM_U
 } IntArithmetic;
 
+typedef enum V128Arithmetic { /* operation selector handed to the aot_compile_simd_* helpers */
+  V128_ADD = 0, /* passed for the *_add opcodes and, with a signedness flag, for *_add_saturate_{s,u} */
+  V128_ADD_SATURATE_S, /* NOTE(review): the SIMD opcode dispatch in aot_compiler.c never passes this value - confirm its users */
+  V128_ADD_SATURATE_U, /* NOTE(review): same remark as V128_ADD_SATURATE_S */
+  V128_SUB, /* passed for the *_sub opcodes and, with a signedness flag, for *_sub_saturate_{s,u} */
+  V128_SUB_SATURATE_S, /* NOTE(review): same remark as V128_ADD_SATURATE_S */
+  V128_SUB_SATURATE_U, /* NOTE(review): same remark as V128_ADD_SATURATE_S */
+  V128_MUL, /* passed for i16x8/i32x4/i64x2 mul */
+  V128_DIV, /* NOTE(review): not passed by the SIMD opcode dispatch; f32x4/f64x2 div is mapped through FLOAT_ADD + offset there */
+  V128_NEG, /* NOTE(review): the *_neg opcodes call dedicated *_neg helpers without an operation argument */
+  V128_MIN, /* passed, with a signedness flag, for the integer *_min_{s,u} opcodes */
+  V128_MAX, /* passed, with a signedness flag, for the integer *_max_{s,u} opcodes */
+} V128Arithmetic;
+
 typedef enum IntBitwise {
   INT_AND = 0,
   INT_OR,
   INT_XOR,
 } IntBitwise;
 
+typedef enum V128Bitwise { /* keep in the same relative order as the SIMD_v128_* opcodes (see V128_NOT) */
+  V128_NOT, /* implicitly 0; the SIMD opcode dispatch computes V128_NOT + opcode - SIMD_v128_not */
+  V128_AND,
+  V128_ANDNOT,
+  V128_OR,
+  V128_XOR,
+  V128_BITSELECT /* last value the dispatch maps to (from SIMD_v128_bitselect) */
+} V128Bitwise;
+
 typedef enum IntShift {
   INT_SHL = 0,
   INT_SHR_S,
@@ -123,6 +146,7 @@ typedef enum FloatArithmetic {
 #define POP_I64(v) POP(v, VALUE_TYPE_I64)
 #define POP_F32(v) POP(v, VALUE_TYPE_F32)
 #define POP_F64(v) POP(v, VALUE_TYPE_F64)
+#define POP_V128(v) POP(v, VALUE_TYPE_V128) /* v128 member of the POP_I32/POP_I64/POP_F32/POP_F64 family */
 
 #define POP_COND(llvm_value) do {                           \
     AOTValue *aot_value;                                    \
@@ -172,6 +196,7 @@ typedef enum FloatArithmetic {
 #define PUSH_I64(v) PUSH(v, VALUE_TYPE_I64)
 #define PUSH_F32(v) PUSH(v, VALUE_TYPE_F32)
 #define PUSH_F64(v) PUSH(v, VALUE_TYPE_F64)
+#define PUSH_V128(v) PUSH(v, VALUE_TYPE_V128) /* v128 member of the PUSH_I32/PUSH_I64/PUSH_F32/PUSH_F64 family */
 #define PUSH_COND(v) PUSH(v, VALUE_TYPE_I1)
 
 #define TO_LLVM_TYPE(wasm_type) \
@@ -218,6 +243,36 @@ typedef enum FloatArithmetic {
 #define I64_63     (comp_ctx->llvm_consts.i64_63)
 #define I64_64     (comp_ctx->llvm_consts.i64_64)
 
+#define V128_TYPE       comp_ctx->basic_types.v128_type /* all V128_* / TO_V128_* macros need a `comp_ctx` variable in scope */
+#define V128_PTR_TYPE   comp_ctx->basic_types.v128_ptr_type
+#define V128_i8x16_TYPE comp_ctx->basic_types.i8x16_vec_type
+#define V128_i16x8_TYPE comp_ctx->basic_types.i16x8_vec_type
+#define V128_i32x4_TYPE comp_ctx->basic_types.i32x4_vec_type
+#define V128_i64x2_TYPE comp_ctx->basic_types.i64x2_vec_type
+#define V128_f32x4_TYPE comp_ctx->basic_types.f32x4_vec_type
+#define V128_f64x2_TYPE comp_ctx->basic_types.f64x2_vec_type
+/* Shorthands for the per-layout zero values stored in comp_ctx->llvm_consts */
+#define V128_ZERO       (comp_ctx->llvm_consts.v128_zero)
+#define V128_i8x16_ZERO (comp_ctx->llvm_consts.i8x16_vec_zero)
+#define V128_i16x8_ZERO (comp_ctx->llvm_consts.i16x8_vec_zero)
+#define V128_i32x4_ZERO (comp_ctx->llvm_consts.i32x4_vec_zero)
+#define V128_i64x2_ZERO (comp_ctx->llvm_consts.i64x2_vec_zero)
+#define V128_f32x4_ZERO (comp_ctx->llvm_consts.f32x4_vec_zero)
+#define V128_f64x2_ZERO (comp_ctx->llvm_consts.f64x2_vec_zero)
+/* Bitcast `v` to one lane layout; each expands to the bare LLVMBuildBitCast call, so checking the result is up to the caller */
+#define TO_V128_i8x16(v) LLVMBuildBitCast(comp_ctx->builder, v, \
+                                          V128_i8x16_TYPE, "i8x16_val")
+#define TO_V128_i16x8(v) LLVMBuildBitCast(comp_ctx->builder, v, \
+                                          V128_i16x8_TYPE, "i16x8_val")
+#define TO_V128_i32x4(v) LLVMBuildBitCast(comp_ctx->builder, v, \
+                                          V128_i32x4_TYPE, "i32x4_val")
+#define TO_V128_i64x2(v) LLVMBuildBitCast(comp_ctx->builder, v, \
+                                          V128_i64x2_TYPE, "i64x2_val")
+#define TO_V128_f32x4(v) LLVMBuildBitCast(comp_ctx->builder, v, \
+                                          V128_f32x4_TYPE, "f32x4_val")
+#define TO_V128_f64x2(v) LLVMBuildBitCast(comp_ctx->builder, v, \
+                                          V128_f64x2_TYPE, "f64x2_val")
+
 #define CHECK_LLVM_CONST(v) do {                        \
     if (!v) {                                           \
       aot_set_last_error("create llvm const failed.");  \

+ 41 - 4
core/iwasm/compilation/aot_emit_aot_file.c

@@ -299,9 +299,14 @@ get_import_global_info_size(AOTCompData *comp_data)
 static uint32
 get_global_size(AOTGlobal *global)
 {
-    /* type (1 byte) + is_mutable (1 byte)
-       + init expr type (2 byes) + init expr value (8 byes) */
-    return sizeof(uint8) * 2 + sizeof(uint16) + sizeof(uint64);
+    if (global->init_expr.init_expr_type != INIT_EXPR_TYPE_V128_CONST)
+        /* Same fields aot_emit_global_info() writes: type (1 byte) + is_mutable (1 byte)
+           + init expr type (2 bytes) + init expr value (8 bytes) */
+        return sizeof(uint8) * 2 + sizeof(uint16) + sizeof(uint64);
+    else
+        /* v128 initializer: type (1 byte) + is_mutable (1 byte)
+           + init expr type (2 bytes) + v128 value (16 bytes) */
+        return sizeof(uint8) * 2 + sizeof(uint16) + sizeof(uint64) * 2;
 }
 
 static uint32
@@ -800,10 +805,28 @@ exchange_uint32(uint8 *p_data)
 static void
 exchange_uint64(uint8 *pData)
 {
+    uint8 value[4]; /* scratch copy of bytes 0..3 while the halves trade places */
+    /* Swap the two 4-byte halves of the 8 bytes at pData in place, then hand each half to exchange_uint32 (presumably a 4-byte byte swap - its body is outside this hunk) */
+    memcpy(value, pData, 4); /* memcpy instead of *(uint32 *): pData has no alignment guarantee and may not point at a uint32 object */
+    memcpy(pData, pData + 4, 4);
+    memcpy(pData + 4, value, 4);
     exchange_uint32(pData);
     exchange_uint32(pData + 4);
 }
 
+static void
+exchange_uint128(uint8 *pData)
+{
+    uint8 value[8]; /* scratch copy of bytes 0..7; pData must address 16 writable bytes */
+    /* swap the high and low 64-bit halves bytewise, so no uint64 alignment or aliasing is assumed */
+    memcpy(value, pData, 8);
+    memcpy(pData, pData + 8, 8);
+    memcpy(pData + 8, value, 8);
+    /* then apply exchange_uint64 to bytes 0..7 ... */
+    exchange_uint64(pData);
+    /* ... and to bytes 8..15 */
+    exchange_uint64(pData + 8);
+}
+
 static union {
     int a;
     char b;
@@ -851,6 +874,17 @@ static union {
     offset += (uint32)sizeof(uint64);       \
   } while (0)
 
+#define EMIT_V128(v)  do { /* write the 16 bytes of (v).i64x2 to buf at offset */ \
+    uint64 t[2] = { (uint64)(v).i64x2[0], (uint64)(v).i64x2[1] };              \
+    CHECK_BUF(16);                                                              \
+    if (!is_little_endian()) /* swap the local copy, so (v) is left untouched */ \
+        exchange_uint128((uint8 *)t); /* the data itself, not the address of t */ \
+    PUT_U64_TO_ADDR(buf + offset, t[0]);                                        \
+    offset += (uint32)sizeof(uint64);                                           \
+    PUT_U64_TO_ADDR(buf + offset, t[1]);                                        \
+    offset += (uint32)sizeof(uint64);                                           \
+  } while (0)
+
 #define EMIT_BUF(v, len)  do {              \
     CHECK_BUF(len);                         \
     memcpy(buf + offset, v, len);           \
@@ -1093,7 +1127,10 @@ aot_emit_global_info(uint8 *buf, uint8 *buf_end, uint32 *p_offset,
         EMIT_U8(global->type);
         EMIT_U8(global->is_mutable);
         EMIT_U16(global->init_expr.init_expr_type);
-        EMIT_U64(global->init_expr.u.i64);
+        if (global->init_expr.init_expr_type != INIT_EXPR_TYPE_V128_CONST)
+            EMIT_U64(global->init_expr.u.i64);
+        else
+            EMIT_V128(global->init_expr.u.v128);
     }
 
     if (offset - *p_offset != get_global_info_size(comp_data)) {

+ 13 - 23
core/iwasm/compilation/aot_emit_control.c

@@ -96,11 +96,17 @@ format_block_name(char *name, uint32 name_size,
     }                                                               \
   } while (0)
 
-#define ADD_TO_RESULT_PHIS(block, value, idx) do {           \
-    LLVMBasicBlockRef block_curr = CURR_BLOCK();             \
-    LLVMAddIncoming(block->result_phis[idx],                 \
-                    &value, &block_curr, 1);                 \
-  } while (0)
+#define ADD_TO_RESULT_PHIS(block, value, idx) do { /* `value` must be an lvalue: &value is taken below */ \
+    LLVMBasicBlockRef block_curr = CURR_BLOCK();                          \
+    LLVMTypeRef phi_ty = LLVMTypeOf(block->result_phis[idx]);             \
+    LLVMTypeRef value_ty = LLVMTypeOf(value);                             \
+    bh_assert(LLVMGetTypeKind(phi_ty) == LLVMGetTypeKind(value_ty)); /* phi and value must agree on type kind */ \
+    bh_assert(LLVMGetTypeContext(phi_ty)                                  \
+              == LLVMGetTypeContext(value_ty)); /* ... and belong to the same LLVM context */ \
+    LLVMAddIncoming(block->result_phis[idx], &value, &block_curr, 1); /* one (value, block_curr) incoming pair */ \
+    (void)phi_ty; /* keeps the local referenced in case bh_assert() expands to nothing */ \
+    (void)value_ty;                                                       \
+ } while (0)
 
 #define BUILD_ICMP(op, left, right, res, name) do {     \
     if (!(res = LLVMBuildICmp(comp_ctx->builder, op,    \
@@ -686,24 +692,8 @@ check_suspend_flags(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 
     /* Move builder to terminate block */
     SET_BUILDER_POS(terminate_block);
-    if (aot_func_type->result_count) {
-        switch (aot_func_type->types[aot_func_type->param_count]) {
-            case VALUE_TYPE_I32:
-                LLVMBuildRet(comp_ctx->builder, I32_ZERO);
-                break;
-            case VALUE_TYPE_I64:
-                LLVMBuildRet(comp_ctx->builder, I64_ZERO);
-                break;
-            case VALUE_TYPE_F32:
-                LLVMBuildRet(comp_ctx->builder, F32_ZERO);
-                break;
-            case VALUE_TYPE_F64:
-                LLVMBuildRet(comp_ctx->builder, F64_ZERO);
-                break;
-        }
-    }
-    else {
-        LLVMBuildRetVoid(comp_ctx->builder);
+    if (!aot_build_zero_function_ret(comp_ctx, aot_func_type)) {
+        goto fail;
     }
 
     /* Move builder to terminate block */

+ 4 - 22
core/iwasm/compilation/aot_emit_exception.c

@@ -53,10 +53,8 @@ aot_emit_exception(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                                  func_ctx->got_exception_block);
 
         /* Create exception id phi */
-        if (!(func_ctx->exception_id_phi =
-                LLVMBuildPhi(comp_ctx->builder,
-                             comp_ctx->basic_types.int32_type,
-                             "exception_id_phi"))) {
+        if (!(func_ctx->exception_id_phi = LLVMBuildPhi(
+                comp_ctx->builder, I32_TYPE, "exception_id_phi"))) {
             aot_set_last_error("llvm build phi failed.");
             return false;
         }
@@ -110,24 +108,8 @@ aot_emit_exception(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
         /* Create return IR */
         AOTFuncType *aot_func_type = func_ctx->aot_func->func_type;
-        if (aot_func_type->result_count) {
-            switch (aot_func_type->types[aot_func_type->param_count]) {
-                case VALUE_TYPE_I32:
-                    LLVMBuildRet(comp_ctx->builder, I32_ZERO);
-                    break;
-                case VALUE_TYPE_I64:
-                    LLVMBuildRet(comp_ctx->builder, I64_ZERO);
-                    break;
-                case VALUE_TYPE_F32:
-                    LLVMBuildRet(comp_ctx->builder, F32_ZERO);
-                    break;
-                case VALUE_TYPE_F64:
-                    LLVMBuildRet(comp_ctx->builder, F64_ZERO);
-                    break;
-            }
-        }
-        else {
-            LLVMBuildRetVoid(comp_ctx->builder);
+        if (!aot_build_zero_function_ret(comp_ctx, aot_func_type)) {
+            return false;
         }
 
         /* Resume the builder position */

+ 2 - 18
core/iwasm/compilation/aot_emit_function.c

@@ -25,24 +25,8 @@ create_func_return_block(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 
         /* Create return IR */
         LLVMPositionBuilderAtEnd(comp_ctx->builder, func_ctx->func_return_block);
-        if (aot_func_type->result_count) {
-            switch (aot_func_type->types[aot_func_type->param_count]) {
-                case VALUE_TYPE_I32:
-                    LLVMBuildRet(comp_ctx->builder, I32_ZERO);
-                    break;
-                case VALUE_TYPE_I64:
-                    LLVMBuildRet(comp_ctx->builder, I64_ZERO);
-                    break;
-                case VALUE_TYPE_F32:
-                    LLVMBuildRet(comp_ctx->builder, F32_ZERO);
-                    break;
-                case VALUE_TYPE_F64:
-                    LLVMBuildRet(comp_ctx->builder, F64_ZERO);
-                    break;
-            }
-        }
-        else {
-            LLVMBuildRetVoid(comp_ctx->builder);
+        if (!aot_build_zero_function_ret(comp_ctx, aot_func_type)) {
+            return false;
         }
     }
 

+ 22 - 51
core/iwasm/compilation/aot_emit_memory.c

@@ -53,6 +53,9 @@ get_memory_check_bound(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         case 8:
             mem_check_bound = func_ctx->mem_info[0].mem_bound_check_8bytes;
             break;
+        case 16:
+            mem_check_bound = func_ctx->mem_info[0].mem_bound_check_16bytes;
+            break;
         default:
             bh_assert(0);
             return NULL;
@@ -73,9 +76,9 @@ get_memory_check_bound(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 static LLVMValueRef
 get_memory_size(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);
 
-static LLVMValueRef
-check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
-                      uint32 offset, uint32 bytes)
+LLVMValueRef
+aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                          uint32 offset, uint32 bytes)
 {
     LLVMValueRef offset_const = I32_CONST(offset);
     LLVMValueRef addr, maddr, offset1, cmp1, cmp2, cmp;
@@ -348,7 +351,7 @@ aot_compile_op_i32_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 {
     LLVMValueRef maddr, value = NULL;
 
-    if (!(maddr = check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
         return false;
 
     switch (bytes) {
@@ -400,7 +403,7 @@ aot_compile_op_i64_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 {
     LLVMValueRef maddr, value = NULL;
 
-    if (!(maddr = check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
         return false;
 
     switch (bytes) {
@@ -454,7 +457,7 @@ aot_compile_op_f32_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 {
     LLVMValueRef maddr, value;
 
-    if (!(maddr = check_memory_overflow(comp_ctx, func_ctx, offset, 4)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4)))
         return false;
 
     BUILD_PTR_CAST(F32_PTR_TYPE);
@@ -471,7 +474,7 @@ aot_compile_op_f64_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 {
     LLVMValueRef maddr, value;
 
-    if (!(maddr = check_memory_overflow(comp_ctx, func_ctx, offset, 8)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8)))
         return false;
 
     BUILD_PTR_CAST(F64_PTR_TYPE);
@@ -490,7 +493,7 @@ aot_compile_op_i32_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     POP_I32(value);
 
-    if (!(maddr = check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
         return false;
 
     switch (bytes) {
@@ -529,7 +532,7 @@ aot_compile_op_i64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     POP_I64(value);
 
-    if (!(maddr = check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
         return false;
 
     switch (bytes) {
@@ -572,7 +575,7 @@ aot_compile_op_f32_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     POP_F32(value);
 
-    if (!(maddr = check_memory_overflow(comp_ctx, func_ctx, offset, 4)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4)))
         return false;
 
     BUILD_PTR_CAST(F32_PTR_TYPE);
@@ -590,7 +593,7 @@ aot_compile_op_f64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     POP_F64(value);
 
-    if (!(maddr = check_memory_overflow(comp_ctx, func_ctx, offset, 8)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8)))
         return false;
 
     BUILD_PTR_CAST(F64_PTR_TYPE);
@@ -877,24 +880,8 @@ aot_compile_op_memory_init(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     /* If memory.init failed, return from this function
         so the runtime can catch the exception */
     LLVMPositionBuilderAtEnd(comp_ctx->builder, mem_init_fail);
-    if (aot_func_type->result_count) {
-        switch (aot_func_type->types[aot_func_type->param_count]) {
-            case VALUE_TYPE_I32:
-                LLVMBuildRet(comp_ctx->builder, I32_ZERO);
-                break;
-            case VALUE_TYPE_I64:
-                LLVMBuildRet(comp_ctx->builder, I64_ZERO);
-                break;
-            case VALUE_TYPE_F32:
-                LLVMBuildRet(comp_ctx->builder, F32_ZERO);
-                break;
-            case VALUE_TYPE_F64:
-                LLVMBuildRet(comp_ctx->builder, F64_ZERO);
-                break;
-        }
-    }
-    else {
-        LLVMBuildRetVoid(comp_ctx->builder);
+    if (!aot_build_zero_function_ret(comp_ctx, aot_func_type)) {
+        goto fail;
     }
 
     LLVMPositionBuilderAtEnd(comp_ctx->builder, init_success);
@@ -1002,7 +989,7 @@ aot_compile_op_atomic_rmw(AOTCompContext *comp_ctx,
     else
         POP_I64(value);
 
-    if (!(maddr = check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1076,7 +1063,7 @@ aot_compile_op_atomic_cmpxchg(AOTCompContext *comp_ctx,
         POP_I64(expect);
     }
 
-    if (!(maddr = check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1175,7 +1162,7 @@ aot_compile_op_atomic_wait(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     CHECK_LLVM_CONST(is_wait64);
 
-    if (!(maddr = check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1219,24 +1206,8 @@ aot_compile_op_atomic_wait(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     /* If atomic wait failed, return from this function
         so the runtime can catch the exception */
     LLVMPositionBuilderAtEnd(comp_ctx->builder, wait_fail);
-    if (aot_func_type->result_count) {
-        switch (aot_func_type->types[aot_func_type->param_count]) {
-            case VALUE_TYPE_I32:
-                LLVMBuildRet(comp_ctx->builder, I32_ZERO);
-                break;
-            case VALUE_TYPE_I64:
-                LLVMBuildRet(comp_ctx->builder, I64_ZERO);
-                break;
-            case VALUE_TYPE_F32:
-                LLVMBuildRet(comp_ctx->builder, F32_ZERO);
-                break;
-            case VALUE_TYPE_F64:
-                LLVMBuildRet(comp_ctx->builder, F64_ZERO);
-                break;
-        }
-    }
-    else {
-        LLVMBuildRetVoid(comp_ctx->builder);
+    if (!aot_build_zero_function_ret(comp_ctx, aot_func_type)) {
+        goto fail;
     }
 
     LLVMPositionBuilderAtEnd(comp_ctx->builder, wait_success);
@@ -1259,7 +1230,7 @@ aot_compiler_op_atomic_notify(AOTCompContext *comp_ctx,
 
     POP_I32(count);
 
-    if (!(maddr = check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
+    if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))

+ 4 - 0
core/iwasm/compilation/aot_emit_memory.h

@@ -49,6 +49,10 @@ bool
 aot_compile_op_f64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                          uint32 align, uint32 offset);
 
+LLVMValueRef
+aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                          uint32 offset, uint32 bytes);
+
 bool
 aot_compile_op_memory_size(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);
 

+ 29 - 150
core/iwasm/compilation/aot_emit_numberic.c

@@ -135,110 +135,6 @@
 } while (0)
 
 
-static LLVMValueRef
-__call_llvm_intrinsic(AOTCompContext *comp_ctx,
-                      const char *name,
-                      LLVMTypeRef ret_type,
-                      LLVMTypeRef *param_types,
-                      int param_count,
-                      LLVMValueRef *param_values)
-{
-    LLVMValueRef func, ret;
-    LLVMTypeRef func_type;
-
-    /* Declare llvm intrinsic function if necessary */
-    if (!(func = LLVMGetNamedFunction(comp_ctx->module, name))) {
-        if (!(func_type =
-                LLVMFunctionType(ret_type, param_types, (uint32)param_count, false))) {
-            aot_set_last_error("create LLVM function type failed.");
-            return NULL;
-        }
-
-        if (!(func = LLVMAddFunction(comp_ctx->module, name, func_type))) {
-            aot_set_last_error("add LLVM function failed.");
-            return NULL;
-        }
-    }
-
-    /* Call the LLVM intrinsic function */
-    if (!(ret = LLVMBuildCall(comp_ctx->builder, func, param_values,
-                              (uint32)param_count, "call"))) {
-        aot_set_last_error("llvm build call failed.");
-        return NULL;
-    }
-
-    return ret;
-}
-
-static LLVMValueRef
-call_llvm_intrinsic(AOTCompContext *comp_ctx,
-                    const char *name,
-                    LLVMTypeRef ret_type,
-                    LLVMTypeRef *param_types,
-                    int param_count,
-                    ...)
-{
-    LLVMValueRef *param_values, ret;
-    va_list argptr;
-    uint64 total_size;
-    int i = 0;
-
-    /* Create param values */
-    total_size = sizeof(LLVMValueRef) * (uint64)param_count;
-    if (total_size >= UINT32_MAX
-        || !(param_values = wasm_runtime_malloc((uint32)total_size))) {
-        aot_set_last_error("allocate memory for param values failed.");
-        return false;
-    }
-
-    /* Load each param value */
-    va_start(argptr, param_count);
-    while (i < param_count)
-        param_values[i++] = va_arg(argptr, LLVMValueRef);
-    va_end(argptr);
-
-    ret = __call_llvm_intrinsic(comp_ctx, name, ret_type,
-                                param_types, param_count,
-                                param_values);
-
-    wasm_runtime_free(param_values);
-
-    return ret;
-}
-
-static LLVMValueRef
-call_llvm_intrinsic_v(AOTCompContext *comp_ctx,
-                      const char *name,
-                      LLVMTypeRef ret_type,
-                      LLVMTypeRef *param_types,
-                      int param_count,
-                      va_list param_value_list)
-{
-    LLVMValueRef *param_values, ret;
-    uint64 total_size;
-    int i = 0;
-
-    /* Create param values */
-    total_size = sizeof(LLVMValueRef) * (uint64)param_count;
-    if (total_size >= UINT32_MAX
-        || !(param_values = wasm_runtime_malloc((uint32)total_size))) {
-        aot_set_last_error("allocate memory for param values failed.");
-        return false;
-    }
-
-    /* Load each param value */
-    while (i < param_count)
-        param_values[i++] = va_arg(param_value_list, LLVMValueRef);
-
-    ret = __call_llvm_intrinsic(comp_ctx, name, ret_type,
-                                param_types, param_count,
-                                param_values);
-
-    wasm_runtime_free(param_values);
-
-    return ret;
-}
-
 /* Call llvm constrained floating-point intrinsic */
 static LLVMValueRef
 call_llvm_float_experimental_constrained_intrinsic(AOTCompContext *comp_ctx,
@@ -255,12 +151,8 @@ call_llvm_float_experimental_constrained_intrinsic(AOTCompContext *comp_ctx,
 
     va_start(param_value_list, intrinsic);
 
-    ret = call_llvm_intrinsic_v(comp_ctx,
-                                intrinsic,
-                                ret_type,
-                                param_types,
-                                4,
-                                param_value_list);
+    ret = aot_call_llvm_intrinsic_v(comp_ctx, intrinsic, ret_type, param_types,
+                                    4, param_value_list);
 
     va_end(param_value_list);
 
@@ -283,12 +175,8 @@ call_llvm_libm_experimental_constrained_intrinsic(AOTCompContext *comp_ctx,
 
     va_start(param_value_list, intrinsic);
 
-    ret = call_llvm_intrinsic_v(comp_ctx,
-                                intrinsic,
-                                ret_type,
-                                param_types,
-                                3,
-                                param_value_list);
+    ret = aot_call_llvm_intrinsic_v(comp_ctx, intrinsic, ret_type, param_types,
+                                    3, param_value_list);
 
     va_end(param_value_list);
 
@@ -342,13 +230,8 @@ compile_op_float_min_max(AOTCompContext *comp_ctx,
         return NULL;
     }
 
-    if (!(cmp = call_llvm_intrinsic(comp_ctx,
-                                    intrinsic,
-                                    ret_type,
-                                    param_types,
-                                    2,
-                                    left,
-                                    right)))
+    if (!(cmp = aot_call_llvm_intrinsic(comp_ctx, intrinsic, ret_type,
+                                        param_types, 2, left, right)))
         return NULL;
 
     if (!(cmp = LLVMBuildSelect(comp_ctx->builder,
@@ -406,21 +289,21 @@ aot_compile_int_bit_count(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     /* Call the LLVM intrinsic function */
     if (type < POP_CNT32)
-        DEF_INT_UNARY_OP(call_llvm_intrinsic(comp_ctx,
-                                             bit_cnt_llvm_intrinsic[type],
-                                             ret_type,
-                                             param_types,
-                                             2,
-                                             operand,
-                                             zero_undef),
+        DEF_INT_UNARY_OP(aot_call_llvm_intrinsic(comp_ctx,
+                                                 bit_cnt_llvm_intrinsic[type],
+                                                 ret_type,
+                                                 param_types,
+                                                 2,
+                                                 operand,
+                                                 zero_undef),
                          NULL);
     else
-        DEF_INT_UNARY_OP(call_llvm_intrinsic(comp_ctx,
-                                             bit_cnt_llvm_intrinsic[type],
-                                             ret_type,
-                                             param_types,
-                                             1,
-                                             operand),
+        DEF_INT_UNARY_OP(aot_call_llvm_intrinsic(comp_ctx,
+                                                 bit_cnt_llvm_intrinsic[type],
+                                                 ret_type,
+                                                 param_types,
+                                                 1,
+                                                 operand),
                          NULL);
 
     return true;
@@ -1032,12 +915,8 @@ call_llvm_float_math_intrinsic(AOTCompContext *comp_ctx,
 
     va_start(param_value_list, intrinsic);
 
-    ret = call_llvm_intrinsic_v(comp_ctx,
-                                intrinsic,
-                                ret_type,
-                                &param_type,
-                                1,
-                                param_value_list);
+    ret = aot_call_llvm_intrinsic_v(comp_ctx, intrinsic, ret_type, &param_type,
+                                    1, param_value_list);
 
     va_end(param_value_list);
 
@@ -1133,14 +1012,14 @@ compile_float_copysign(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     param_types[0] = param_types[1] = ret_type = is_f32 ? F32_TYPE : F64_TYPE;
 
-    DEF_FP_BINARY_OP(call_llvm_intrinsic(comp_ctx,
-                                         is_f32 ? "llvm.copysign.f32" :
-                                                  "llvm.copysign.f64",
-                                         ret_type,
-                                         param_types,
-                                         2,
-                                         left,
-                                         right),
+    DEF_FP_BINARY_OP(aot_call_llvm_intrinsic(comp_ctx,
+                                             is_f32 ? "llvm.copysign.f32" :
+                                                      "llvm.copysign.f64",
+                                             ret_type,
+                                             param_types,
+                                             2,
+                                             left,
+                                             right),
                     NULL);
     return true;
 

+ 2 - 1
core/iwasm/compilation/aot_emit_parametric.c

@@ -46,7 +46,8 @@ pop_value_from_wasm_stack(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     wasm_runtime_free(aot_value);
 
     if ((is_32
-         && (type != VALUE_TYPE_I32 && type != VALUE_TYPE_F32))
+         && (type != VALUE_TYPE_I32 && type != VALUE_TYPE_F32
+             && type != VALUE_TYPE_V128))
         || (!is_32
             && (type != VALUE_TYPE_I64 && type != VALUE_TYPE_F64))) {
         aot_set_last_error("invalid WASM stack data type.");

+ 10 - 2
core/iwasm/compilation/aot_emit_variable.c

@@ -116,7 +116,7 @@ compile_global(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         + sizeof(AOTMemoryInstance) * comp_ctx->comp_data->memory_count;
     uint32 global_offset;
     uint8 global_type;
-    LLVMValueRef offset, global_ptr, global;
+    LLVMValueRef offset, global_ptr, global, res;
     LLVMTypeRef ptr_type = NULL;
 
     bh_assert(global_idx < import_global_count + comp_data->global_count);
@@ -153,6 +153,9 @@ compile_global(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         case VALUE_TYPE_F64:
             ptr_type = comp_ctx->basic_types.float64_ptr_type;
             break;
+        case VALUE_TYPE_V128:
+            ptr_type = comp_ctx->basic_types.v128_ptr_type;
+            break;
         default:
             bh_assert(0);
             break;
@@ -170,14 +173,19 @@ compile_global(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
             aot_set_last_error("llvm build load failed.");
             return false;
         }
+        /* All globals' data is 4-byte aligned */
+        LLVMSetAlignment(global, 4);
         PUSH(global, global_type);
     }
     else {
         POP(global, global_type);
-        if (!LLVMBuildStore(comp_ctx->builder, global, global_ptr)) {
+        if (!(res = LLVMBuildStore(comp_ctx->builder,
+                                   global, global_ptr))) {
             aot_set_last_error("llvm build store failed.");
             return false;
         }
+        /* All globals' data is 4-byte aligned */
+        LLVMSetAlignment(res, 4);
     }
 
     return true;

+ 240 - 2
core/iwasm/compilation/aot_llvm.c

@@ -21,6 +21,10 @@ wasm_type_to_llvm_type(AOTLLVMTypes *llvm_types, uint8 wasm_type)
             return llvm_types->float32_type;
         case VALUE_TYPE_F64:
             return llvm_types->float64_type;
+#if WASM_ENABLE_SIMD != 0
+        case VALUE_TYPE_V128:
+            return llvm_types->i64x2_vec_type;
+#endif
         case VALUE_TYPE_VOID:
             return llvm_types->void_type;
     }
@@ -444,6 +448,31 @@ create_memory_info(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         }
     }
 
+    offset = I32_CONST(offsetof(AOTMemoryInstance, mem_bound_check_16bytes)
+                       - offsetof(AOTMemoryInstance, memory_data.ptr));
+    if (!(func_ctx->mem_info[0].mem_bound_check_16bytes =
+                LLVMBuildInBoundsGEP(comp_ctx->builder, mem_info_base,
+                                     &offset, 1, "bound_check_16bytes_offset"))) {
+        aot_set_last_error("llvm build in bounds gep failed");
+        return false;
+    }
+    if (!(func_ctx->mem_info[0].mem_bound_check_16bytes =
+                LLVMBuildBitCast(comp_ctx->builder,
+                                 func_ctx->mem_info[0].mem_bound_check_16bytes,
+                                 bound_check_type, "bound_check_16bytes_ptr"))) {
+        aot_set_last_error("llvm build bit cast failed");
+        return false;
+    }
+    if (mem_space_unchanged) {
+        if (!(func_ctx->mem_info[0].mem_bound_check_16bytes =
+                LLVMBuildLoad(comp_ctx->builder,
+                              func_ctx->mem_info[0].mem_bound_check_16bytes,
+                              "bound_check_16bytes"))) {
+            aot_set_last_error("llvm build load failed");
+            return false;
+        }
+    }
+
     return true;
 }
 
@@ -676,6 +705,11 @@ aot_create_func_context(AOTCompData *comp_data, AOTCompContext *comp_ctx,
             case VALUE_TYPE_F64:
                 local_value = F64_ZERO;
                 break;
+#if WASM_ENABLE_SIMD != 0
+            case VALUE_TYPE_V128:
+                local_value = V128_ZERO;
+                break;
+#endif
             default:
                 bh_assert(0);
                 break;
@@ -814,23 +848,55 @@ aot_set_llvm_basic_types(AOTLLVMTypes *basic_types, LLVMContextRef context)
     basic_types->float32_ptr_type = LLVMPointerType(basic_types->float32_type, 0);
     basic_types->float64_ptr_type = LLVMPointerType(basic_types->float64_type, 0);
 
+    basic_types->i8x16_vec_type = LLVMVectorType(basic_types->int8_type, 16);
+    basic_types->i16x8_vec_type = LLVMVectorType(basic_types->int16_type, 8);
+    basic_types->i32x4_vec_type = LLVMVectorType(basic_types->int32_type, 4);
+    basic_types->i64x2_vec_type = LLVMVectorType(basic_types->int64_type, 2);
+    basic_types->f32x4_vec_type = LLVMVectorType(basic_types->float32_type, 4);
+    basic_types->f64x2_vec_type = LLVMVectorType(basic_types->float64_type, 2);
+
+    basic_types->v128_type = basic_types->i64x2_vec_type;
+    basic_types->v128_ptr_type = LLVMPointerType(basic_types->v128_type, 0);
+
     return (basic_types->int8_ptr_type
             && basic_types->int16_ptr_type
             && basic_types->int32_ptr_type
             && basic_types->int64_ptr_type
             && basic_types->float32_ptr_type
             && basic_types->float64_ptr_type
+            && basic_types->i8x16_vec_type
+            && basic_types->i16x8_vec_type
+            && basic_types->i32x4_vec_type
+            && basic_types->i64x2_vec_type
+            && basic_types->f32x4_vec_type
+            && basic_types->f64x2_vec_type
             && basic_types->meta_data_type) ? true : false;
 }
 
 static bool
 aot_create_llvm_consts(AOTLLVMConsts *consts, AOTCompContext *comp_ctx)
 {
+    LLVMValueRef i64_consts[2];
+
     consts->i8_zero = I8_CONST(0);
     consts->i32_zero = I32_CONST(0);
     consts->i64_zero = I64_CONST(0);
     consts->f32_zero = F32_CONST(0);
     consts->f64_zero = F64_CONST(0);
+
+    if (consts->i64_zero) {
+        i64_consts[0] = i64_consts[1] = consts->i64_zero;
+        consts->v128_zero = consts->i64x2_vec_zero =
+                                    LLVMConstVector(i64_consts, 2);
+        if (consts->i64x2_vec_zero) {
+            consts->i8x16_vec_zero = TO_V128_i8x16(consts->i64x2_vec_zero);
+            consts->i16x8_vec_zero = TO_V128_i16x8(consts->i64x2_vec_zero);
+            consts->i32x4_vec_zero = TO_V128_i32x4(consts->i64x2_vec_zero);
+            consts->f32x4_vec_zero = TO_V128_f32x4(consts->i64x2_vec_zero);
+            consts->f64x2_vec_zero = TO_V128_f64x2(consts->i64x2_vec_zero);
+        }
+    }
+
     consts->i32_one = I32_CONST(1);
     consts->i32_two = I32_CONST(2);
     consts->i32_three = I32_CONST(3);
@@ -850,6 +916,12 @@ aot_create_llvm_consts(AOTLLVMConsts *consts, AOTCompContext *comp_ctx)
             && consts->i64_zero
             && consts->f32_zero
             && consts->f64_zero
+            && consts->i8x16_vec_zero
+            && consts->i16x8_vec_zero
+            && consts->i32x4_vec_zero
+            && consts->i64x2_vec_zero
+            && consts->f32x4_vec_zero
+            && consts->f64x2_vec_zero
             && consts->i32_one
             && consts->i32_two
             && consts->i32_three
@@ -1014,7 +1086,7 @@ aot_create_comp_context(AOTCompData *comp_data,
     /*LLVMTypeRef elem_types[8];*/
     struct LLVMMCJITCompilerOptions jit_options;
     LLVMTargetRef target;
-    char *triple = NULL, *triple_jit = NULL, *triple_norm, *arch, *abi;
+    char *triple = NULL, *triple_norm, *arch, *abi;
     char *cpu = NULL, *features, buf[128];
     char *triple_norm_new = NULL, *cpu_new = NULL;
     char *err = NULL, *fp_round= "round.tonearest", *fp_exce = "fpexcept.strict";
@@ -1065,7 +1137,12 @@ aot_create_comp_context(AOTCompData *comp_data,
     if (option->enable_tail_call)
         comp_ctx->enable_tail_call = true;
 
+    if (option->enable_simd)
+        comp_ctx->enable_simd = true;
+
     if (option->is_jit_mode) {
+        char *triple_jit = NULL;
+
         /* Create LLVM execution engine */
         LLVMInitializeMCJITCompilerOptions(&jit_options, sizeof(jit_options));
         jit_options.OptLevel = LLVMCodeGenLevelAggressive;
@@ -1186,7 +1263,8 @@ aot_create_comp_context(AOTCompData *comp_data,
             if (!cpu)
                 cpu = "";
         }
-        else { /* triple is NULL, cpu isn't NULL */
+        else {
+            /* triple is NULL, cpu isn't NULL */
             snprintf(buf, sizeof(buf),
                     "target isn't specified for cpu %s.", cpu);
             aot_set_last_error(buf);
@@ -1283,6 +1361,23 @@ aot_create_comp_context(AOTCompData *comp_data,
         }
     }
 
+    if (option->enable_simd) {
+        char *tmp;
+        bool ret;
+
+        if (!(tmp = LLVMGetTargetMachineCPU(comp_ctx->target_machine))) {
+            aot_set_last_error("get CPU from Target Machine fail");
+            goto fail;
+        }
+
+        ret = aot_check_simd_compatibility(comp_ctx->target_arch, tmp);
+        LLVMDisposeMessage(tmp);
+        if (!ret) {
+            aot_set_last_error("SIMD compatibility check failed");
+            goto fail;
+        }
+    }
+
     if (!(target_data_ref =
             LLVMCreateTargetDataLayout(comp_ctx->target_machine))) {
         aot_set_last_error("create LLVM target data layout failed.");
@@ -1349,11 +1444,13 @@ aot_create_comp_context(AOTCompData *comp_data,
 fail:
     if (triple_norm_new)
         LLVMDisposeMessage(triple_norm_new);
+
     if (cpu_new)
         LLVMDisposeMessage(cpu_new);
 
     if (!ret)
         aot_destroy_comp_context(comp_ctx);
+
     return ret;
 }
 
@@ -1567,3 +1664,144 @@ aot_checked_addr_list_destroy(AOTFuncContext *func_ctx)
     func_ctx->checked_addr_list = NULL;
 }
 
+/*
+ * Build a return instruction with the context's builder: a zero constant
+ * selected by the type stored at func_type->types[param_count] when
+ * result_count is non-zero, or a void return otherwise.
+ *
+ * Returns true on success. If no return instruction was produced, the last
+ * AOT error is set to "llvm build ret failed." and false is returned.
+ */
+bool
+aot_build_zero_function_ret(AOTCompContext *comp_ctx,
+                            AOTFuncType *func_type)
+{
+    LLVMValueRef ret = NULL;
+
+    if (func_type->result_count) {
+        /* types[] is indexed right after the parameter entries */
+        switch (func_type->types[func_type->param_count]) {
+            case VALUE_TYPE_I32:
+                ret = LLVMBuildRet(comp_ctx->builder, I32_ZERO);
+                break;
+            case VALUE_TYPE_I64:
+                ret = LLVMBuildRet(comp_ctx->builder, I64_ZERO);
+                break;
+            case VALUE_TYPE_F32:
+                ret = LLVMBuildRet(comp_ctx->builder, F32_ZERO);
+                break;
+            case VALUE_TYPE_F64:
+                ret = LLVMBuildRet(comp_ctx->builder, F64_ZERO);
+                break;
+#if WASM_ENABLE_SIMD != 0
+            case VALUE_TYPE_V128:
+                ret = LLVMBuildRet(comp_ctx->builder, V128_ZERO);
+                break;
+#endif
+            default:
+                /* no zero constant for this type: ret stays NULL and the
+                   check below reports the failure */
+                bh_assert(0);
+        }
+    }
+    else {
+        ret = LLVMBuildRetVoid(comp_ctx->builder);
+    }
+
+    if (!ret) {
+        aot_set_last_error("llvm build ret failed.");
+        return false;
+    }
+    return true;
+}
+
+/*
+ * Shared worker for the aot_call_llvm_intrinsic* entry points.
+ *
+ * Looks up the function called `name` in comp_ctx->module; if it is not
+ * there yet, declares it with a non-variadic function type built from
+ * ret_type and param_types. Then emits a call with the param_count values
+ * in param_values.
+ *
+ * Note that when the function already exists in the module, ret_type and
+ * param_types are not consulted.
+ *
+ * Returns the call instruction value, or NULL after setting the last AOT
+ * error.
+ */
+static LLVMValueRef
+__call_llvm_intrinsic(const AOTCompContext *comp_ctx,
+                      const char *name,
+                      LLVMTypeRef ret_type,
+                      LLVMTypeRef *param_types,
+                      int param_count,
+                      LLVMValueRef *param_values)
+{
+    LLVMValueRef func, ret;
+    LLVMTypeRef func_type;
+
+    /* Declare llvm intrinsic function if necessary */
+    if (!(func = LLVMGetNamedFunction(comp_ctx->module, name))) {
+        if (!(func_type = LLVMFunctionType(ret_type, param_types,
+                                           (uint32)param_count, false))) {
+            aot_set_last_error("create LLVM function type failed.");
+            return NULL;
+        }
+
+        if (!(func = LLVMAddFunction(comp_ctx->module, name, func_type))) {
+            aot_set_last_error("add LLVM function failed.");
+            return NULL;
+        }
+    }
+
+    /* Call the LLVM intrinsic function */
+    if (!(ret = LLVMBuildCall(comp_ctx->builder, func, param_values,
+                              (uint32)param_count, "call"))) {
+        aot_set_last_error("llvm build call failed.");
+        return NULL;
+    }
+
+    return ret;
+}
+
+/*
+ * Call the function named `name` (declaring it in the module on first use),
+ * taking the argument values as variadic arguments.
+ *
+ * comp_ctx     compilation context providing the module and builder
+ * name         name of the function to look up or declare
+ * ret_type     return type used if the function has to be declared
+ * param_types  parameter types used if the function has to be declared
+ * param_count  number of entries in param_types and of variadic arguments
+ * ...          param_count values, each of type LLVMValueRef
+ *
+ * Returns the call instruction value, or NULL on failure with the last AOT
+ * error set.
+ */
+LLVMValueRef
+aot_call_llvm_intrinsic(const AOTCompContext *comp_ctx,
+                        const char *name,
+                        LLVMTypeRef ret_type,
+                        LLVMTypeRef *param_types,
+                        int param_count,
+                        ...)
+{
+    LLVMValueRef *param_values, ret;
+    va_list argptr;
+    uint64 total_size;
+    int i = 0;
+
+    /* Create param values; the size is computed in 64 bits so that an
+       oversized (or negative) param_count is rejected instead of
+       truncated by the uint32 cast */
+    total_size = sizeof(LLVMValueRef) * (uint64)param_count;
+    if (total_size >= UINT32_MAX
+        || !(param_values = wasm_runtime_malloc((uint32)total_size))) {
+        aot_set_last_error("allocate memory for param values failed.");
+        /* this function returns a pointer-like handle, not a bool */
+        return NULL;
+    }
+
+    /* Load each param value */
+    va_start(argptr, param_count);
+    while (i < param_count)
+        param_values[i++] = va_arg(argptr, LLVMValueRef);
+    va_end(argptr);
+
+    ret = __call_llvm_intrinsic(comp_ctx, name, ret_type, param_types,
+                                param_count, param_values);
+
+    /* the value array is only needed while the call is being built */
+    wasm_runtime_free(param_values);
+
+    return ret;
+}
+
+/*
+ * va_list flavour of aot_call_llvm_intrinsic(): call the function named
+ * `name` (declaring it in the module on first use) with param_count values
+ * of type LLVMValueRef read from param_value_list.
+ *
+ * The list is consumed with va_arg but neither started nor ended here, so
+ * va_start/va_end remain the caller's responsibility.
+ *
+ * Returns the call instruction value, or NULL on failure with the last AOT
+ * error set.
+ */
+LLVMValueRef
+aot_call_llvm_intrinsic_v(const AOTCompContext *comp_ctx,
+                          const char *name,
+                          LLVMTypeRef ret_type,
+                          LLVMTypeRef *param_types,
+                          int param_count,
+                          va_list param_value_list)
+{
+    LLVMValueRef *param_values, ret;
+    uint64 total_size;
+    int i = 0;
+
+    /* Create param values; the size is computed in 64 bits so that an
+       oversized (or negative) param_count is rejected instead of
+       truncated by the uint32 cast */
+    total_size = sizeof(LLVMValueRef) * (uint64)param_count;
+    if (total_size >= UINT32_MAX
+        || !(param_values = wasm_runtime_malloc((uint32)total_size))) {
+        aot_set_last_error("allocate memory for param values failed.");
+        /* this function returns a pointer-like handle, not a bool */
+        return NULL;
+    }
+
+    /* Load each param value */
+    while (i < param_count)
+        param_values[i++] = va_arg(param_value_list, LLVMValueRef);
+
+    ret = __call_llvm_intrinsic(comp_ctx, name, ret_type, param_types,
+                                param_count, param_values);
+
+    /* the value array is only needed while the call is being built */
+    wasm_runtime_free(param_values);
+
+    return ret;
+}

+ 44 - 0
core/iwasm/compilation/aot_llvm.h

@@ -106,6 +106,7 @@ typedef struct AOTMemInfo {
   LLVMValueRef mem_bound_check_2bytes;
   LLVMValueRef mem_bound_check_4bytes;
   LLVMValueRef mem_bound_check_8bytes;
+  LLVMValueRef mem_bound_check_16bytes;
 } AOTMemInfo;
 
 typedef struct AOTFuncContext {
@@ -152,6 +153,15 @@ typedef struct AOTLLVMTypes {
   LLVMTypeRef float32_ptr_type;
   LLVMTypeRef float64_ptr_type;
 
+  LLVMTypeRef v128_type;
+  LLVMTypeRef v128_ptr_type;
+  LLVMTypeRef i8x16_vec_type;
+  LLVMTypeRef i16x8_vec_type;
+  LLVMTypeRef i32x4_vec_type;
+  LLVMTypeRef i64x2_vec_type;
+  LLVMTypeRef f32x4_vec_type;
+  LLVMTypeRef f64x2_vec_type;
+
   LLVMTypeRef meta_data_type;
 } AOTLLVMTypes;
 
@@ -161,6 +171,13 @@ typedef struct AOTLLVMConsts {
     LLVMValueRef i64_zero;
     LLVMValueRef f32_zero;
     LLVMValueRef f64_zero;
+    LLVMValueRef v128_zero;
+    LLVMValueRef i8x16_vec_zero;
+    LLVMValueRef i16x8_vec_zero;
+    LLVMValueRef i32x4_vec_zero;
+    LLVMValueRef i64x2_vec_zero;
+    LLVMValueRef f32x4_vec_zero;
+    LLVMValueRef f64x2_vec_zero;
     LLVMValueRef i32_one;
     LLVMValueRef i32_two;
     LLVMValueRef i32_three;
@@ -201,6 +218,9 @@ typedef struct AOTCompContext {
   /* Bounday Check */
   bool enable_bound_check;
 
+  /* 128-bit SIMD */
+  bool enable_simd;
+
   /* Thread Manager */
   bool enable_thread_mgr;
 
@@ -248,6 +268,7 @@ typedef struct AOTCompOption{
     bool enable_bulk_memory;
     bool enable_thread_mgr;
     bool enable_tail_call;
+    bool enable_simd;
     bool is_sgx_platform;
     uint32 opt_level;
     uint32 size_level;
@@ -309,6 +330,29 @@ aot_checked_addr_list_find(AOTFuncContext *func_ctx,
 void
 aot_checked_addr_list_destroy(AOTFuncContext *func_ctx);
 
+bool
+aot_build_zero_function_ret(AOTCompContext *comp_ctx,
+                            AOTFuncType *func_type);
+
+LLVMValueRef
+aot_call_llvm_intrinsic(const AOTCompContext *comp_ctx,
+                        const char *name,
+                        LLVMTypeRef ret_type,
+                        LLVMTypeRef *param_types,
+                        int param_count,
+                        ...);
+
+LLVMValueRef
+aot_call_llvm_intrinsic_v(const AOTCompContext *comp_ctx,
+                          const char *name,
+                          LLVMTypeRef ret_type,
+                          LLVMTypeRef *param_types,
+                          int param_count,
+                          va_list param_value_list);
+
+bool
+aot_check_simd_compatibility(const char *arch_c_str, const char *cpu_c_str);
+
 #ifdef __cplusplus
 } /* end of extern "C" */
 #endif

+ 381 - 0
core/iwasm/compilation/simd/simd_access_lanes.c

@@ -0,0 +1,381 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "simd_access_lanes.h"
+#include "simd_common.h"
+#include "../aot_emit_exception.h"
+#include "../../aot/aot_runtime.h"
+
+/*
+ * Build a constant vector of 16 integers.
+ *
+ * element_type   integer type of every lane
+ * element_value  array of 16 lane values; each one is handed to
+ *                LLVMConstInt with the sign-extend flag set
+ *
+ * Returns the constant vector, or NULL after reporting the failing LLVM
+ * call through HANDLE_FAILURE.
+ */
+static LLVMValueRef
+build_intx16_vector(const AOTCompContext *comp_ctx,
+                    const LLVMTypeRef element_type,
+                    const int *element_value)
+{
+    LLVMValueRef vector, elements[16];
+    unsigned i;
+
+    for (i = 0; i < 16; i++) {
+        if (!(elements[i] =
+                LLVMConstInt(element_type, element_value[i], true))) {
+            /* report the API that actually failed */
+            HANDLE_FAILURE("LLVMConstInt");
+            goto fail;
+        }
+    }
+
+    if (!(vector = LLVMConstVector(elements, 16))) {
+        HANDLE_FAILURE("LLVMConstVector");
+        goto fail;
+    }
+
+    return vector;
+fail:
+    return NULL;
+}
+
+/*
+ * Compile a shuffle with a 16-byte immediate lane selector.
+ *
+ * Reads the 16 immediate bytes at frame_ip, pops two v128 operands (the
+ * second operand first), views both as <16 x i8> and emits a shufflevector
+ * whose constant <16 x i32> mask holds the immediate bytes. The result is
+ * cast back to <2 x i64> and pushed as a v128.
+ *
+ * Returns true on success, false on failure.
+ */
+bool
+aot_compile_simd_shuffle(AOTCompContext *comp_ctx,
+                         AOTFuncContext *func_ctx,
+                         const uint8 *frame_ip)
+{
+    LLVMValueRef vec1, vec2, mask, result;
+    uint8 imm[16] = { 0 };
+    int values[16];
+    unsigned i;
+
+    /* NOTE(review): imm is a uint8 array written through uint64 pointers;
+       confirm wasm_runtime_read_v128 tolerates destinations that are not
+       8-byte aligned */
+    wasm_runtime_read_v128(frame_ip, (uint64 *)imm, (uint64 *)(imm + 8));
+    /* widen each selector byte to int, as build_intx16_vector expects */
+    for (i = 0; i < 16; i++) {
+        values[i] = imm[i];
+    }
+
+    if (!(vec2 = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, V128_i8x16_TYPE,
+                                           "vec2"))) {
+        goto fail;
+    }
+
+    if (!(vec1 = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, V128_i8x16_TYPE,
+                                           "vec1"))) {
+        goto fail;
+    }
+
+    /* build a vector <16 x i32> */
+    if (!(mask = build_intx16_vector(comp_ctx, I32_TYPE, values))) {
+        goto fail;
+    }
+
+    if (!(result = LLVMBuildShuffleVector(comp_ctx->builder, vec1, vec2, mask,
+                                          "new_vector"))) {
+        HANDLE_FAILURE("LLVMBuildShuffleVector");
+        goto fail;
+    }
+
+    /* back to the <2 x i64> form in which the result is pushed */
+    if (!(result = LLVMBuildBitCast(comp_ctx->builder, result, V128_i64x2_TYPE,
+                                    "ret"))) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    PUSH_V128(result);
+
+    return true;
+fail:
+    return false;
+}
+
+/*
+ * Compile a swizzle, i.e. a byte shuffle whose lane selector is a runtime
+ * value.
+ *
+ * Pops the selector (mask) and then the source vector, both viewed as
+ * <16 x i8>. Every selector lane that is >= 16 (unsigned compare) is
+ * replaced by 0x80 before the vector and the adjusted selector are passed
+ * to the "llvm.x86.ssse3.pshuf.b.128" intrinsic. The result is cast back
+ * to <2 x i64> and pushed as a v128.
+ *
+ * shufflevector is not an option here, since it requires the mask to be a
+ * constant.
+ *
+ * TODO: instructions for other CPUs (the intrinsic used is x86-specific).
+ *
+ * Returns true on success, false on failure.
+ */
+bool
+aot_compile_simd_swizzle(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    LLVMValueRef vector, mask, max_lanes, condition, mask_lanes, result;
+    LLVMTypeRef param_types[2];
+    int max_lane_id[16] = { 16, 16, 16, 16, 16, 16, 16, 16,
+                            16, 16, 16, 16, 16, 16, 16, 16 },
+        mask_lane_id[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                             0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+
+    if (!(mask = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, V128_i8x16_TYPE,
+                                           "mask"))) {
+        goto fail;
+    }
+
+    if (!(vector = simd_pop_v128_and_bitcast(comp_ctx, func_ctx,
+                                             V128_i8x16_TYPE, "vec"))) {
+        goto fail;
+    }
+
+    /* icmp uge <16 x i8> mask, <16, 16, 16, 16, ...> */
+    if (!(max_lanes = build_intx16_vector(comp_ctx, INT8_TYPE, max_lane_id))) {
+        goto fail;
+    }
+
+    if (!(condition = LLVMBuildICmp(comp_ctx->builder, LLVMIntUGE, mask,
+                                    max_lanes, "compare_with_16"))) {
+        HANDLE_FAILURE("LLVMBuildICmp");
+        goto fail;
+    }
+
+    /* a selector byte with its highest bit set means that lane is not
+       picked from vector */
+    /* select <16 x i1> %condition, <16 x i8> <0x80, 0x80, ...>, <16 x i8> %mask */
+    if (!(mask_lanes =
+            build_intx16_vector(comp_ctx, INT8_TYPE, mask_lane_id))) {
+        goto fail;
+    }
+
+    if (!(mask = LLVMBuildSelect(comp_ctx->builder, condition, mask_lanes,
+                                 mask, "mask"))) {
+        HANDLE_FAILURE("LLVMBuildSelect");
+        goto fail;
+    }
+
+    param_types[0] = V128_i8x16_TYPE;
+    param_types[1] = V128_i8x16_TYPE;
+    if (!(result = aot_call_llvm_intrinsic(
+            comp_ctx, "llvm.x86.ssse3.pshuf.b.128", V128_i8x16_TYPE,
+            param_types, 2, vector, mask))) {
+        HANDLE_FAILURE("LLVMBuildCall");
+        goto fail;
+    }
+
+    /* back to the <2 x i64> form in which the result is pushed */
+    if (!(result = LLVMBuildBitCast(comp_ctx->builder, result, V128_i64x2_TYPE,
+                                    "ret"))) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    PUSH_V128(result);
+
+    return true;
+fail:
+    return false;
+}
+
+/*
+ * Shared implementation of the extract-lane compilers.
+ *
+ * Pops a v128, views it as vector_type and extracts lane lane_id. When
+ * need_extend is set the element is widened to result_type, sign-extended
+ * if is_signed is set and zero-extended otherwise. The value is then pushed
+ * with aot_value_type as its stack type.
+ *
+ * Returns true on success, false on failure.
+ */
+static bool
+aot_compile_simd_extract(AOTCompContext *comp_ctx,
+                         AOTFuncContext *func_ctx,
+                         uint8 lane_id,
+                         bool need_extend,
+                         bool is_signed,
+                         LLVMTypeRef vector_type,
+                         LLVMTypeRef result_type,
+                         unsigned aot_value_type)
+{
+    LLVMValueRef vector, idx, result;
+
+    if (!(idx = I8_CONST(lane_id))) {
+        HANDLE_FAILURE("LLVMConstInt");
+        goto fail;
+    }
+
+    /* bitcast <2 x i64> %0 to <vector_type> */
+    if (!(vector = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type,
+                                             "vec"))) {
+        goto fail;
+    }
+
+    /* extractelement <vector_type> %vector, i8 lane_id*/
+    if (!(result = LLVMBuildExtractElement(comp_ctx->builder, vector, idx,
+                                           "element"))) {
+        HANDLE_FAILURE("LLVMBuildExtractElement");
+        goto fail;
+    }
+
+    if (need_extend) {
+        if (is_signed) {
+            /* sext <element_type> %element to <result_type> */
+            if (!(result = LLVMBuildSExt(comp_ctx->builder, result,
+                                         result_type, "ret"))) {
+                HANDLE_FAILURE("LLVMBuildSExt");
+                goto fail;
+            }
+        }
+        else {
+            /* zext <element_type> %element to <result_type> */
+            if (!(result = LLVMBuildZExt(comp_ctx->builder, result,
+                                         result_type, "ret"))) {
+                HANDLE_FAILURE("LLVMBuildZExt");
+                goto fail;
+            }
+        }
+    }
+
+    PUSH(result, aot_value_type);
+
+    return true;
+fail:
+    return false;
+}
+
+/* Extract lane lane_id of a v128 viewed as <16 x i8> and push it as an i32,
+   sign- or zero-extended according to is_signed. */
+bool
+aot_compile_simd_extract_i8x16(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id,
+                               bool is_signed)
+{
+    return aot_compile_simd_extract(comp_ctx, func_ctx, lane_id, true,
+                                    is_signed, V128_i8x16_TYPE, I32_TYPE,
+                                    VALUE_TYPE_I32);
+}
+
+/* Extract lane lane_id of a v128 viewed as <8 x i16> and push it as an i32,
+   sign- or zero-extended according to is_signed. */
+bool
+aot_compile_simd_extract_i16x8(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id,
+                               bool is_signed)
+{
+    return aot_compile_simd_extract(comp_ctx, func_ctx, lane_id, true,
+                                    is_signed, V128_i16x8_TYPE, I32_TYPE,
+                                    VALUE_TYPE_I32);
+}
+
+/* Extract lane lane_id of a v128 viewed as <4 x i32> and push it as an i32
+   (no extension needed). */
+bool
+aot_compile_simd_extract_i32x4(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id)
+{
+    return aot_compile_simd_extract(comp_ctx, func_ctx, lane_id, false, false,
+                                    V128_i32x4_TYPE, I32_TYPE, VALUE_TYPE_I32);
+}
+
+/* Extract lane lane_id of a v128 viewed as <2 x i64> and push it as an i64
+   (no extension needed). */
+bool
+aot_compile_simd_extract_i64x2(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id)
+{
+    return aot_compile_simd_extract(comp_ctx, func_ctx, lane_id, false, false,
+                                    V128_i64x2_TYPE, I64_TYPE, VALUE_TYPE_I64);
+}
+
+/* Extract lane lane_id of a v128 viewed as <4 x float> and push it as an
+   f32. */
+bool
+aot_compile_simd_extract_f32x4(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id)
+{
+    return aot_compile_simd_extract(comp_ctx, func_ctx, lane_id, false, false,
+                                    V128_f32x4_TYPE, F32_TYPE, VALUE_TYPE_F32);
+}
+
+/* Extract lane lane_id of a v128 viewed as <2 x double> and push it as an
+   f64. */
+bool
+aot_compile_simd_extract_f64x2(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id)
+{
+    return aot_compile_simd_extract(comp_ctx, func_ctx, lane_id, false, false,
+                                    V128_f64x2_TYPE, F64_TYPE, VALUE_TYPE_F64);
+}
+
+/*
+ * Shared implementation of the replace-lane compilers.
+ *
+ * Pops the new scalar (stack type new_value_type) and then the v128, which
+ * is viewed as vector_type. When need_reduce is set the scalar is truncated
+ * to element_type first. The scalar is inserted at lane lane_id and the
+ * resulting vector is cast back to <2 x i64> and pushed as a v128.
+ *
+ * Returns true on success, false on failure.
+ */
+static bool
+aot_compile_simd_replace(AOTCompContext *comp_ctx,
+                         AOTFuncContext *func_ctx,
+                         uint8 lane_id,
+                         unsigned new_value_type,
+                         LLVMTypeRef vector_type,
+                         bool need_reduce,
+                         LLVMTypeRef element_type)
+{
+    LLVMValueRef vector, new_value, idx, result;
+
+    POP(new_value, new_value_type);
+
+    if (!(idx = I8_CONST(lane_id))) {
+        HANDLE_FAILURE("LLVMConstInt");
+        goto fail;
+    }
+
+    /* bitcast <2 x i64> %0 to <vector_type> */
+
+    if (!(vector = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type,
+                                             "vec"))) {
+        goto fail;
+    }
+
+    /* trunc <new_value_type> to <element_type> */
+    if (need_reduce) {
+        if (!(new_value = LLVMBuildTrunc(comp_ctx->builder, new_value,
+                                         element_type, "element"))) {
+            HANDLE_FAILURE("LLVMBuildTrunc");
+            goto fail;
+        }
+    }
+
+    /* insertelement <vector_type> %vector, <element_type>  %element, i8 idx */
+    if (!(result = LLVMBuildInsertElement(comp_ctx->builder, vector, new_value,
+                                          idx, "new_vector"))) {
+        HANDLE_FAILURE("LLVMBuildInsertElement");
+        goto fail;
+    }
+
+    /* bitcast <vector_type> %result to <2 x i64> */
+    if (!(result = LLVMBuildBitCast(comp_ctx->builder, result, V128_i64x2_TYPE,
+                                    "ret"))) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    PUSH_V128(result);
+
+    return true;
+fail:
+    return false;
+}
+
+/* Replace lane lane_id of a v128 viewed as <16 x i8> with a popped i32
+   truncated to 8 bits. */
+bool
+aot_compile_simd_replace_i8x16(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id)
+{
+    return aot_compile_simd_replace(comp_ctx, func_ctx, lane_id,
+                                    VALUE_TYPE_I32, V128_i8x16_TYPE, true,
+                                    INT8_TYPE);
+}
+
+/* Replace lane lane_id of a v128 viewed as <8 x i16> with a popped i32
+   truncated to 16 bits. */
+bool
+aot_compile_simd_replace_i16x8(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id)
+{
+    return aot_compile_simd_replace(comp_ctx, func_ctx, lane_id,
+                                    VALUE_TYPE_I32, V128_i16x8_TYPE, true,
+                                    INT16_TYPE);
+}
+
+/* Replace lane lane_id of a v128 viewed as <4 x i32> with a popped i32. */
+bool
+aot_compile_simd_replace_i32x4(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id)
+{
+    return aot_compile_simd_replace(comp_ctx, func_ctx, lane_id,
+                                    VALUE_TYPE_I32, V128_i32x4_TYPE, false,
+                                    I32_TYPE);
+}
+
+/* Replace lane lane_id of a v128 viewed as <2 x i64> with a popped i64. */
+bool
+aot_compile_simd_replace_i64x2(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id)
+{
+    return aot_compile_simd_replace(comp_ctx, func_ctx, lane_id,
+                                    VALUE_TYPE_I64, V128_i64x2_TYPE, false,
+                                    I64_TYPE);
+}
+
+/* Replace lane lane_id of a v128 viewed as <4 x float> with a popped f32. */
+bool
+aot_compile_simd_replace_f32x4(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id)
+{
+    return aot_compile_simd_replace(comp_ctx, func_ctx, lane_id,
+                                    VALUE_TYPE_F32, V128_f32x4_TYPE, false,
+                                    F32_TYPE);
+}
+
+/* Replace lane lane_id of a v128 viewed as <2 x double> with a popped f64. */
+bool
+aot_compile_simd_replace_f64x2(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id)
+{
+    return aot_compile_simd_replace(comp_ctx, func_ctx, lane_id,
+                                    VALUE_TYPE_F64, V128_f64x2_TYPE, false,
+                                    F64_TYPE);
+}

+ 89 - 0
core/iwasm/compilation/simd/simd_access_lanes.h

@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _SIMD_ACCESS_LANES_H_
+#define _SIMD_ACCESS_LANES_H_
+
+#include "../aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+bool
+aot_compile_simd_shuffle(AOTCompContext *comp_ctx,
+                         AOTFuncContext *func_ctx,
+                         const uint8 *frame_ip);
+
+bool
+aot_compile_simd_swizzle(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_extract_i8x16(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id,
+                               bool is_signed);
+
+bool
+aot_compile_simd_extract_i16x8(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id,
+                               bool is_signed);
+
+bool
+aot_compile_simd_extract_i32x4(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id);
+
+bool
+aot_compile_simd_extract_i64x2(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id);
+
+bool
+aot_compile_simd_extract_f32x4(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id);
+
+bool
+aot_compile_simd_extract_f64x2(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id);
+
+bool
+aot_compile_simd_replace_i8x16(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id);
+
+bool
+aot_compile_simd_replace_i16x8(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id);
+
+bool
+aot_compile_simd_replace_i32x4(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id);
+
+bool
+aot_compile_simd_replace_i64x2(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id);
+
+bool
+aot_compile_simd_replace_f32x4(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id);
+
+bool
+aot_compile_simd_replace_f64x2(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               uint8 lane_id);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* end of _SIMD_ACCESS_LANES_H_ */

+ 164 - 0
core/iwasm/compilation/simd/simd_bit_shifts.c

@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "simd_bit_shifts.h"
+#include "simd_common.h"
+#include "../aot_emit_exception.h"
+#include "../../aot/aot_runtime.h"
+
+/*
+ * Shared implementation of the per-lane integer shift compilers.
+ *
+ * Pops the i32 shift count and then the v128, which is viewed as
+ * vector_type. The count is reduced modulo lane_width (unsigned), converted
+ * to element_type (zero-extended for I64_TYPE, truncated or left as is
+ * otherwise) and broadcast to every lane by inserting it at lane 0 and
+ * shuffling with an all-zero mask. The vector is then shifted lane-wise
+ * according to shift_op, cast back to <2 x i64> and pushed as a v128.
+ *
+ * shift_op      INT_SHL, INT_SHR_S (arithmetic) or INT_SHR_U (logical)
+ * vector_type   vector type the popped v128 is viewed as
+ * element_type  lane type of vector_type
+ * lane_width    lane width in bits; 128 / lane_width gives the lane count
+ *
+ * Returns true on success, false on failure.
+ */
+static bool
+simd_shift(AOTCompContext *comp_ctx,
+           AOTFuncContext *func_ctx,
+           IntShift shift_op,
+           LLVMTypeRef vector_type,
+           LLVMTypeRef element_type,
+           unsigned lane_width)
+{
+    LLVMValueRef vector, offset, width, undef, zeros, result;
+    LLVMTypeRef zeros_type;
+
+    POP_I32(offset);
+
+    if (!(vector = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type,
+                                             "vec"))) {
+        goto fail;
+    }
+
+    if (!(width = LLVMConstInt(I32_TYPE, lane_width, true))) {
+        HANDLE_FAILURE("LLVMConstInt");
+        goto fail;
+    }
+
+    /* keep the shift count below the lane width */
+    if (!(offset =
+            LLVMBuildURem(comp_ctx->builder, offset, width, "remainder"))) {
+        HANDLE_FAILURE("LLVMBuildURem");
+        goto fail;
+    }
+
+    /* bring the i32 count to the lane type */
+    if (I64_TYPE == element_type) {
+        if (!(offset = LLVMBuildZExt(comp_ctx->builder, offset, element_type,
+                                     "offset_scalar"))) {
+            HANDLE_FAILURE("LLVMBuildZExt");
+            goto fail;
+        }
+    }
+    else {
+        if (!(offset = LLVMBuildTruncOrBitCast(
+                comp_ctx->builder, offset, element_type, "offset_scalar"))) {
+            HANDLE_FAILURE("LLVMBuildTruncOrBitCast");
+            goto fail;
+        }
+    }
+
+    /* create a vector with offset */
+    if (!(undef = LLVMGetUndef(vector_type))) {
+        HANDLE_FAILURE("LLVMGetUndef");
+        goto fail;
+    }
+
+    /* shuffle mask: one i32 index per lane, all zero, so that every result
+       lane takes lane 0 of the first operand */
+    if (!(zeros_type = LLVMVectorType(I32_TYPE, 128 / lane_width))) {
+        HANDLE_FAILURE("LLVMVectorType");
+        goto fail;
+    }
+
+    if (!(zeros = LLVMConstNull(zeros_type))) {
+        HANDLE_FAILURE("LLVMConstNull");
+        goto fail;
+    }
+
+    if (!(offset = LLVMBuildInsertElement(comp_ctx->builder, undef, offset,
+                                          I32_ZERO, "base_vector"))) {
+        HANDLE_FAILURE("LLVMBuildInsertElement");
+        goto fail;
+    }
+
+    if (!(offset = LLVMBuildShuffleVector(comp_ctx->builder, offset, undef,
+                                          zeros, "offset_vector"))) {
+        HANDLE_FAILURE("LLVMBuildShuffleVector");
+        goto fail;
+    }
+
+    switch (shift_op) {
+        case INT_SHL:
+        {
+            if (!(result =
+                    LLVMBuildShl(comp_ctx->builder, vector, offset, "shl"))) {
+                HANDLE_FAILURE("LLVMBuildShl");
+                goto fail;
+            }
+            break;
+        }
+        case INT_SHR_S:
+        {
+            if (!(result = LLVMBuildAShr(comp_ctx->builder, vector, offset,
+                                         "ashr"))) {
+                HANDLE_FAILURE("LLVMBuildAShr");
+                goto fail;
+            }
+            break;
+        }
+        case INT_SHR_U:
+        {
+            if (!(result = LLVMBuildLShr(comp_ctx->builder, vector, offset,
+                                         "lshr"))) {
+                HANDLE_FAILURE("LLVMBuildLShr");
+                goto fail;
+            }
+            break;
+        }
+        default:
+        {
+            bh_assert(0);
+            goto fail;
+        }
+    }
+
+    /* back to the <2 x i64> form in which the result is pushed */
+    if (!(result = LLVMBuildBitCast(comp_ctx->builder, result, V128_i64x2_TYPE,
+                                    "result"))) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    PUSH_V128(result);
+    return true;
+fail:
+    return false;
+}
+
+/* Shift every 8-bit lane of a popped v128 by the popped i32 count (taken
+   modulo 8) according to shift_op. */
+bool
+aot_compile_simd_i8x16_shift(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             IntShift shift_op)
+{
+    return simd_shift(comp_ctx, func_ctx, shift_op, V128_i8x16_TYPE, INT8_TYPE,
+                      8);
+}
+
+/* Shift every 16-bit lane of a popped v128 by the popped i32 count (taken
+   modulo 16) according to shift_op. */
+bool
+aot_compile_simd_i16x8_shift(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             IntShift shift_op)
+{
+    return simd_shift(comp_ctx, func_ctx, shift_op, V128_i16x8_TYPE,
+                      INT16_TYPE, 16);
+}
+
+/* Shift every 32-bit lane of a popped v128 by the popped i32 count (taken
+   modulo 32) according to shift_op. */
+bool
+aot_compile_simd_i32x4_shift(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             IntShift shift_op)
+{
+    return simd_shift(comp_ctx, func_ctx, shift_op, V128_i32x4_TYPE, I32_TYPE,
+                      32);
+}
+
+/* Shift every 64-bit lane of a popped v128 by the popped i32 count (taken
+   modulo 64) according to shift_op. */
+bool
+aot_compile_simd_i64x2_shift(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             IntShift shift_op)
+{
+    return simd_shift(comp_ctx, func_ctx, shift_op, V128_i64x2_TYPE, I64_TYPE,
+                      64);
+}

+ 39 - 0
core/iwasm/compilation/simd/simd_bit_shifts.h

@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _SIMD_BIT_SHIFTS_H_
+#define _SIMD_BIT_SHIFTS_H_
+
+#include "../aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+bool
+aot_compile_simd_i8x16_shift(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             IntShift shift_op);
+
+bool
+aot_compile_simd_i16x8_shift(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             IntShift shift_op);
+
+bool
+aot_compile_simd_i32x4_shift(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             IntShift shift_op);
+
+bool
+aot_compile_simd_i64x2_shift(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             IntShift shift_op);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* end of _SIMD_BIT_SHIFTS_H_ */

+ 109 - 0
core/iwasm/compilation/simd/simd_bitmask_extracts.c

@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "simd_bitmask_extracts.h"
+#include "simd_common.h"
+#include "../aot_emit_exception.h"
+#include "../../aot/aot_runtime.h"
+
+/*
+ * Shared implementation of the bitmask compilers.
+ *
+ * Pops a v128, views it as vector_type and sign-extends every lane to i32.
+ * Lanes that compare less than zero select the constant (1 << lane index),
+ * all other lanes select zero; the resulting <length x i32> vector is
+ * passed to the reduction intrinsic named by `intrinsic`, whose i32 result
+ * is pushed.
+ *
+ * length        number of lanes in vector_type; at most 16, the size of the
+ *               local constant table
+ * element_type  not used by this function
+ * intrinsic     name of the LLVM intrinsic taking <length x i32> and
+ *               returning i32
+ *
+ * Returns true on success, false on failure.
+ */
+static bool
+simd_build_bitmask(const AOTCompContext *comp_ctx,
+                   const AOTFuncContext *func_ctx,
+                   uint8 length,
+                   LLVMTypeRef vector_type,
+                   LLVMTypeRef element_type,
+                   const char *intrinsic)
+{
+    LLVMValueRef vector, zeros, mask, mask_elements[16], cond, result;
+    LLVMTypeRef param_types[1], vector_ext_type;
+    /* numbers[i] has only bit i set */
+    const uint32 numbers[16] = { 0x1,    0x2,    0x4,    0x8,   0x10,  0x20,
+                                 0x40,   0x80,   0x100,  0x200, 0x400, 0x800,
+                                 0x1000, 0x2000, 0x4000, 0x8000 };
+    uint8 i;
+
+    if (!(vector = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type,
+                                             "vec"))) {
+        goto fail;
+    }
+
+    if (!(vector_ext_type = LLVMVectorType(I32_TYPE, length))) {
+        HANDLE_FAILURE("LLVMVectorType");
+        goto fail;
+    }
+
+    /* widen every lane to i32, keeping its sign */
+    if (!(vector = LLVMBuildSExt(comp_ctx->builder, vector, vector_ext_type,
+                                 "vec_ext"))) {
+        HANDLE_FAILURE("LLVMBuildSExt");
+        goto fail;
+    }
+
+    if (!(zeros = LLVMConstNull(vector_ext_type))) {
+        HANDLE_FAILURE("LLVMConstNull");
+        goto fail;
+    }
+
+    /* all 16 constants are created; only the first `length` are used */
+    for (i = 0; i < 16; i++) {
+        if (!(mask_elements[i] = LLVMConstInt(I32_TYPE, numbers[i], false))) {
+            HANDLE_FAILURE("LLVMConstInt");
+            goto fail;
+        }
+    }
+
+    if (!(mask = LLVMConstVector(mask_elements, length))) {
+        HANDLE_FAILURE("LLVMConstVector");
+        goto fail;
+    }
+
+    /* a lane is negative exactly when its sign bit is set */
+    if (!(cond = LLVMBuildICmp(comp_ctx->builder, LLVMIntSLT, vector, zeros,
+                               "lt_zero"))) {
+        HANDLE_FAILURE("LLVMBuildICmp");
+        goto fail;
+    }
+
+    if (!(result =
+            LLVMBuildSelect(comp_ctx->builder, cond, mask, zeros, "select"))) {
+        HANDLE_FAILURE("LLVMBuildSelect");
+        goto fail;
+    }
+
+    /* combine the per-lane bits into one i32 */
+    param_types[0] = vector_ext_type;
+    if (!(result = aot_call_llvm_intrinsic(comp_ctx, intrinsic, I32_TYPE,
+                                           param_types, 1, result))) {
+        HANDLE_FAILURE("LLVMBuildCall");
+        goto fail;
+    }
+
+    PUSH_I32(result);
+
+    return true;
+fail:
+    return false;
+}
+
+/* Push an i32 whose bit i is set when lane i of the popped v128, viewed as
+   <16 x i8>, is negative; uses the v16i32 OR-reduction intrinsic. */
+bool
+aot_compile_simd_i8x16_bitmask(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx)
+{
+    return simd_build_bitmask(comp_ctx, func_ctx, 16, V128_i8x16_TYPE,
+                              INT8_TYPE,
+                              "llvm.experimental.vector.reduce.or.v16i32");
+}
+
+/* Push an i32 whose bit i is set when lane i of the popped v128, viewed as
+   <8 x i16>, is negative; uses the v8i32 OR-reduction intrinsic. */
+bool
+aot_compile_simd_i16x8_bitmask(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx)
+{
+    return simd_build_bitmask(comp_ctx, func_ctx, 8, V128_i16x8_TYPE,
+                              INT16_TYPE,
+                              "llvm.experimental.vector.reduce.or.v8i32");
+}
+
+/* Push an i32 whose bit i is set when lane i of the popped v128, viewed as
+   <4 x i32>, is negative; uses the v4i32 OR-reduction intrinsic. */
+bool
+aot_compile_simd_i32x4_bitmask(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx)
+{
+    return simd_build_bitmask(comp_ctx, func_ctx, 4, V128_i32x4_TYPE, I32_TYPE,
+                              "llvm.experimental.vector.reduce.or.v4i32");
+}

+ 29 - 0
core/iwasm/compilation/simd/simd_bitmask_extracts.h

@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _SIMD_BITMASK_EXTRACTS_H_
+#define _SIMD_BITMASK_EXTRACTS_H_
+
+#include "../aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Compile the bitmask extraction opcode for 8-bit lanes. */
+bool
+aot_compile_simd_i8x16_bitmask(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx);
+
+/* Compile the bitmask extraction opcode for 16-bit lanes. */
+bool
+aot_compile_simd_i16x8_bitmask(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx);
+
+/* Compile the bitmask extraction opcode for 32-bit lanes. */
+bool
+aot_compile_simd_i32x4_bitmask(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* end of _SIMD_BITMASK_EXTRACTS_H_ */
+

+ 146 - 0
core/iwasm/compilation/simd/simd_bitwise_ops.c

@@ -0,0 +1,146 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "simd_bitwise_ops.h"
+#include "../aot_emit_exception.h"
+#include "../../aot/aot_runtime.h"
+
+/*
+ * Emit a two-operand v128 bitwise operation.
+ *
+ * Pops vector2 (right-hand operand) and then vector1 (left-hand operand),
+ * builds the LLVM instruction(s) selected by bitwise_op and pushes the
+ * resulting v128:
+ *   V128_AND    -> vector1 & vector2
+ *   V128_OR     -> vector1 | vector2
+ *   V128_XOR    -> vector1 ^ vector2
+ *   V128_ANDNOT -> vector1 & ~vector2
+ *
+ * Returns true on success and false otherwise. Each failing LLVM builder
+ * call is reported via HANDLE_FAILURE under its own name.
+ */
+static bool
+v128_bitwise_two_component(AOTCompContext *comp_ctx,
+                           AOTFuncContext *func_ctx,
+                           V128Bitwise bitwise_op)
+{
+    LLVMValueRef vector1, vector2, result;
+
+    POP_V128(vector2);
+    POP_V128(vector1);
+
+    switch (bitwise_op) {
+        case V128_AND:
+            if (!(result = LLVMBuildAnd(comp_ctx->builder, vector1, vector2,
+                                        "and"))) {
+                HANDLE_FAILURE("LLVMBuildAnd");
+                goto fail;
+            }
+            break;
+        case V128_OR:
+            if (!(result =
+                    LLVMBuildOr(comp_ctx->builder, vector1, vector2, "or"))) {
+                HANDLE_FAILURE("LLVMBuildOr");
+                goto fail;
+            }
+            break;
+        case V128_XOR:
+            if (!(result = LLVMBuildXor(comp_ctx->builder, vector1, vector2,
+                                        "xor"))) {
+                HANDLE_FAILURE("LLVMBuildXor");
+                goto fail;
+            }
+            break;
+        case V128_ANDNOT:
+        {
+            /* v128.and(a, v128.not(b)) */
+            if (!(vector2 = LLVMBuildNot(comp_ctx->builder, vector2, "not"))) {
+                HANDLE_FAILURE("LLVMBuildNot");
+                goto fail;
+            }
+
+            if (!(result = LLVMBuildAnd(comp_ctx->builder, vector1, vector2,
+                                        "and"))) {
+                HANDLE_FAILURE("LLVMBuildAnd");
+                goto fail;
+            }
+
+            break;
+        }
+        default:
+            /* callers only dispatch the four opcodes handled above */
+            bh_assert(0);
+            goto fail;
+    }
+
+    PUSH_V128(result);
+    return true;
+fail:
+    return false;
+}
+
+/* v128.not: pops a v128, builds its bitwise complement and pushes it. */
+static bool
+v128_bitwise_not(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    LLVMValueRef operand, inverted;
+
+    POP_V128(operand);
+
+    inverted = LLVMBuildNot(comp_ctx->builder, operand, "not");
+    if (!inverted) {
+        HANDLE_FAILURE("LLVMBuildNot");
+        goto fail;
+    }
+
+    PUSH_V128(inverted);
+    return true;
+fail:
+    return false;
+}
+
+/*
+ * v128.bitselect:
+ *   v128.or(v128.and(v1, c), v128.and(v2, v128.not(c)))
+ *
+ * Pops the mask c (vector3), then v2 (vector2), then v1 (vector1), builds
+ * the expression above and pushes the resulting v128.
+ *
+ * Returns true on success and false otherwise. Each failing LLVM builder
+ * call is reported via HANDLE_FAILURE under its own name.
+ */
+static bool
+v128_bitwise_bit_select(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    LLVMValueRef vector1, vector2, vector3, result;
+
+    POP_V128(vector3);
+    POP_V128(vector2);
+    POP_V128(vector1);
+
+    /* v1 & c */
+    if (!(vector1 =
+            LLVMBuildAnd(comp_ctx->builder, vector1, vector3, "a_and_c"))) {
+        HANDLE_FAILURE("LLVMBuildAnd");
+        goto fail;
+    }
+
+    /* ~c */
+    if (!(vector3 = LLVMBuildNot(comp_ctx->builder, vector3, "not_c"))) {
+        HANDLE_FAILURE("LLVMBuildNot");
+        goto fail;
+    }
+
+    /* v2 & ~c (vector3 now holds the inverted mask) */
+    if (!(vector2 =
+            LLVMBuildAnd(comp_ctx->builder, vector2, vector3, "b_and_c"))) {
+        HANDLE_FAILURE("LLVMBuildAnd");
+        goto fail;
+    }
+
+    /* (v1 & c) | (v2 & ~c) */
+    if (!(result =
+            LLVMBuildOr(comp_ctx->builder, vector1, vector2, "a_or_b"))) {
+        HANDLE_FAILURE("LLVMBuildOr");
+        goto fail;
+    }
+
+    PUSH_V128(result);
+
+    return true;
+fail:
+    return false;
+}
+
+/*
+ * Entry point for the v128 bitwise opcodes: routes bitwise_op to the
+ * unary (not), ternary (bitselect) or binary (and/or/xor/andnot) emitter.
+ * An unknown opcode trips bh_assert and yields false.
+ */
+bool
+aot_compile_simd_v128_bitwise(AOTCompContext *comp_ctx,
+                              AOTFuncContext *func_ctx,
+                              V128Bitwise bitwise_op)
+{
+    if (bitwise_op == V128_NOT) {
+        return v128_bitwise_not(comp_ctx, func_ctx);
+    }
+
+    if (bitwise_op == V128_BITSELECT) {
+        return v128_bitwise_bit_select(comp_ctx, func_ctx);
+    }
+
+    if (bitwise_op == V128_AND || bitwise_op == V128_OR
+        || bitwise_op == V128_XOR || bitwise_op == V128_ANDNOT) {
+        return v128_bitwise_two_component(comp_ctx, func_ctx, bitwise_op);
+    }
+
+    bh_assert(0);
+    return false;
+}

+ 24 - 0
core/iwasm/compilation/simd/simd_bitwise_ops.h

@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _SIMD_BITWISE_OPS_H_
+#define _SIMD_BITWISE_OPS_H_
+
+#include "../aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Compile a v128 bitwise opcode (and, or, xor, andnot, not, bitselect)
+ * selected by bitwise_op.
+ *
+ * Returns true on success, false on failure.
+ */
+bool
+aot_compile_simd_v128_bitwise(AOTCompContext *comp_ctx,
+                              AOTFuncContext *func_ctx,
+                              V128Bitwise bitwise_op);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* end of _SIMD_BITWISE_OPS_H_ */

+ 183 - 0
core/iwasm/compilation/simd/simd_bool_reductions.c

@@ -0,0 +1,183 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "simd_bool_reductions.h"
+#include "simd_common.h"
+#include "../aot_emit_exception.h"
+#include "../../aot/aot_runtime.h"
+
+/*
+ * Shared lowering for the any_true opcodes.
+ *
+ * Pops a v128 and views it as vector_type, flags every lane that differs
+ * from zero, reduces the flags to one element_type scalar through the
+ * given intrinsic, and pushes an i32 that is 1 when that scalar is
+ * non-zero and 0 otherwise.
+ */
+static bool
+simd_any_true(AOTCompContext *comp_ctx,
+              AOTFuncContext *func_ctx,
+              LLVMTypeRef vector_type,
+              LLVMTypeRef element_type,
+              const char *intrinsic)
+{
+    LLVMValueRef lanes, zero_vector, lane_flags, reduced, zero_scalar, answer;
+
+    lanes =
+      simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "vec");
+    if (!lanes) {
+        goto fail;
+    }
+
+    zero_vector = LLVMConstNull(vector_type);
+    if (!zero_vector) {
+        HANDLE_FAILURE("LLVMConstNull");
+        goto fail;
+    }
+
+    /* icmp ne <N x iX> %vector, zeroinitializer */
+    lane_flags = LLVMBuildICmp(comp_ctx->builder, LLVMIntNE, lanes,
+                               zero_vector, "non_zero");
+    if (!lane_flags) {
+        HANDLE_FAILURE("LLVMBuildICmp");
+        goto fail;
+    }
+
+    /* zext <N x i1> to <N x iX> */
+    lane_flags = LLVMBuildZExt(comp_ctx->builder, lane_flags, vector_type,
+                               "non_zero_ex");
+    if (!lane_flags) {
+        HANDLE_FAILURE("LLVMBuildZExt");
+        goto fail;
+    }
+
+    /* collapse the per-lane flags into a single scalar */
+    reduced = aot_call_llvm_intrinsic(comp_ctx, intrinsic, element_type,
+                                      &vector_type, 1, lane_flags);
+    if (!reduced) {
+        HANDLE_FAILURE("LLVMBuildCall");
+        goto fail;
+    }
+
+    zero_scalar = LLVMConstNull(element_type);
+    if (!zero_scalar) {
+        HANDLE_FAILURE("LLVMConstNull");
+        goto fail;
+    }
+
+    /* any lane set <=> reduced scalar != 0 */
+    answer = LLVMBuildICmp(comp_ctx->builder, LLVMIntNE, reduced,
+                           zero_scalar, "gt_zero");
+    if (!answer) {
+        HANDLE_FAILURE("LLVMBuildICmp");
+        goto fail;
+    }
+
+    /* widen the i1 outcome to the i32 pushed below */
+    answer = LLVMBuildZExt(comp_ctx->builder, answer, I32_TYPE, "ret");
+    if (!answer) {
+        HANDLE_FAILURE("LLVMBuildZExt");
+        goto fail;
+    }
+
+    PUSH_I32(answer);
+
+    return true;
+fail:
+    return false;
+}
+
+/* i8x16.any_true: simd_any_true() over 16 lanes of i8. */
+bool
+aot_compile_simd_i8x16_any_true(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx)
+{
+    const char *reduce_intrinsic =
+      "llvm.experimental.vector.reduce.add.v16i8";
+
+    return simd_any_true(comp_ctx, func_ctx, V128_i8x16_TYPE, INT8_TYPE,
+                         reduce_intrinsic);
+}
+
+/* i16x8.any_true: simd_any_true() over 8 lanes of i16. */
+bool
+aot_compile_simd_i16x8_any_true(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx)
+{
+    const char *reduce_intrinsic =
+      "llvm.experimental.vector.reduce.add.v8i16";
+
+    return simd_any_true(comp_ctx, func_ctx, V128_i16x8_TYPE, INT16_TYPE,
+                         reduce_intrinsic);
+}
+
+/* i32x4.any_true: simd_any_true() over 4 lanes of i32. */
+bool
+aot_compile_simd_i32x4_any_true(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx)
+{
+    const char *reduce_intrinsic =
+      "llvm.experimental.vector.reduce.add.v4i32";
+
+    return simd_any_true(comp_ctx, func_ctx, V128_i32x4_TYPE, I32_TYPE,
+                         reduce_intrinsic);
+}
+
+/*
+ * Shared lowering for the all_true opcodes.
+ *
+ * Pops a v128 and views it as vector_type, flags every lane that equals
+ * zero, reduces the flags to one element_type scalar through the given
+ * intrinsic, and pushes an i32 that is 1 when that scalar is zero (no
+ * lane was flagged) and 0 otherwise.
+ */
+static bool
+simd_all_true(AOTCompContext *comp_ctx,
+              AOTFuncContext *func_ctx,
+              LLVMTypeRef vector_type,
+              LLVMTypeRef element_type,
+              const char *intrinsic)
+{
+    LLVMValueRef lanes, zero_vector, lane_flags, reduced, zero_scalar, answer;
+
+    lanes =
+      simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "vec");
+    if (!lanes) {
+        goto fail;
+    }
+
+    zero_vector = LLVMConstNull(vector_type);
+    if (!zero_vector) {
+        HANDLE_FAILURE("LLVMConstNull");
+        goto fail;
+    }
+
+    /* icmp eq <N x iX> %vector, zeroinitializer */
+    lane_flags = LLVMBuildICmp(comp_ctx->builder, LLVMIntEQ, lanes,
+                               zero_vector, "is_zero");
+    if (!lane_flags) {
+        HANDLE_FAILURE("LLVMBuildICmp");
+        goto fail;
+    }
+
+    /* zext <N x i1> to <N x iX> */
+    lane_flags = LLVMBuildZExt(comp_ctx->builder, lane_flags, vector_type,
+                               "is_zero_ex");
+    if (!lane_flags) {
+        HANDLE_FAILURE("LLVMBuildZExt");
+        goto fail;
+    }
+
+    /* collapse the per-lane flags into a single scalar */
+    reduced = aot_call_llvm_intrinsic(comp_ctx, intrinsic, element_type,
+                                      &vector_type, 1, lane_flags);
+    if (!reduced) {
+        HANDLE_FAILURE("LLVMBuildCall");
+        goto fail;
+    }
+
+    zero_scalar = LLVMConstNull(element_type);
+    if (!zero_scalar) {
+        HANDLE_FAILURE("LLVMConstNull");
+        goto fail;
+    }
+
+    /* all lanes true <=> no lane was flagged as zero */
+    answer = LLVMBuildICmp(comp_ctx->builder, LLVMIntEQ, reduced,
+                           zero_scalar, "none");
+    if (!answer) {
+        HANDLE_FAILURE("LLVMBuildICmp");
+        goto fail;
+    }
+
+    /* widen the i1 outcome to the i32 pushed below */
+    answer = LLVMBuildZExt(comp_ctx->builder, answer, I32_TYPE, "ret");
+    if (!answer) {
+        HANDLE_FAILURE("LLVMBuildZExt");
+        goto fail;
+    }
+
+    PUSH_I32(answer);
+
+    return true;
+fail:
+    return false;
+}
+
+/* i8x16.all_true: simd_all_true() over 16 lanes of i8. */
+bool
+aot_compile_simd_i8x16_all_true(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx)
+{
+    const char *reduce_intrinsic =
+      "llvm.experimental.vector.reduce.add.v16i8";
+
+    return simd_all_true(comp_ctx, func_ctx, V128_i8x16_TYPE, INT8_TYPE,
+                         reduce_intrinsic);
+}
+
+/* i16x8.all_true: simd_all_true() over 8 lanes of i16. */
+bool
+aot_compile_simd_i16x8_all_true(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx)
+{
+    const char *reduce_intrinsic =
+      "llvm.experimental.vector.reduce.add.v8i16";
+
+    return simd_all_true(comp_ctx, func_ctx, V128_i16x8_TYPE, INT16_TYPE,
+                         reduce_intrinsic);
+}
+
+/* i32x4.all_true: simd_all_true() over 4 lanes of i32. */
+bool
+aot_compile_simd_i32x4_all_true(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx)
+{
+    const char *reduce_intrinsic =
+      "llvm.experimental.vector.reduce.add.v4i32";
+
+    return simd_all_true(comp_ctx, func_ctx, V128_i32x4_TYPE, I32_TYPE,
+                         reduce_intrinsic);
+}

+ 43 - 0
core/iwasm/compilation/simd/simd_bool_reductions.h

@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _SIMD_BOOL_REDUCTIONS_H_
+#define _SIMD_BOOL_REDUCTIONS_H_
+
+#include "../aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * any_true reductions: compile code that pops a v128 and pushes an i32
+ * that is 1 if at least one lane is non-zero, 0 otherwise.
+ * Each function returns true on success, false on failure.
+ */
+bool
+aot_compile_simd_i8x16_any_true(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_i16x8_any_true(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_i32x4_any_true(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx);
+
+/*
+ * all_true reductions: compile code that pops a v128 and pushes an i32
+ * that is 1 if no lane is zero, 0 otherwise.
+ * Each function returns true on success, false on failure.
+ */
+bool
+aot_compile_simd_i8x16_all_true(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_i16x8_all_true(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_i32x4_all_true(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* end of _SIMD_BOOL_REDUCTIONS_H_ */

+ 47 - 0
core/iwasm/compilation/simd/simd_common.c

@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "simd_common.h"
+
+/*
+ * Pop a v128 value and bitcast it to vec_type.
+ *
+ * name is used as the name of the emitted bitcast value.
+ * Returns the casted value, or NULL on failure.
+ */
+LLVMValueRef
+simd_pop_v128_and_bitcast(const AOTCompContext *comp_ctx,
+                          const AOTFuncContext *func_ctx,
+                          LLVMTypeRef vec_type,
+                          const char *name)
+{
+    LLVMValueRef popped, casted;
+
+    POP_V128(popped);
+
+    casted = LLVMBuildBitCast(comp_ctx->builder, popped, vec_type, name);
+    if (casted) {
+        return casted;
+    }
+
+    HANDLE_FAILURE("LLVMBuildBitCast");
+fail:
+    return NULL;
+}
+
+/*
+ * Bitcast vector to <2 x i64> and push it as a v128.
+ *
+ * name is used as the name of the emitted bitcast value.
+ * Returns true on success, false on failure.
+ */
+bool
+simd_bitcast_and_push_v128(const AOTCompContext *comp_ctx,
+                           const AOTFuncContext *func_ctx,
+                           LLVMValueRef vector,
+                           const char *name)
+{
+    LLVMValueRef as_i64x2 = LLVMBuildBitCast(comp_ctx->builder, vector,
+                                             V128_i64x2_TYPE, name);
+
+    if (!as_i64x2) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    /* hand the canonical <2 x i64> form over to the value stack */
+    PUSH_V128(as_i64x2);
+
+    return true;
+fail:
+    return false;
+}

+ 23 - 0
core/iwasm/compilation/simd/simd_common.h

@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _SIMD_COMMON_H_
+#define _SIMD_COMMON_H_
+
+#include "../aot_compiler.h"
+
+LLVMValueRef
+simd_pop_v128_and_bitcast(const AOTCompContext *comp_ctx,
+                          const AOTFuncContext *func_ctx,
+                          LLVMTypeRef vec_type,
+                          const char *name);
+
+bool
+simd_bitcast_and_push_v128(const AOTCompContext *comp_ctx,
+                           const AOTFuncContext *func_ctx,
+                           LLVMValueRef vector,
+                           const char *name);
+
+#endif /* _SIMD_COMMON_H_ */

+ 231 - 0
core/iwasm/compilation/simd/simd_comparisons.c

@@ -0,0 +1,231 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "simd_comparisons.h"
+#include "simd_common.h"
+#include "../aot_emit_exception.h"
+#include "../../aot/aot_runtime.h"
+
+/*
+ * Translate a FloatCond into the LLVMRealPredicate used for it.
+ *
+ * On success stores the predicate in *out and returns true. An unknown
+ * cond trips bh_assert and returns false, leaving *out untouched.
+ */
+static bool
+float_cond_2_predicate(FloatCond cond, LLVMRealPredicate *out)
+{
+    switch (cond) {
+        case FLOAT_EQ:
+            *out = LLVMRealOEQ;
+            return true;
+        case FLOAT_NE:
+            *out = LLVMRealUNE;
+            return true;
+        case FLOAT_LT:
+            *out = LLVMRealOLT;
+            return true;
+        case FLOAT_GT:
+            *out = LLVMRealOGT;
+            return true;
+        case FLOAT_LE:
+            *out = LLVMRealOLE;
+            return true;
+        case FLOAT_GE:
+            *out = LLVMRealOGE;
+            return true;
+        default:
+            break;
+    }
+
+    bh_assert(0);
+    return false;
+}
+
+/*
+ * Translate an IntCond into the LLVMIntPredicate used for it.
+ * INT_EQZ and INT_EQ both map to LLVMIntEQ.
+ *
+ * On success stores the predicate in *out and returns true. An unknown
+ * cond trips bh_assert and returns false, leaving *out untouched.
+ */
+static bool
+int_cond_2_predicate(IntCond cond, LLVMIntPredicate *out)
+{
+    switch (cond) {
+        case INT_EQZ:
+        case INT_EQ:
+            *out = LLVMIntEQ;
+            return true;
+        case INT_NE:
+            *out = LLVMIntNE;
+            return true;
+        case INT_LT_S:
+            *out = LLVMIntSLT;
+            return true;
+        case INT_LT_U:
+            *out = LLVMIntULT;
+            return true;
+        case INT_GT_S:
+            *out = LLVMIntSGT;
+            return true;
+        case INT_GT_U:
+            *out = LLVMIntUGT;
+            return true;
+        case INT_LE_S:
+            *out = LLVMIntSLE;
+            return true;
+        case INT_LE_U:
+            *out = LLVMIntULE;
+            return true;
+        case INT_GE_S:
+            *out = LLVMIntSGE;
+            return true;
+        case INT_GE_U:
+            *out = LLVMIntUGE;
+            return true;
+        default:
+            break;
+    }
+
+    bh_assert(0);
+    return false;
+}
+
+/*
+ * Shared lowering for the integer lane-wise comparisons.
+ *
+ * Pops the right-hand and then the left-hand v128 operand, views both as
+ * vector_type, compares them lane by lane with the predicate matching
+ * cond, sign-extends the <N x i1> outcome back to vector_type and pushes
+ * it as <2 x i64>.
+ */
+static bool
+interger_vector_compare(AOTCompContext *comp_ctx,
+                        AOTFuncContext *func_ctx,
+                        IntCond cond,
+                        LLVMTypeRef vector_type)
+{
+    LLVMValueRef lhs, rhs, lane_mask;
+    LLVMIntPredicate predicate;
+
+    rhs =
+      simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "vec2");
+    if (!rhs) {
+        goto fail;
+    }
+
+    lhs =
+      simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "vec1");
+    if (!lhs) {
+        goto fail;
+    }
+
+    if (!int_cond_2_predicate(cond, &predicate)) {
+        HANDLE_FAILURE("int_cond_2_predicate");
+        goto fail;
+    }
+
+    /* icmp <N x iX> %vec1, %vec2 */
+    lane_mask = LLVMBuildICmp(comp_ctx->builder, predicate, lhs, rhs, "cmp");
+    if (!lane_mask) {
+        HANDLE_FAILURE("LLVMBuildICmp");
+        goto fail;
+    }
+
+    /* sext <N x i1> %cmp to <N x iX> */
+    lane_mask =
+      LLVMBuildSExt(comp_ctx->builder, lane_mask, vector_type, "ext");
+    if (!lane_mask) {
+        HANDLE_FAILURE("LLVMBuildSExt");
+        goto fail;
+    }
+
+    /* bitcast <N x iX> %ext to <2 x i64> */
+    lane_mask = LLVMBuildBitCast(comp_ctx->builder, lane_mask,
+                                 V128_i64x2_TYPE, "result");
+    if (!lane_mask) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    PUSH_V128(lane_mask);
+
+    return true;
+fail:
+    return false;
+}
+
+/* i8x16 comparison: integer lane compare over <16 x i8>. */
+bool
+aot_compile_simd_i8x16_compare(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               IntCond cond)
+{
+    LLVMTypeRef lane_vector_type = V128_i8x16_TYPE;
+
+    return interger_vector_compare(comp_ctx, func_ctx, cond,
+                                   lane_vector_type);
+}
+
+/* i16x8 comparison: integer lane compare over <8 x i16>. */
+bool
+aot_compile_simd_i16x8_compare(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               IntCond cond)
+{
+    LLVMTypeRef lane_vector_type = V128_i16x8_TYPE;
+
+    return interger_vector_compare(comp_ctx, func_ctx, cond,
+                                   lane_vector_type);
+}
+
+/* i32x4 comparison: integer lane compare over <4 x i32>. */
+bool
+aot_compile_simd_i32x4_compare(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               IntCond cond)
+{
+    LLVMTypeRef lane_vector_type = V128_i32x4_TYPE;
+
+    return interger_vector_compare(comp_ctx, func_ctx, cond,
+                                   lane_vector_type);
+}
+
+/*
+ * Shared lowering for the floating-point lane-wise comparisons.
+ *
+ * Pops the right-hand and then the left-hand v128 operand, views both as
+ * vector_type, compares them lane by lane with the predicate matching
+ * cond, sign-extends the <N x i1> outcome to the integer vector
+ * result_type and pushes it as <2 x i64>.
+ */
+static bool
+float_vector_compare(AOTCompContext *comp_ctx,
+                     AOTFuncContext *func_ctx,
+                     FloatCond cond,
+                     LLVMTypeRef vector_type,
+                     LLVMTypeRef result_type)
+{
+    LLVMValueRef lhs, rhs, lane_mask;
+    LLVMRealPredicate predicate;
+
+    rhs =
+      simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "vec2");
+    if (!rhs) {
+        goto fail;
+    }
+
+    lhs =
+      simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "vec1");
+    if (!lhs) {
+        goto fail;
+    }
+
+    if (!float_cond_2_predicate(cond, &predicate)) {
+        HANDLE_FAILURE("float_cond_2_predicate");
+        goto fail;
+    }
+
+    /* fcmp <N x fX> %vec1, %vec2 */
+    lane_mask = LLVMBuildFCmp(comp_ctx->builder, predicate, lhs, rhs, "cmp");
+    if (!lane_mask) {
+        HANDLE_FAILURE("LLVMBuildFCmp");
+        goto fail;
+    }
+
+    /* sext <N x i1> %cmp to <N x iX> */
+    lane_mask =
+      LLVMBuildSExt(comp_ctx->builder, lane_mask, result_type, "ext");
+    if (!lane_mask) {
+        HANDLE_FAILURE("LLVMBuildSExt");
+        goto fail;
+    }
+
+    /* bitcast <N x iX> %ext to <2 x i64> */
+    lane_mask = LLVMBuildBitCast(comp_ctx->builder, lane_mask,
+                                 V128_i64x2_TYPE, "result");
+    if (!lane_mask) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    PUSH_V128(lane_mask);
+
+    return true;
+fail:
+    return false;
+}
+
+/* f32x4 comparison: compares as <4 x float>, result lanes are <4 x i32>. */
+bool
+aot_compile_simd_f32x4_compare(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               FloatCond cond)
+{
+    LLVMTypeRef operand_type = V128_f32x4_TYPE;
+    LLVMTypeRef mask_type = V128_i32x4_TYPE;
+
+    return float_vector_compare(comp_ctx, func_ctx, cond, operand_type,
+                                mask_type);
+}
+
+/* f64x2 comparison: compares as <2 x double>, result lanes are <2 x i64>. */
+bool
+aot_compile_simd_f64x2_compare(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               FloatCond cond)
+{
+    LLVMTypeRef operand_type = V128_f64x2_TYPE;
+    LLVMTypeRef mask_type = V128_i64x2_TYPE;
+
+    return float_vector_compare(comp_ctx, func_ctx, cond, operand_type,
+                                mask_type);
+}

+ 44 - 0
core/iwasm/compilation/simd/simd_comparisons.h

@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _SIMD_COMPARISONS_H_
+#define _SIMD_COMPARISONS_H_
+
+#include "../aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Integer lane-wise comparisons. cond selects the comparison predicate.
+ * Each function returns true on success, false on failure.
+ */
+bool
+aot_compile_simd_i8x16_compare(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               IntCond cond);
+
+bool
+aot_compile_simd_i16x8_compare(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               IntCond cond);
+
+bool
+aot_compile_simd_i32x4_compare(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               IntCond cond);
+
+/*
+ * Floating-point lane-wise comparisons. cond selects the comparison
+ * predicate. Each function returns true on success, false on failure.
+ */
+bool
+aot_compile_simd_f32x4_compare(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               FloatCond cond);
+
+bool
+aot_compile_simd_f64x2_compare(AOTCompContext *comp_ctx,
+                               AOTFuncContext *func_ctx,
+                               FloatCond cond);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* end of _SIMD_COMPARISONS_H_ */

+ 190 - 0
core/iwasm/compilation/simd/simd_construct_values.c

@@ -0,0 +1,190 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "simd_construct_values.h"
+#include "../aot_emit_exception.h"
+#include "../interpreter/wasm_opcode.h"
+#include "../../aot/aot_runtime.h"
+
+/*
+ * Compile v128.const.
+ *
+ * imm_bytes is decoded by wasm_runtime_read_v128() into two 64-bit
+ * halves, which are inserted into lanes 0 and 1 of a <2 x i64> vector
+ * that is then pushed as a v128.
+ *
+ * Returns true on success, false on failure.
+ */
+bool
+aot_compile_simd_v128_const(AOTCompContext *comp_ctx,
+                            AOTFuncContext *func_ctx,
+                            const uint8 *imm_bytes)
+{
+    uint64 imm1, imm2;
+    LLVMValueRef undef, first_long, agg1, second_long, agg2;
+
+    wasm_runtime_read_v128(imm_bytes, &imm1, &imm2);
+
+    if (!(undef = LLVMGetUndef(V128_i64x2_TYPE))) {
+        HANDLE_FAILURE("LLVMGetUndef");
+        goto fail;
+    }
+
+    /* %agg1 = insertelement <2 x i64> undef, i64 ${imm1}, i32 0 */
+    if (!(first_long = I64_CONST(imm1))) {
+        HANDLE_FAILURE("LLVMConstInt");
+        goto fail;
+    }
+
+    if (!(agg1 = LLVMBuildInsertElement(comp_ctx->builder, undef, first_long,
+                                        I32_ZERO, "agg1"))) {
+        HANDLE_FAILURE("LLVMBuildInsertElement");
+        goto fail;
+    }
+
+    /* %agg2 = insertelement <2 x i64> %agg1, i64 ${imm2}, i32 1 */
+    if (!(second_long = I64_CONST(imm2))) {
+        HANDLE_FAILURE("LLVMConstInt");
+        goto fail;
+    }
+
+    if (!(agg2 = LLVMBuildInsertElement(comp_ctx->builder, agg1, second_long,
+                                        I32_ONE, "agg2"))) {
+        HANDLE_FAILURE("LLVMBuildInsertElement");
+        goto fail;
+    }
+
+    PUSH_V128(agg2);
+
+    return true;
+fail:
+    return false;
+}
+
+/*
+ * Compile the splat opcodes (i8x16, i16x8, i32x4, i64x2, f32x4, f64x2).
+ *
+ * Pops the scalar operand (an i32 truncated to i8/i16 for the two
+ * narrow integer forms), inserts it into lane 0 of an undef vector of
+ * the target type, broadcasts lane 0 to every lane with a shufflevector
+ * whose mask is all zeros, and pushes the result bitcast to <2 x i64>.
+ *
+ * Returns true on success, false on failure. An unknown splat_opcode
+ * trips bh_assert and fails.
+ */
+bool
+aot_compile_simd_splat(AOTCompContext *comp_ctx,
+                       AOTFuncContext *func_ctx,
+                       uint8 splat_opcode)
+{
+    LLVMValueRef value, undef, base, mask, new_vector, result;
+    LLVMTypeRef vector_type, all_zero_ty;
+    unsigned lane_count;
+
+    /* pick the scalar operand, the target vector type and its lane count */
+    switch (splat_opcode) {
+        case SIMD_i8x16_splat:
+        {
+            LLVMValueRef input;
+            POP_I32(input);
+
+            /* trunc i32 %input to i8 */
+            if (!(value = LLVMBuildTrunc(comp_ctx->builder, input, INT8_TYPE,
+                                         "trunc"))) {
+                HANDLE_FAILURE("LLVMBuildTrunc");
+                goto fail;
+            }
+            vector_type = V128_i8x16_TYPE;
+            lane_count = 16;
+            break;
+        }
+        case SIMD_i16x8_splat:
+        {
+            LLVMValueRef input;
+            POP_I32(input);
+
+            /* trunc i32 %input to i16 */
+            if (!(value = LLVMBuildTrunc(comp_ctx->builder, input, INT16_TYPE,
+                                         "trunc"))) {
+                HANDLE_FAILURE("LLVMBuildTrunc");
+                goto fail;
+            }
+            vector_type = V128_i16x8_TYPE;
+            lane_count = 8;
+            break;
+        }
+        case SIMD_i32x4_splat:
+        {
+            POP_I32(value);
+            vector_type = V128_i32x4_TYPE;
+            lane_count = 4;
+            break;
+        }
+        case SIMD_i64x2_splat:
+        {
+            POP(value, VALUE_TYPE_I64);
+            vector_type = V128_i64x2_TYPE;
+            lane_count = 2;
+            break;
+        }
+        case SIMD_f32x4_splat:
+        {
+            POP(value, VALUE_TYPE_F32);
+            vector_type = V128_f32x4_TYPE;
+            lane_count = 4;
+            break;
+        }
+        case SIMD_f64x2_splat:
+        {
+            POP(value, VALUE_TYPE_F64);
+            vector_type = V128_f64x2_TYPE;
+            lane_count = 2;
+            break;
+        }
+        default:
+        {
+            bh_assert(0);
+            goto fail;
+        }
+    }
+
+    if (!(undef = LLVMGetUndef(vector_type))) {
+        HANDLE_FAILURE("LLVMGetUndef");
+        goto fail;
+    }
+
+    /* <n x i32>, the type of the shuffle mask */
+    if (!(all_zero_ty = LLVMVectorType(I32_TYPE, lane_count))) {
+        HANDLE_FAILURE("LLVMVectorType");
+        goto fail;
+    }
+
+    /* insertelement <n x ty> undef, ty %value, i32 0 */
+    if (!(base = LLVMBuildInsertElement(comp_ctx->builder, undef, value,
+                                        I32_ZERO, "base"))) {
+        HANDLE_FAILURE("LLVMBuildInsertElement");
+        goto fail;
+    }
+
+    /* <n x i32> zeroinitializer */
+    if (!(mask = LLVMConstNull(all_zero_ty))) {
+        HANDLE_FAILURE("LLVMConstNull");
+        goto fail;
+    }
+
+    /* shufflevector <ty1> %base, <ty2> undef, <n x i32> zeroinitializer */
+    if (!(new_vector = LLVMBuildShuffleVector(comp_ctx->builder, base, undef,
+                                              mask, "new_vector"))) {
+        HANDLE_FAILURE("LLVMBuildShuffleVector");
+        goto fail;
+    }
+
+    /* bitcast <ty> <value> to <2 x i64> */
+    if (!(result = LLVMBuildBitCast(comp_ctx->builder, new_vector,
+                                    V128_i64x2_TYPE, "ret"))) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    /* push result into the stack */
+    PUSH_V128(result);
+
+    return true;
+fail:
+    return false;
+}

+ 29 - 0
core/iwasm/compilation/simd/simd_construct_values.h

@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _SIMD_CONSTRUCT_VALUES_H_
+#define _SIMD_CONSTRUCT_VALUES_H_
+
+#include "../aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Compile v128.const: build a v128 from the immediate bytes at imm_bytes
+ * and push it. Returns true on success, false on failure.
+ */
+bool
+aot_compile_simd_v128_const(AOTCompContext *comp_ctx,
+                            AOTFuncContext *func_ctx,
+                            const uint8 *imm_bytes);
+
+/*
+ * Compile a splat opcode: splat_opcode is one of the SIMD_*_splat
+ * opcodes and selects the lane type. Returns true on success, false on
+ * failure.
+ */
+bool
+aot_compile_simd_splat(AOTCompContext *comp_ctx,
+                       AOTFuncContext *func_ctx,
+                       uint8 splat_opcode);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* end of _SIMD_CONSTRUCT_VALUES_H_ */

+ 422 - 0
core/iwasm/compilation/simd/simd_conversions.c

@@ -0,0 +1,422 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "simd_conversions.h"
+#include "simd_common.h"
+#include "../aot_emit_exception.h"
+#include "../aot_emit_numberic.h"
+#include "../../aot/aot_runtime.h"
+
+/*
+ * Common code generator for the integer narrowing operations.
+ *
+ * Pops two v128 operands (the top of the stack is the second operand),
+ * bitcasts both to in_vector_type, calls the LLVM intrinsic named by
+ * instrinsic with them, and pushes its result bitcast to <2 x i64>.
+ *
+ * is_signed is not read here: the caller encodes the signedness in the
+ * intrinsic name it passes.
+ *
+ * Returns true on success and false when any step fails.
+ */
+static bool
+simd_integer_narrow(AOTCompContext *comp_ctx,
+                    AOTFuncContext *func_ctx,
+                    bool is_signed,
+                    LLVMTypeRef in_vector_type,
+                    LLVMTypeRef out_vector_type,
+                    const char *instrinsic)
+{
+    LLVMTypeRef arg_types[2];
+    LLVMValueRef first, second, packed;
+
+    arg_types[0] = in_vector_type;
+    arg_types[1] = in_vector_type;
+
+    /* operands come off the stack in reverse order */
+    second = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, in_vector_type,
+                                       "vec2");
+    if (!second) {
+        goto fail;
+    }
+
+    first = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, in_vector_type,
+                                      "vec1");
+    if (!first) {
+        goto fail;
+    }
+
+    packed = aot_call_llvm_intrinsic(comp_ctx, instrinsic, out_vector_type,
+                                     arg_types, 2, first, second);
+    if (!packed) {
+        HANDLE_FAILURE("LLVMBuildCall");
+        goto fail;
+    }
+
+    /* values live on the operand stack as <2 x i64> */
+    packed = LLVMBuildBitCast(comp_ctx->builder, packed, V128_i64x2_TYPE,
+                              "ret");
+    if (!packed) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    PUSH_V128(packed);
+    return true;
+fail:
+    return false;
+}
+
+/*
+ * Narrow two i16x8 operands into one i8x16 result through the x86 SSE2
+ * pack intrinsics: packsswb when is_signed, packuswb otherwise.
+ */
+bool
+aot_compile_simd_i8x16_narrow_i16x8(AOTCompContext *comp_ctx,
+                                    AOTFuncContext *func_ctx,
+                                    bool is_signed)
+{
+    const char *pack_intrinsic;
+
+    if (is_signed) {
+        pack_intrinsic = "llvm.x86.sse2.packsswb.128";
+    }
+    else {
+        pack_intrinsic = "llvm.x86.sse2.packuswb.128";
+    }
+
+    return simd_integer_narrow(comp_ctx, func_ctx, is_signed, V128_i16x8_TYPE,
+                               V128_i8x16_TYPE, pack_intrinsic);
+}
+
+/*
+ * Narrow two i32x4 operands into one i16x8 result through the x86 pack
+ * intrinsics: SSE2 packssdw when is_signed, SSE4.1 packusdw otherwise.
+ */
+bool
+aot_compile_simd_i16x8_narrow_i32x4(AOTCompContext *comp_ctx,
+                                    AOTFuncContext *func_ctx,
+                                    bool is_signed)
+{
+    const char *pack_intrinsic;
+
+    if (is_signed) {
+        pack_intrinsic = "llvm.x86.sse2.packssdw.128";
+    }
+    else {
+        pack_intrinsic = "llvm.x86.sse41.packusdw";
+    }
+
+    return simd_integer_narrow(comp_ctx, func_ctx, is_signed, V128_i32x4_TYPE,
+                               V128_i16x8_TYPE, pack_intrinsic);
+}
+
+/*
+ * Shared code generator for the integer widening conversions.
+ *
+ * Pops one v128 operand, views it as in_vector_type, extracts its low or
+ * high lane_count lanes with a shufflevector, sign- or zero-extends them
+ * to out_vector_type and pushes the result bitcast to <2 x i64>.
+ *
+ * lane_count is the number of lanes of out_vector_type, i.e. half the
+ * lanes of in_vector_type; it must not exceed 8.
+ *
+ * Returns true on success and false when any step fails.
+ */
+static bool
+simd_integer_widen(AOTCompContext *comp_ctx,
+                   AOTFuncContext *func_ctx,
+                   bool is_low_half,
+                   bool is_signed,
+                   LLVMTypeRef in_vector_type,
+                   LLVMTypeRef out_vector_type,
+                   uint32 lane_count)
+{
+    LLVMValueRef vector, undef, lane_indexes[8], mask, shuffled, result;
+    /* the low half starts at lane 0, the high half at lane lane_count */
+    uint32 first_lane = is_low_half ? 0 : lane_count, i;
+
+    if (!(vector = simd_pop_v128_and_bitcast(comp_ctx, func_ctx,
+                                             in_vector_type, "vec"))) {
+        goto fail;
+    }
+
+    if (!(undef = LLVMGetUndef(in_vector_type))) {
+        HANDLE_FAILURE("LLVMGetUndef");
+        goto fail;
+    }
+
+    /* create a mask selecting lane_count consecutive lanes; the indexes
+       are small non-negative numbers, so no sign extension is requested */
+    for (i = 0; i < lane_count; i++) {
+        lane_indexes[i] = LLVMConstInt(I32_TYPE, first_lane + i, false);
+    }
+
+    if (!(mask = LLVMConstVector(lane_indexes, lane_count))) {
+        HANDLE_FAILURE("LLVMConstVector");
+        goto fail;
+    }
+
+    /* retrieve the low or high half */
+    if (!(shuffled = LLVMBuildShuffleVector(comp_ctx->builder, vector, undef,
+                                            mask, "shuffled"))) {
+        HANDLE_FAILURE("LLVMBuildShuffleVector");
+        goto fail;
+    }
+
+    if (is_signed) {
+        if (!(result = LLVMBuildSExt(comp_ctx->builder, shuffled,
+                                     out_vector_type, "ext"))) {
+            HANDLE_FAILURE("LLVMBuildSExt");
+            goto fail;
+        }
+    }
+    else {
+        if (!(result = LLVMBuildZExt(comp_ctx->builder, shuffled,
+                                     out_vector_type, "ext"))) {
+            HANDLE_FAILURE("LLVMBuildZExt");
+            goto fail;
+        }
+    }
+
+    if (!(result = LLVMBuildBitCast(comp_ctx->builder, result, V128_i64x2_TYPE,
+                                    "ret"))) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    PUSH_V128(result);
+    return true;
+fail:
+    return false;
+}
+
+/*
+ * Widen the low (lanes 0-7) or high (lanes 8-15) half of an i8x16 vector
+ * to i16x8, sign-extending when is_signed and zero-extending otherwise.
+ */
+bool
+aot_compile_simd_i16x8_widen_i8x16(AOTCompContext *comp_ctx,
+                                   AOTFuncContext *func_ctx,
+                                   bool is_low_half,
+                                   bool is_signed)
+{
+    return simd_integer_widen(comp_ctx, func_ctx, is_low_half, is_signed,
+                              V128_i8x16_TYPE, V128_i16x8_TYPE, 8);
+}
+
+/*
+ * Widen the low (lanes 0-3) or high (lanes 4-7) half of an i16x8 vector
+ * to i32x4, sign-extending when is_signed and zero-extending otherwise.
+ */
+bool
+aot_compile_simd_i32x4_widen_i16x8(AOTCompContext *comp_ctx,
+                                   AOTFuncContext *func_ctx,
+                                   bool is_low_half,
+                                   bool is_signed)
+{
+    return simd_integer_widen(comp_ctx, func_ctx, is_low_half, is_signed,
+                              V128_i16x8_TYPE, V128_i32x4_TYPE, 4);
+}
+
+/*
+ * Build a constant <4 x float> vector whose four lanes all hold f.
+ *
+ * func_ctx is unused. Returns the constant vector, or NULL on failure.
+ */
+static LLVMValueRef
+simd_build_const_f32x4(AOTCompContext *comp_ctx,
+                       AOTFuncContext *func_ctx,
+                       float f)
+{
+    LLVMValueRef elements[4], vector;
+
+    if (!(elements[0] = LLVMConstReal(F32_TYPE, f))) {
+        /* report the callee that actually failed */
+        HANDLE_FAILURE("LLVMConstReal");
+        goto fail;
+    }
+
+    /* every lane shares the same constant */
+    elements[1] = elements[2] = elements[3] = elements[0];
+
+    if (!(vector = LLVMConstVector(elements, 4))) {
+        HANDLE_FAILURE("LLVMConstVector");
+        goto fail;
+    }
+
+    return vector;
+fail:
+    return NULL;
+}
+
+/*
+ * Build a constant <4 x i32> vector whose four lanes all hold integer.
+ * is_signed is forwarded to LLVMConstInt as its sign-extend flag.
+ *
+ * func_ctx is unused. Returns the constant vector, or NULL on failure.
+ */
+static LLVMValueRef
+simd_build_const_i32x4(AOTCompContext *comp_ctx,
+                       AOTFuncContext *func_ctx,
+                       uint64 integer,
+                       bool is_signed)
+{
+    LLVMValueRef lanes[4], splatted;
+    LLVMValueRef lane = LLVMConstInt(I32_TYPE, integer, is_signed);
+
+    if (!lane) {
+        HANDLE_FAILURE("LLVMConstInt");
+        return NULL;
+    }
+
+    lanes[0] = lane;
+    lanes[1] = lane;
+    lanes[2] = lane;
+    lanes[3] = lane;
+
+    splatted = LLVMConstVector(lanes, 4);
+    if (!splatted) {
+        HANDLE_FAILURE("LLVMConstVector");
+        return NULL;
+    }
+
+    return splatted;
+}
+
+/*
+ * Compile the saturating truncation of an f32x4 vector to i32x4.
+ *
+ * Pops one v128 operand viewed as <4 x float>, converts each lane to a
+ * signed (is_signed) or unsigned 32-bit integer, and then patches the
+ * lanes that cannot be converted directly:
+ *   - lanes >= the upper float bound become the maximum integer,
+ *   - lanes <= the lower float bound become the minimum integer,
+ *   - NaN lanes become 0.
+ * The result is pushed bitcast to <2 x i64>.
+ *
+ * Returns true on success and false when any step fails.
+ */
+bool
+aot_compile_simd_i32x4_trunc_sat_f32x4(AOTCompContext *comp_ctx,
+                                       AOTFuncContext *func_ctx,
+                                       bool is_signed)
+{
+    LLVMValueRef vector, zeros, is_ordered, max_float_v, min_float_v,
+      is_ge_max, is_le_min, result, max_int_v, min_int_v;
+    uint32 max_ui = 0xFFffFFff, min_ui = 0x0;
+    int32 max_si = 0x7FFFffff, min_si = 0x80000000;
+    /* 2147483647 has no exact single-precision representation; the
+       literal rounds to 2147483648.0f, the first value out of range */
+    float max_f_ui = 4294967296.0f, min_f_ui = 0.0f, max_f_si = 2147483647.0f,
+          min_f_si = -2147483648.0f;
+
+    if (!(vector = simd_pop_v128_and_bitcast(comp_ctx, func_ctx,
+                                             V128_f32x4_TYPE, "vec"))) {
+        goto fail;
+    }
+
+    if (!(zeros = LLVMConstNull(V128_f32x4_TYPE))) {
+        HANDLE_FAILURE("LLVMConstNull");
+        goto fail;
+    }
+
+    /* pick the saturation bounds matching the requested signedness */
+    if (is_signed) {
+        if (!(max_float_v =
+                simd_build_const_f32x4(comp_ctx, func_ctx, max_f_si))) {
+            goto fail;
+        }
+
+        if (!(min_float_v =
+                simd_build_const_f32x4(comp_ctx, func_ctx, min_f_si))) {
+            goto fail;
+        }
+
+        if (!(max_int_v =
+                simd_build_const_i32x4(comp_ctx, func_ctx, max_si, true))) {
+            goto fail;
+        }
+
+        if (!(min_int_v =
+                simd_build_const_i32x4(comp_ctx, func_ctx, min_si, true))) {
+            goto fail;
+        }
+    }
+    else {
+        if (!(max_float_v =
+                simd_build_const_f32x4(comp_ctx, func_ctx, max_f_ui))) {
+            goto fail;
+        }
+
+        if (!(min_float_v =
+                simd_build_const_f32x4(comp_ctx, func_ctx, min_f_ui))) {
+            goto fail;
+        }
+
+        if (!(max_int_v =
+                simd_build_const_i32x4(comp_ctx, func_ctx, max_ui, false))) {
+            goto fail;
+        }
+
+        if (!(min_int_v =
+                simd_build_const_i32x4(comp_ctx, func_ctx, min_ui, false))) {
+            goto fail;
+        }
+    }
+
+    /* an ORD comparison is true for the lanes that are NOT NaN; the IR
+       value keeps its historical "is_nan" name */
+    if (!(is_ordered = LLVMBuildFCmp(comp_ctx->builder, LLVMRealORD, vector,
+                                     zeros, "is_nan"))) {
+        HANDLE_FAILURE("LLVMBuildFCmp");
+        goto fail;
+    }
+
+    if (!(is_le_min = LLVMBuildFCmp(comp_ctx->builder, LLVMRealOLE, vector,
+                                    min_float_v, "le_min"))) {
+        HANDLE_FAILURE("LLVMBuildFCmp");
+        goto fail;
+    }
+
+    if (!(is_ge_max = LLVMBuildFCmp(comp_ctx->builder, LLVMRealOGE, vector,
+                                    max_float_v, "ge_max"))) {
+        HANDLE_FAILURE("LLVMBuildFCmp");
+        goto fail;
+    }
+
+    /* raw conversion; out-of-range and NaN lanes are replaced below */
+    if (is_signed) {
+        if (!(result = LLVMBuildFPToSI(comp_ctx->builder, vector,
+                                       V128_i32x4_TYPE, "truncated"))) {
+            HANDLE_FAILURE("LLVMBuildFPToSI");
+            goto fail;
+        }
+    }
+    else {
+        if (!(result = LLVMBuildFPToUI(comp_ctx->builder, vector,
+                                       V128_i32x4_TYPE, "truncated"))) {
+            HANDLE_FAILURE("LLVMBuildFPToUI");
+            goto fail;
+        }
+    }
+
+    if (!(result = LLVMBuildSelect(comp_ctx->builder, is_ge_max, max_int_v,
+                                   result, "sat_w_max"))) {
+        HANDLE_FAILURE("LLVMBuildSelect");
+        goto fail;
+    }
+
+    if (!(result = LLVMBuildSelect(comp_ctx->builder, is_le_min, min_int_v,
+                                   result, "sat_w_min"))) {
+        HANDLE_FAILURE("LLVMBuildSelect");
+        goto fail;
+    }
+
+    /* keep the converted value for ordered lanes, 0 for NaN lanes */
+    if (!(result = LLVMBuildSelect(comp_ctx->builder, is_ordered, result,
+                                   V128_i32x4_ZERO, "sat_w_nan"))) {
+        HANDLE_FAILURE("LLVMBuildSelect");
+        goto fail;
+    }
+
+    if (!(result = LLVMBuildBitCast(comp_ctx->builder, result, V128_i64x2_TYPE,
+                                    "ret"))) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    PUSH_V128(result);
+    return true;
+fail:
+    return false;
+}
+
+/*
+ * Compile the conversion of an i32x4 vector to f32x4.
+ *
+ * Pops one v128 operand viewed as <4 x i32>, converts every lane to
+ * float treating it as signed (is_signed) or unsigned, and pushes the
+ * result bitcast to <2 x i64>.
+ *
+ * Returns true on success and false when any step fails.
+ */
+bool
+aot_compile_simd_f32x4_convert_i32x4(AOTCompContext *comp_ctx,
+                                     AOTFuncContext *func_ctx,
+                                     bool is_signed)
+{
+    LLVMValueRef vector, result;
+
+    if (!(vector = simd_pop_v128_and_bitcast(comp_ctx, func_ctx,
+                                             V128_i32x4_TYPE, "vec"))) {
+        goto fail;
+    }
+
+    if (is_signed) {
+        if (!(result = LLVMBuildSIToFP(comp_ctx->builder, vector,
+                                       V128_f32x4_TYPE, "converted"))) {
+            HANDLE_FAILURE("LLVMBuildSIToFP");
+            goto fail;
+        }
+    }
+    else {
+        if (!(result = LLVMBuildUIToFP(comp_ctx->builder, vector,
+                                       V128_f32x4_TYPE, "converted"))) {
+            /* report the callee that actually failed */
+            HANDLE_FAILURE("LLVMBuildUIToFP");
+            goto fail;
+        }
+    }
+
+    if (!(result = LLVMBuildBitCast(comp_ctx->builder, result, V128_i64x2_TYPE,
+                                    "ret"))) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    PUSH_V128(result);
+    return true;
+fail:
+    return false;
+}

+ 51 - 0
core/iwasm/compilation/simd/simd_conversions.h

@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _SIMD_CONVERSIONS_H_
+#define _SIMD_CONVERSIONS_H_
+
+#include "../aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Each function below pops its operand(s) from the operand stack, pushes
+ * the converted v128 and returns true on success, false on failure.
+ */
+
+/* narrow two i16x8 vectors into one i8x16 vector */
+bool
+aot_compile_simd_i8x16_narrow_i16x8(AOTCompContext *comp_ctx,
+                                    AOTFuncContext *func_ctx,
+                                    bool is_signed);
+
+/* narrow two i32x4 vectors into one i16x8 vector */
+bool
+aot_compile_simd_i16x8_narrow_i32x4(AOTCompContext *comp_ctx,
+                                    AOTFuncContext *func_ctx,
+                                    bool is_signed);
+
+/* widen the low or high eight lanes of an i8x16 vector to i16x8 */
+bool
+aot_compile_simd_i16x8_widen_i8x16(AOTCompContext *comp_ctx,
+                                   AOTFuncContext *func_ctx,
+                                   bool is_low_half,
+                                   bool is_signed);
+
+/* widen the low or high four lanes of an i16x8 vector to i32x4 */
+bool
+aot_compile_simd_i32x4_widen_i16x8(AOTCompContext *comp_ctx,
+                                   AOTFuncContext *func_ctx,
+                                   bool is_low_half,
+                                   bool is_signed);
+
+/* truncate an f32x4 vector to i32x4 with saturation; NaN lanes become 0 */
+bool
+aot_compile_simd_i32x4_trunc_sat_f32x4(AOTCompContext *comp_ctx,
+                                       AOTFuncContext *func_ctx,
+                                       bool is_signed);
+
+/* convert an i32x4 vector (signed or unsigned lanes) to f32x4 */
+bool
+aot_compile_simd_f32x4_convert_i32x4(AOTCompContext *comp_ctx,
+                                     AOTFuncContext *func_ctx,
+                                     bool is_signed);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* end of _SIMD_CONVERSIONS_H_ */

+ 273 - 0
core/iwasm/compilation/simd/simd_floating_point.c

@@ -0,0 +1,273 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "simd_floating_point.h"
+#include "simd_common.h"
+#include "../aot_emit_exception.h"
+#include "../aot_emit_numberic.h"
+#include "../../aot/aot_runtime.h"
+
+/*
+ * Build a lane-wise minimum (arith_op == FLOAT_MIN) or maximum (any other
+ * arith_op) of lhs and rhs as a compare followed by a select.
+ *
+ * The predicates are the unordered ULT/UGT, so the comparison is also
+ * true when either lane is NaN and lhs is selected for that lane.
+ * NOTE(review): a NaN that is only present in rhs is therefore not
+ * propagated - confirm this matches the intended min/max semantics.
+ *
+ * func_ctx is unused. Returns the selected vector, or NULL on failure.
+ */
+static LLVMValueRef
+simd_v128_float_cmp(AOTCompContext *comp_ctx,
+                    AOTFuncContext *func_ctx,
+                    FloatArithmetic arith_op,
+                    LLVMValueRef lhs,
+                    LLVMValueRef rhs)
+{
+    LLVMRealPredicate predicate;
+    LLVMValueRef lhs_wins, chosen;
+
+    if (FLOAT_MIN == arith_op) {
+        predicate = LLVMRealULT;
+    }
+    else {
+        predicate = LLVMRealUGT;
+    }
+
+    lhs_wins = LLVMBuildFCmp(comp_ctx->builder, predicate, lhs, rhs, "cmp");
+    if (!lhs_wins) {
+        HANDLE_FAILURE("LLVMBuildFCmp");
+        return NULL;
+    }
+
+    chosen = LLVMBuildSelect(comp_ctx->builder, lhs_wins, lhs, rhs, "select");
+    if (!chosen) {
+        HANDLE_FAILURE("LLVMBuildSelect");
+        return NULL;
+    }
+
+    return chosen;
+}
+
+/*
+ * Shared code generator for the binary floating-point vector operations.
+ *
+ * Pops rhs then lhs, both viewed as vector_type, applies the operation
+ * selected by arith_op lane-wise and pushes the result bitcast to
+ * <2 x i64>.
+ *
+ * Returns true on success and false when any step fails or arith_op is
+ * not one of the handled operations.
+ */
+static bool
+simd_v128_float_arith(AOTCompContext *comp_ctx,
+                      AOTFuncContext *func_ctx,
+                      FloatArithmetic arith_op,
+                      LLVMTypeRef vector_type)
+{
+    LLVMValueRef lhs, rhs, result;
+
+    if (!(rhs = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type,
+                                          "rhs"))) {
+        goto fail;
+    }
+
+    if (!(lhs = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type,
+                                          "lhs"))) {
+        goto fail;
+    }
+
+    switch (arith_op) {
+        case FLOAT_ADD:
+            if (!(result =
+                    LLVMBuildFAdd(comp_ctx->builder, lhs, rhs, "sum"))) {
+                HANDLE_FAILURE("LLVMBuildFAdd");
+                goto fail;
+            }
+            break;
+        case FLOAT_SUB:
+            if (!(result = LLVMBuildFSub(comp_ctx->builder, lhs, rhs,
+                                         "difference"))) {
+                HANDLE_FAILURE("LLVMBuildFSub");
+                goto fail;
+            }
+            break;
+        case FLOAT_MUL:
+            if (!(result =
+                    LLVMBuildFMul(comp_ctx->builder, lhs, rhs, "product"))) {
+                HANDLE_FAILURE("LLVMBuildFMul");
+                goto fail;
+            }
+            break;
+        case FLOAT_DIV:
+            if (!(result =
+                    LLVMBuildFDiv(comp_ctx->builder, lhs, rhs, "quotient"))) {
+                HANDLE_FAILURE("LLVMBuildFDiv");
+                goto fail;
+            }
+            break;
+        case FLOAT_MIN:
+        case FLOAT_MAX:
+            /* min and max share the compare-and-select helper */
+            if (!(result = simd_v128_float_cmp(comp_ctx, func_ctx, arith_op,
+                                               lhs, rhs))) {
+                goto fail;
+            }
+            break;
+        default:
+            /* unknown operation: bail out instead of handing a NULL value
+               to LLVMBuildBitCast when the assertion does not abort */
+            bh_assert(0);
+            goto fail;
+    }
+
+    if (!(result = LLVMBuildBitCast(comp_ctx->builder, result, V128_i64x2_TYPE,
+                                    "ret"))) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    /* push result into the stack */
+    PUSH_V128(result);
+
+    return true;
+fail:
+    return false;
+}
+
+/* Binary floating-point operation arith_op on two f32x4 operands. */
+bool
+aot_compile_simd_f32x4_arith(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             FloatArithmetic arith_op)
+{
+    LLVMTypeRef lane_vector_type = V128_f32x4_TYPE;
+
+    return simd_v128_float_arith(comp_ctx, func_ctx, arith_op,
+                                 lane_vector_type);
+}
+
+/* Binary floating-point operation arith_op on two f64x2 operands. */
+bool
+aot_compile_simd_f64x2_arith(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             FloatArithmetic arith_op)
+{
+    LLVMTypeRef lane_vector_type = V128_f64x2_TYPE;
+
+    return simd_v128_float_arith(comp_ctx, func_ctx, arith_op,
+                                 lane_vector_type);
+}
+
+/*
+ * Shared code generator for floating-point vector negation: pops one
+ * v128 operand viewed as vector_type, negates it with LLVMBuildFNeg and
+ * pushes the result bitcast to <2 x i64>.
+ *
+ * Returns true on success and false when any step fails.
+ */
+static bool
+simd_v128_float_neg(AOTCompContext *comp_ctx,
+                    AOTFuncContext *func_ctx,
+                    LLVMTypeRef vector_type)
+{
+    LLVMValueRef operand, negated;
+
+    operand = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type,
+                                        "number");
+    if (!operand) {
+        goto fail;
+    }
+
+    negated = LLVMBuildFNeg(comp_ctx->builder, operand, "neg");
+    if (!negated) {
+        HANDLE_FAILURE("LLVMBuildFNeg");
+        goto fail;
+    }
+
+    negated = LLVMBuildBitCast(comp_ctx->builder, negated, V128_i64x2_TYPE,
+                               "ret");
+    if (!negated) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    /* hand the negated vector back to the operand stack */
+    PUSH_V128(negated);
+
+    return true;
+fail:
+    return false;
+}
+
+/* Negate every lane of an f32x4 operand. */
+bool
+aot_compile_simd_f32x4_neg(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    LLVMTypeRef lane_vector_type = V128_f32x4_TYPE;
+
+    return simd_v128_float_neg(comp_ctx, func_ctx, lane_vector_type);
+}
+
+/* Negate every lane of an f64x2 operand. */
+bool
+aot_compile_simd_f64x2_neg(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    LLVMTypeRef lane_vector_type = V128_f64x2_TYPE;
+
+    return simd_v128_float_neg(comp_ctx, func_ctx, lane_vector_type);
+}
+
+/*
+ * Shared code generator for the floating-point absolute value: pops one
+ * v128 operand viewed as vector_type, passes it to the single-argument
+ * LLVM intrinsic named by intrinsic and pushes the result bitcast to
+ * <2 x i64>.
+ *
+ * Returns true on success and false when any step fails.
+ */
+static bool
+simd_v128_float_abs(AOTCompContext *comp_ctx,
+                    AOTFuncContext *func_ctx,
+                    LLVMTypeRef vector_type,
+                    const char *intrinsic)
+{
+    LLVMTypeRef arg_types[1];
+    LLVMValueRef operand, magnitude;
+
+    arg_types[0] = vector_type;
+
+    operand =
+      simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "vec");
+    if (!operand) {
+        goto fail;
+    }
+
+    magnitude = aot_call_llvm_intrinsic(comp_ctx, intrinsic, vector_type,
+                                        arg_types, 1, operand);
+    if (!magnitude) {
+        HANDLE_FAILURE("LLVMBuildCall");
+        goto fail;
+    }
+
+    magnitude = LLVMBuildBitCast(comp_ctx->builder, magnitude,
+                                 V128_i64x2_TYPE, "ret");
+    if (!magnitude) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    /* hand the result back to the operand stack */
+    PUSH_V128(magnitude);
+    return true;
+fail:
+    return false;
+}
+
+/* Absolute value of every lane of an f32x4 operand (llvm.fabs.v4f32). */
+bool
+aot_compile_simd_f32x4_abs(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    const char *fabs_intrinsic = "llvm.fabs.v4f32";
+
+    return simd_v128_float_abs(comp_ctx, func_ctx, V128_f32x4_TYPE,
+                               fabs_intrinsic);
+}
+
+/* Absolute value of every lane of an f64x2 operand (llvm.fabs.v2f64). */
+bool
+aot_compile_simd_f64x2_abs(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    const char *fabs_intrinsic = "llvm.fabs.v2f64";
+
+    return simd_v128_float_abs(comp_ctx, func_ctx, V128_f64x2_TYPE,
+                               fabs_intrinsic);
+}
+
+/*
+ * Shared code generator for the floating-point square root: pops one
+ * v128 operand viewed as vector_type, passes it to the single-argument
+ * LLVM intrinsic named by intrinsic and pushes the result bitcast to
+ * <2 x i64>.
+ *
+ * Returns true on success and false when any step fails.
+ */
+static bool
+simd_v128_float_sqrt(AOTCompContext *comp_ctx,
+                     AOTFuncContext *func_ctx,
+                     LLVMTypeRef vector_type,
+                     const char *intrinsic)
+{
+    LLVMTypeRef arg_types[1];
+    LLVMValueRef operand, root;
+
+    arg_types[0] = vector_type;
+
+    operand =
+      simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "number");
+    if (!operand) {
+        goto fail;
+    }
+
+    root = aot_call_llvm_intrinsic(comp_ctx, intrinsic, vector_type,
+                                   arg_types, 1, operand);
+    if (!root) {
+        HANDLE_FAILURE("LLVMBuildCall");
+        goto fail;
+    }
+
+    root = LLVMBuildBitCast(comp_ctx->builder, root, V128_i64x2_TYPE, "ret");
+    if (!root) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    /* hand the result back to the operand stack */
+    PUSH_V128(root);
+
+    return true;
+fail:
+    return false;
+}
+
+/* Square root of every lane of an f32x4 operand (llvm.sqrt.v4f32). */
+bool
+aot_compile_simd_f32x4_sqrt(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    const char *sqrt_intrinsic = "llvm.sqrt.v4f32";
+
+    return simd_v128_float_sqrt(comp_ctx, func_ctx, V128_f32x4_TYPE,
+                                sqrt_intrinsic);
+}
+
+/* Square root of every lane of an f64x2 operand (llvm.sqrt.v2f64). */
+bool
+aot_compile_simd_f64x2_sqrt(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    const char *sqrt_intrinsic = "llvm.sqrt.v2f64";
+
+    return simd_v128_float_sqrt(comp_ctx, func_ctx, V128_f64x2_TYPE,
+                                sqrt_intrinsic);
+}

+ 49 - 0
core/iwasm/compilation/simd/simd_floating_point.h

@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _SIMD_FLOATING_POINT_H_
+#define _SIMD_FLOATING_POINT_H_
+
+#include "../aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Each function below pops its operand(s) from the operand stack, pushes
+ * the resulting v128 and returns true on success, false on failure.
+ */
+
+/* binary operation arith_op on two f32x4 operands */
+bool
+aot_compile_simd_f32x4_arith(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             FloatArithmetic arith_op);
+
+/* binary operation arith_op on two f64x2 operands */
+bool
+aot_compile_simd_f64x2_arith(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             FloatArithmetic arith_op);
+
+/* lane-wise negation */
+bool
+aot_compile_simd_f32x4_neg(AOTCompContext *comp_ctx,
+                           AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_f64x2_neg(AOTCompContext *comp_ctx,
+                           AOTFuncContext *func_ctx);
+
+/* lane-wise absolute value */
+bool
+aot_compile_simd_f32x4_abs(AOTCompContext *comp_ctx,
+                           AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_f64x2_abs(AOTCompContext *comp_ctx,
+                           AOTFuncContext *func_ctx);
+
+/* lane-wise square root */
+bool
+aot_compile_simd_f32x4_sqrt(AOTCompContext *comp_ctx,
+                            AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_f64x2_sqrt(AOTCompContext *comp_ctx,
+                            AOTFuncContext *func_ctx);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* end of _SIMD_FLOATING_POINT_H_ */

+ 207 - 0
core/iwasm/compilation/simd/simd_int_arith.c

@@ -0,0 +1,207 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "simd_int_arith.h"
+#include "simd_common.h"
+#include "../aot_emit_exception.h"
+#include "../../aot/aot_runtime.h"
+
+/*
+ * Shared code generator for the integer vector arithmetic operations.
+ *
+ * Applies the operation selected by arith_op to the already popped
+ * operands and pushes the result bitcast to <2 x i64>. rhs is ignored
+ * for V128_NEG, which only negates lhs.
+ *
+ * Returns true on success and false when any step fails or arith_op is
+ * not one of the handled operations.
+ */
+static bool
+simd_v128_integer_arith(AOTCompContext *comp_ctx,
+                        AOTFuncContext *func_ctx,
+                        V128Arithmetic arith_op,
+                        LLVMValueRef lhs,
+                        LLVMValueRef rhs)
+{
+    LLVMValueRef result;
+
+    switch (arith_op) {
+        case V128_ADD:
+            if (!(result = LLVMBuildAdd(comp_ctx->builder, lhs, rhs, "sum"))) {
+                HANDLE_FAILURE("LLVMBuildAdd");
+                goto fail;
+            }
+            break;
+        case V128_SUB:
+            if (!(result =
+                    LLVMBuildSub(comp_ctx->builder, lhs, rhs, "difference"))) {
+                HANDLE_FAILURE("LLVMBuildSub");
+                goto fail;
+            }
+            break;
+        case V128_MUL:
+            if (!(result =
+                    LLVMBuildMul(comp_ctx->builder, lhs, rhs, "product"))) {
+                HANDLE_FAILURE("LLVMBuildMul");
+                goto fail;
+            }
+            break;
+        case V128_NEG:
+            if (!(result = LLVMBuildNeg(comp_ctx->builder, lhs, "neg"))) {
+                HANDLE_FAILURE("LLVMBuildNeg");
+                goto fail;
+            }
+            break;
+        default:
+            /* unknown operation: bail out instead of handing a NULL value
+               to LLVMBuildBitCast when the assertion does not abort */
+            bh_assert(0);
+            goto fail;
+    }
+
+    if (!(result = LLVMBuildBitCast(comp_ctx->builder, result, V128_i64x2_TYPE,
+                                    "ret"))) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    /* push result into the stack */
+    PUSH_V128(result);
+
+    return true;
+fail:
+    return false;
+}
+
+/*
+ * Integer operation arith_op on two i8x16 operands: pops rhs then lhs,
+ * both viewed as <16 x i8>, and delegates to simd_v128_integer_arith.
+ *
+ * Returns true on success and false on failure.
+ */
+bool
+aot_compile_simd_i8x16_arith(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             V128Arithmetic arith_op)
+{
+    LLVMValueRef lhs, rhs;
+
+    if (!(rhs = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, V128_i8x16_TYPE,
+                                          "rhs"))) {
+        goto fail;
+    }
+
+    if (!(lhs = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, V128_i8x16_TYPE,
+                                          "lhs"))) {
+        goto fail;
+    }
+
+    return simd_v128_integer_arith(comp_ctx, func_ctx, arith_op, lhs, rhs);
+fail:
+    /* the function returns bool, so report failure as false, not NULL */
+    return false;
+}
+
+/*
+ * Integer operation arith_op on two i16x8 operands: pops rhs then lhs,
+ * both viewed as <8 x i16>, and delegates to simd_v128_integer_arith.
+ *
+ * Returns true on success and false on failure.
+ */
+bool
+aot_compile_simd_i16x8_arith(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             V128Arithmetic arith_op)
+{
+    LLVMValueRef lhs, rhs;
+
+    if (!(rhs = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, V128_i16x8_TYPE,
+                                          "rhs"))) {
+        goto fail;
+    }
+
+    if (!(lhs = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, V128_i16x8_TYPE,
+                                          "lhs"))) {
+        goto fail;
+    }
+
+    return simd_v128_integer_arith(comp_ctx, func_ctx, arith_op, lhs, rhs);
+fail:
+    /* the function returns bool, so report failure as false, not NULL */
+    return false;
+}
+
+/*
+ * Integer operation arith_op on two i32x4 operands: pops rhs then lhs,
+ * both viewed as <4 x i32>, and delegates to simd_v128_integer_arith.
+ *
+ * Returns true on success and false on failure.
+ */
+bool
+aot_compile_simd_i32x4_arith(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             V128Arithmetic arith_op)
+{
+    LLVMValueRef lhs, rhs;
+
+    if (!(rhs = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, V128_i32x4_TYPE,
+                                          "rhs"))) {
+        goto fail;
+    }
+
+    if (!(lhs = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, V128_i32x4_TYPE,
+                                          "lhs"))) {
+        goto fail;
+    }
+
+    return simd_v128_integer_arith(comp_ctx, func_ctx, arith_op, lhs, rhs);
+fail:
+    /* the function returns bool, so report failure as false, not NULL */
+    return false;
+}
+
+/*
+ * Integer operation arith_op on two i64x2 operands. The operands are
+ * used exactly as popped, without an extra bitcast.
+ *
+ * Returns true on success and false on failure.
+ */
+bool
+aot_compile_simd_i64x2_arith(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             V128Arithmetic arith_op)
+{
+    LLVMValueRef rhs, lhs;
+    bool compiled;
+
+    /* the right-hand operand is on top of the stack */
+    POP_V128(rhs);
+    POP_V128(lhs);
+
+    compiled = simd_v128_integer_arith(comp_ctx, func_ctx, arith_op, lhs, rhs);
+    return compiled;
+fail:
+    return false;
+}
+
+/* Negate every lane of an i8x16 operand; true on success. */
+bool
+aot_compile_simd_i8x16_neg(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    LLVMValueRef operand = simd_pop_v128_and_bitcast(
+      comp_ctx, func_ctx, V128_i8x16_TYPE, "number");
+
+    if (!operand) {
+        return false;
+    }
+
+    /* V128_NEG is unary, so no right-hand operand is supplied */
+    return simd_v128_integer_arith(comp_ctx, func_ctx, V128_NEG, operand,
+                                   NULL);
+}
+
+/* Negate every lane of an i16x8 operand; true on success. */
+bool
+aot_compile_simd_i16x8_neg(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    LLVMValueRef operand = simd_pop_v128_and_bitcast(
+      comp_ctx, func_ctx, V128_i16x8_TYPE, "number");
+
+    if (!operand) {
+        return false;
+    }
+
+    /* V128_NEG is unary, so no right-hand operand is supplied */
+    return simd_v128_integer_arith(comp_ctx, func_ctx, V128_NEG, operand,
+                                   NULL);
+}
+
+/* Negate every lane of an i32x4 operand; true on success. */
+bool
+aot_compile_simd_i32x4_neg(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    LLVMValueRef operand = simd_pop_v128_and_bitcast(
+      comp_ctx, func_ctx, V128_i32x4_TYPE, "number");
+
+    if (!operand) {
+        return false;
+    }
+
+    /* V128_NEG is unary, so no right-hand operand is supplied */
+    return simd_v128_integer_arith(comp_ctx, func_ctx, V128_NEG, operand,
+                                   NULL);
+}
+
+/*
+ * Negate every lane of an i64x2 operand. The operand is used exactly as
+ * popped, without an extra bitcast. Returns true on success.
+ */
+bool
+aot_compile_simd_i64x2_neg(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    LLVMValueRef operand;
+    bool compiled;
+
+    POP_V128(operand);
+
+    /* V128_NEG is unary, so no right-hand operand is supplied */
+    compiled =
+      simd_v128_integer_arith(comp_ctx, func_ctx, V128_NEG, operand, NULL);
+    return compiled;
+
+fail:
+    return false;
+}

+ 51 - 0
core/iwasm/compilation/simd/simd_int_arith.h

@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _SIMD_INT_ARITH_H_
+#define _SIMD_INT_ARITH_H_
+
+#include "../aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Each function below pops its operand(s) from the operand stack, pushes
+ * the resulting v128 and returns true on success, false on failure.
+ */
+
+/* binary integer operation selected by arith_op, per lane shape */
+bool
+aot_compile_simd_i8x16_arith(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             V128Arithmetic arith_op);
+
+bool
+aot_compile_simd_i16x8_arith(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             V128Arithmetic arith_op);
+
+bool
+aot_compile_simd_i32x4_arith(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             V128Arithmetic arith_op);
+
+bool
+aot_compile_simd_i64x2_arith(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             V128Arithmetic arith_op);
+
+/* lane-wise integer negation, per lane shape */
+bool
+aot_compile_simd_i8x16_neg(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_i16x8_neg(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_i32x4_neg(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_i64x2_neg(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* end of _SIMD_INT_ARITH_H_ */

+ 301 - 0
core/iwasm/compilation/simd/simd_load_store.c

@@ -0,0 +1,301 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "simd_load_store.h"
+#include "../aot_emit_exception.h"
+#include "../aot_emit_memory.h"
+#include "../../aot/aot_runtime.h"
+#include "../../interpreter/wasm_opcode.h"
+
+/*
+ * Emit a checked load from linear memory.
+ *
+ * offset and data_length (in bytes) are handed to
+ * aot_check_memory_overflow(); the address it yields is bitcast to
+ * ptr_type and loaded, and the load's alignment is set to 1. The align
+ * argument is not consulted here.
+ *
+ * Returns the loaded value, or NULL on failure.
+ */
+static LLVMValueRef
+simd_load(AOTCompContext *comp_ctx,
+          AOTFuncContext *func_ctx,
+          uint32 align,
+          uint32 offset,
+          uint32 data_length,
+          LLVMTypeRef ptr_type)
+{
+    LLVMValueRef addr, typed_addr, loaded;
+
+    addr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, data_length);
+    if (!addr) {
+        HANDLE_FAILURE("aot_check_memory_overflow");
+        goto fail;
+    }
+
+    typed_addr =
+      LLVMBuildBitCast(comp_ctx->builder, addr, ptr_type, "data_ptr");
+    if (!typed_addr) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    loaded = LLVMBuildLoad(comp_ctx->builder, typed_addr, "data");
+    if (!loaded) {
+        HANDLE_FAILURE("LLVMBuildLoad");
+        goto fail;
+    }
+
+    /* byte alignment: the address is not assumed to be aligned */
+    LLVMSetAlignment(loaded, 1);
+
+    return loaded;
+fail:
+    return NULL;
+}
+
+/*
+ * Broadcast a scalar into every lane of a vector.
+ *
+ * element is inserted into lane 0 of an undef value of vectory_type; a
+ * shufflevector whose mask is lane_count zero i32 indices then copies
+ * lane 0 into all lanes.
+ *
+ * Returns the splatted vector, or NULL on failure.
+ */
+static LLVMValueRef
+simd_splat(AOTCompContext *comp_ctx,
+           AOTFuncContext *func_ctx,
+           LLVMValueRef element,
+           LLVMTypeRef vectory_type,
+           unsigned lane_count)
+{
+    LLVMValueRef undef, zeros, vector;
+    LLVMTypeRef zeros_type;
+
+    if (!(undef = LLVMGetUndef(vectory_type))) {
+        HANDLE_FAILURE("LLVMGetUndef");
+        goto fail;
+    }
+
+    /* type of the shuffle mask: one i32 index per result lane */
+    if (!(zeros_type = LLVMVectorType(I32_TYPE, lane_count))) {
+        HANDLE_FAILURE("LLVMVectorType");
+        goto fail;
+    }
+
+    /* all-zero mask: every result lane selects source lane 0 */
+    if (!(zeros = LLVMConstNull(zeros_type))) {
+        HANDLE_FAILURE("LLVMConstNull");
+        goto fail;
+    }
+
+    if (!(vector = LLVMBuildInsertElement(comp_ctx->builder, undef, element,
+                                          I32_ZERO, "base"))) {
+        HANDLE_FAILURE("LLVMBuildInsertElement");
+        goto fail;
+    }
+
+    if (!(vector = LLVMBuildShuffleVector(comp_ctx->builder, vector, undef,
+                                          zeros, "vector"))) {
+        HANDLE_FAILURE("LLVMBuildShuffleVector");
+        goto fail;
+    }
+
+    return vector;
+fail:
+    return NULL;
+}
+
+/*
+ * Compile v128.load: load 16 bytes through simd_load() as a V128_PTR_TYPE
+ * access and push the loaded value. Returns true on success, false on
+ * failure.
+ */
+bool
+aot_compile_simd_v128_load(AOTCompContext *comp_ctx,
+                           AOTFuncContext *func_ctx,
+                           uint32 align,
+                           uint32 offset)
+{
+    LLVMValueRef loaded;
+
+    loaded = simd_load(comp_ctx, func_ctx, align, offset, 16, V128_PTR_TYPE);
+    if (!loaded) {
+        goto fail;
+    }
+
+    PUSH_V128(loaded);
+    return true;
+fail:
+    return false;
+}
+
+/*
+ * Compile v128.store: pop the value, obtain a checked 16-byte address
+ * from aot_check_memory_overflow(), bitcast it to V128_PTR_TYPE and
+ * store with alignment 1. The align argument is not consulted here.
+ * Returns true on success, false on failure.
+ */
+bool
+aot_compile_simd_v128_store(AOTCompContext *comp_ctx,
+                            AOTFuncContext *func_ctx,
+                            uint32 align,
+                            uint32 offset)
+{
+    LLVMValueRef addr, typed_addr, operand, store_inst;
+
+    POP_V128(operand);
+
+    addr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 16);
+    if (!addr) {
+        return false;
+    }
+
+    typed_addr =
+      LLVMBuildBitCast(comp_ctx->builder, addr, V128_PTR_TYPE, "data_ptr");
+    if (!typed_addr) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    store_inst = LLVMBuildStore(comp_ctx->builder, operand, typed_addr);
+    if (!store_inst) {
+        HANDLE_FAILURE("LLVMBuildStore");
+        goto fail;
+    }
+
+    /* byte alignment: the address is not assumed to be aligned */
+    LLVMSetAlignment(store_inst, 1);
+
+    return true;
+fail:
+    return false;
+}
+
+/*
+ * Compile the load-and-extend opcodes (load8x8, load16x4, load32x2, each
+ * in signed and unsigned form).
+ *
+ * Eight bytes are loaded as a narrow vector, every lane is sign- or
+ * zero-extended to twice its width, and the widened vector is bitcast to
+ * the i64x2 form and pushed. An unknown opcode asserts and fails.
+ * Returns true on success, false on failure.
+ */
+bool
+aot_compile_simd_load_extend(AOTCompContext *comp_ctx,
+                             AOTFuncContext *func_ctx,
+                             uint8 load_opcode,
+                             uint32 align,
+                             uint32 offset)
+{
+    LLVMValueRef narrow, widened;
+    LLVMTypeRef lane_type, narrow_type, narrow_ptr_type, wide_type;
+    unsigned lane_count;
+    bool sign_extend;
+    /* every variant reads 64 bits from memory */
+    uint32 bytes_to_load = 8;
+
+    /* pick the narrow lane layout and the widened result type */
+    switch (load_opcode) {
+        case SIMD_i16x8_load8x8_s:
+        case SIMD_i16x8_load8x8_u:
+            lane_type = INT8_TYPE;
+            lane_count = 8;
+            wide_type = V128_i16x8_TYPE;
+            sign_extend = (load_opcode == SIMD_i16x8_load8x8_s);
+            break;
+        case SIMD_i32x4_load16x4_s:
+        case SIMD_i32x4_load16x4_u:
+            lane_type = INT16_TYPE;
+            lane_count = 4;
+            wide_type = V128_i32x4_TYPE;
+            sign_extend = (load_opcode == SIMD_i32x4_load16x4_s);
+            break;
+        case SIMD_i64x2_load32x2_s:
+        case SIMD_i64x2_load32x2_u:
+            lane_type = I32_TYPE;
+            lane_count = 2;
+            wide_type = V128_i64x2_TYPE;
+            sign_extend = (load_opcode == SIMD_i64x2_load32x2_s);
+            break;
+        default:
+            bh_assert(0);
+            goto fail;
+    }
+
+    narrow_type = LLVMVectorType(lane_type, lane_count);
+    if (!narrow_type) {
+        HANDLE_FAILURE("LLVMVectorType");
+        goto fail;
+    }
+
+    /* simd_load() expects a pointer to the type being loaded */
+    narrow_ptr_type = LLVMPointerType(narrow_type, 0);
+    if (!narrow_ptr_type) {
+        HANDLE_FAILURE("LLVMPointerType");
+        goto fail;
+    }
+
+    narrow = simd_load(comp_ctx, func_ctx, align, offset, bytes_to_load,
+                       narrow_ptr_type);
+    if (!narrow) {
+        goto fail;
+    }
+
+    if (sign_extend) {
+        widened =
+          LLVMBuildSExt(comp_ctx->builder, narrow, wide_type, "vector");
+        if (!widened) {
+            HANDLE_FAILURE("LLVMBuildSExt");
+            goto fail;
+        }
+    }
+    else {
+        widened =
+          LLVMBuildZExt(comp_ctx->builder, narrow, wide_type, "vector");
+        if (!widened) {
+            HANDLE_FAILURE("LLVMBuildZExt");
+            goto fail;
+        }
+    }
+
+    widened = LLVMBuildBitCast(comp_ctx->builder, widened, V128_i64x2_TYPE,
+                               "result");
+    if (!widened) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    PUSH_V128(widened);
+    return true;
+fail:
+    return false;
+}
+
+/*
+ * Compile the load-and-splat opcodes (v8x16, v16x8, v32x4, v64x2).
+ *
+ * One element of 1, 2, 4 or 8 bytes is loaded, broadcast to every lane
+ * with simd_splat(), bitcast to the i64x2 form and pushed. An unknown
+ * opcode asserts and fails. Returns true on success, false on failure.
+ */
+bool
+aot_compile_simd_load_splat(AOTCompContext *comp_ctx,
+                            AOTFuncContext *func_ctx,
+                            uint8 load_opcode,
+                            uint32 align,
+                            uint32 offset)
+{
+    LLVMValueRef scalar, splatted;
+    LLVMTypeRef scalar_ptr_type, splat_type;
+    unsigned element_bytes, lanes;
+
+    switch (load_opcode) {
+        case SIMD_v8x16_load_splat:
+            element_bytes = 1;
+            scalar_ptr_type = INT8_PTR_TYPE;
+            splat_type = V128_i8x16_TYPE;
+            break;
+        case SIMD_v16x8_load_splat:
+            element_bytes = 2;
+            scalar_ptr_type = INT16_PTR_TYPE;
+            splat_type = V128_i16x8_TYPE;
+            break;
+        case SIMD_v32x4_load_splat:
+            element_bytes = 4;
+            scalar_ptr_type = INT32_PTR_TYPE;
+            splat_type = V128_i32x4_TYPE;
+            break;
+        case SIMD_v64x2_load_splat:
+            element_bytes = 8;
+            scalar_ptr_type = INT64_PTR_TYPE;
+            splat_type = V128_i64x2_TYPE;
+            break;
+        default:
+            bh_assert(0);
+            goto fail;
+    }
+
+    /* a 16-byte vector holds 16, 8, 4 or 2 lanes respectively */
+    lanes = 16 / element_bytes;
+
+    scalar = simd_load(comp_ctx, func_ctx, align, offset, element_bytes,
+                       scalar_ptr_type);
+    if (!scalar) {
+        goto fail;
+    }
+
+    splatted = simd_splat(comp_ctx, func_ctx, scalar, splat_type, lanes);
+    if (!splatted) {
+        goto fail;
+    }
+
+    splatted = LLVMBuildBitCast(comp_ctx->builder, splatted, V128_i64x2_TYPE,
+                                "result");
+    if (!splatted) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    PUSH_V128(splatted);
+    return true;
+fail:
+    return false;
+}

+ 45 - 0
core/iwasm/compilation/simd/simd_load_store.h

@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _SIMD_LOAD_STORE_H_
+#define _SIMD_LOAD_STORE_H_
+
+#include "../aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Compile v128.load: load 16 bytes and push them as a v128 value. */
+bool
+aot_compile_simd_v128_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                           uint32 align, uint32 offset);
+
+/* Compile v128.store: pop a v128 value and store its 16 bytes. */
+bool
+aot_compile_simd_v128_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                            uint32 align, uint32 offset);
+
+/*
+ * Compile a load-and-extend opcode: load 8 bytes as narrow lanes and
+ * sign- or zero-extend each lane; load_opcode selects the variant.
+ */
+bool
+aot_compile_simd_load_extend(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                             uint8 load_opcode, uint32 align, uint32 offset);
+
+/*
+ * Compile a load-and-splat opcode: load one element and replicate it
+ * into every lane; load_opcode selects the element width.
+ */
+bool
+aot_compile_simd_load_splat(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                            uint8 load_opcode, uint32 align, uint32 offset);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* end of _SIMD_LOAD_STORE_H_ */

+ 367 - 0
core/iwasm/compilation/simd/simd_sat_int_arith.c

@@ -0,0 +1,367 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include "simd_sat_int_arith.h"
+#include "simd_common.h"
+#include "../aot_emit_exception.h"
+#include "../../aot/aot_runtime.h"
+
+/*
+ * Shared body of the saturating add/sub opcodes.
+ *
+ * Pops rhs then lhs as vector_type, calls the LLVM intrinsic named by
+ * intrinsics_s_u[0] (signed) or intrinsics_s_u[1] (unsigned) on them,
+ * bitcasts the result to the i64x2 form and pushes it.
+ * Returns true on success, false on failure.
+ */
+static bool
+simd_v128_integer_arith(AOTCompContext *comp_ctx,
+                        AOTFuncContext *func_ctx,
+                        LLVMTypeRef vector_type,
+                        char *intrinsics_s_u[2],
+                        bool is_signed)
+{
+    LLVMValueRef left, right, value;
+    LLVMTypeRef arg_types[2];
+    char *intrinsic_name;
+
+    /* operands come off the stack in reverse order */
+    right = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "rhs");
+    if (!right) {
+        goto fail;
+    }
+
+    left = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "lhs");
+    if (!left) {
+        goto fail;
+    }
+
+    arg_types[0] = vector_type;
+    arg_types[1] = vector_type;
+
+    /* index 0 holds the signed intrinsic, index 1 the unsigned one */
+    intrinsic_name = intrinsics_s_u[is_signed ? 0 : 1];
+
+    value = aot_call_llvm_intrinsic(comp_ctx, intrinsic_name, vector_type,
+                                    arg_types, 2, left, right);
+    if (!value) {
+        HANDLE_FAILURE("LLVMBuildCall");
+        goto fail;
+    }
+
+    value =
+      LLVMBuildBitCast(comp_ctx->builder, value, V128_i64x2_TYPE, "ret");
+    if (!value) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    /* push result into the stack */
+    PUSH_V128(value);
+    return true;
+fail:
+    return false;
+}
+
+/*
+ * Compile i8x16 saturating add (V128_ADD) or sub (V128_SUB), signed or
+ * unsigned according to is_signed. Any other arith_op asserts and
+ * returns false.
+ */
+bool
+aot_compile_simd_i8x16_saturate(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx,
+                                V128Arithmetic arith_op,
+                                bool is_signed)
+{
+    /* [0] = signed intrinsic, [1] = unsigned intrinsic */
+    char *intrinsics[2] = { 0 };
+
+    if (arith_op == V128_ADD) {
+        intrinsics[0] = "llvm.sadd.sat.v16i8";
+        intrinsics[1] = "llvm.uadd.sat.v16i8";
+    }
+    else if (arith_op == V128_SUB) {
+        intrinsics[0] = "llvm.ssub.sat.v16i8";
+        intrinsics[1] = "llvm.usub.sat.v16i8";
+    }
+    else {
+        bh_assert(0);
+        return false;
+    }
+
+    return simd_v128_integer_arith(comp_ctx, func_ctx, V128_i8x16_TYPE,
+                                   intrinsics, is_signed);
+}
+
+/*
+ * Compile i16x8 saturating add (V128_ADD) or sub (V128_SUB), signed or
+ * unsigned according to is_signed. Any other arith_op asserts and
+ * returns false.
+ */
+bool
+aot_compile_simd_i16x8_saturate(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx,
+                                V128Arithmetic arith_op,
+                                bool is_signed)
+{
+    /* [0] = signed intrinsic, [1] = unsigned intrinsic */
+    char *intrinsics[2] = { 0 };
+
+    if (arith_op == V128_ADD) {
+        intrinsics[0] = "llvm.sadd.sat.v8i16";
+        intrinsics[1] = "llvm.uadd.sat.v8i16";
+    }
+    else if (arith_op == V128_SUB) {
+        intrinsics[0] = "llvm.ssub.sat.v8i16";
+        intrinsics[1] = "llvm.usub.sat.v8i16";
+    }
+    else {
+        bh_assert(0);
+        return false;
+    }
+
+    return simd_v128_integer_arith(comp_ctx, func_ctx, V128_i16x8_TYPE,
+                                   intrinsics, is_signed);
+}
+
+/*
+ * Shared body of the lane-wise integer min/max opcodes.
+ *
+ * Pops rhs then lhs as vector_type, compares them lane by lane
+ * (less-than for V128_MIN, greater-than for every other arith_op, signed
+ * or unsigned per is_signed) and selects lhs where the comparison holds
+ * and rhs elsewhere. The result is bitcast to the i64x2 form and pushed.
+ * Returns true on success, false on failure.
+ */
+static bool
+simd_v128_cmp(AOTCompContext *comp_ctx,
+              AOTFuncContext *func_ctx,
+              LLVMTypeRef vector_type,
+              V128Arithmetic arith_op,
+              bool is_signed)
+{
+    LLVMValueRef left, right, lane_mask, picked;
+    LLVMIntPredicate predicate;
+    bool pick_min = (V128_MIN == arith_op);
+
+    /* operands come off the stack in reverse order */
+    right = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "rhs");
+    if (!right) {
+        goto fail;
+    }
+
+    left = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "lhs");
+    if (!left) {
+        goto fail;
+    }
+
+    if (is_signed) {
+        predicate = pick_min ? LLVMIntSLT : LLVMIntSGT;
+    }
+    else {
+        predicate = pick_min ? LLVMIntULT : LLVMIntUGT;
+    }
+
+    lane_mask =
+      LLVMBuildICmp(comp_ctx->builder, predicate, left, right, "cmp");
+    if (!lane_mask) {
+        HANDLE_FAILURE("LLVMBuildICmp");
+        goto fail;
+    }
+
+    picked =
+      LLVMBuildSelect(comp_ctx->builder, lane_mask, left, right, "select");
+    if (!picked) {
+        HANDLE_FAILURE("LLVMBuildSelect");
+        goto fail;
+    }
+
+    picked =
+      LLVMBuildBitCast(comp_ctx->builder, picked, V128_i64x2_TYPE, "ret");
+    if (!picked) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    /* push result into the stack */
+    PUSH_V128(picked);
+    return true;
+fail:
+    return false;
+}
+
+/* i8x16 lane-wise min/max: forwards to simd_v128_cmp() with the i8x16 type. */
+bool
+aot_compile_simd_i8x16_cmp(AOTCompContext *comp_ctx,
+                           AOTFuncContext *func_ctx,
+                           V128Arithmetic arith_op,
+                           bool is_signed)
+{
+    LLVMTypeRef lane_vector_type = V128_i8x16_TYPE;
+
+    return simd_v128_cmp(comp_ctx, func_ctx, lane_vector_type, arith_op,
+                         is_signed);
+}
+
+/* i16x8 lane-wise min/max: forwards to simd_v128_cmp() with the i16x8 type. */
+bool
+aot_compile_simd_i16x8_cmp(AOTCompContext *comp_ctx,
+                           AOTFuncContext *func_ctx,
+                           V128Arithmetic arith_op,
+                           bool is_signed)
+{
+    LLVMTypeRef lane_vector_type = V128_i16x8_TYPE;
+
+    return simd_v128_cmp(comp_ctx, func_ctx, lane_vector_type, arith_op,
+                         is_signed);
+}
+
+/* i32x4 lane-wise min/max: forwards to simd_v128_cmp() with the i32x4 type. */
+bool
+aot_compile_simd_i32x4_cmp(AOTCompContext *comp_ctx,
+                           AOTFuncContext *func_ctx,
+                           V128Arithmetic arith_op,
+                           bool is_signed)
+{
+    LLVMTypeRef lane_vector_type = V128_i32x4_TYPE;
+
+    return simd_v128_cmp(comp_ctx, func_ctx, lane_vector_type, arith_op,
+                         is_signed);
+}
+
+/*
+ * Shared body of the lane-wise integer abs opcodes.
+ *
+ * Pops one operand as vector_type, builds its negation, and selects the
+ * original lane where it is >= 0 (signed compare) and the negated lane
+ * otherwise. The result is bitcast to the i64x2 form and pushed.
+ * Returns true on success, false on failure.
+ */
+static bool
+simd_v128_abs(AOTCompContext *comp_ctx,
+              AOTFuncContext *func_ctx,
+              LLVMTypeRef vector_type)
+{
+    LLVMValueRef input, negated, zero_vec, non_negative, magnitude;
+
+    input = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "vec");
+    if (!input) {
+        goto fail;
+    }
+
+    negated = LLVMBuildNeg(comp_ctx->builder, input, "neg");
+    if (!negated) {
+        HANDLE_FAILURE("LLVMBuildNeg");
+        goto fail;
+    }
+
+    zero_vec = LLVMConstNull(vector_type);
+    if (!zero_vec) {
+        HANDLE_FAILURE("LLVMConstNull");
+        goto fail;
+    }
+
+    non_negative = LLVMBuildICmp(comp_ctx->builder, LLVMIntSGE, input,
+                                 zero_vec, "ge_zero");
+    if (!non_negative) {
+        HANDLE_FAILURE("LLVMBuildICmp");
+        goto fail;
+    }
+
+    magnitude = LLVMBuildSelect(comp_ctx->builder, non_negative, input,
+                                negated, "select");
+    if (!magnitude) {
+        HANDLE_FAILURE("LLVMBuildSelect");
+        goto fail;
+    }
+
+    magnitude = LLVMBuildBitCast(comp_ctx->builder, magnitude,
+                                 V128_i64x2_TYPE, "ret");
+    if (!magnitude) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    /* push result into the stack */
+    PUSH_V128(magnitude);
+    return true;
+fail:
+    return false;
+}
+
+/* i8x16.abs: forwards to simd_v128_abs() with the i8x16 type. */
+bool
+aot_compile_simd_i8x16_abs(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    LLVMTypeRef lane_vector_type = V128_i8x16_TYPE;
+
+    return simd_v128_abs(comp_ctx, func_ctx, lane_vector_type);
+}
+
+/* i16x8.abs: forwards to simd_v128_abs() with the i16x8 type. */
+bool
+aot_compile_simd_i16x8_abs(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    LLVMTypeRef lane_vector_type = V128_i16x8_TYPE;
+
+    return simd_v128_abs(comp_ctx, func_ctx, lane_vector_type);
+}
+
+/* i32x4.abs: forwards to simd_v128_abs() with the i32x4 type. */
+bool
+aot_compile_simd_i32x4_abs(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
+{
+    LLVMTypeRef lane_vector_type = V128_i32x4_TYPE;
+
+    return simd_v128_abs(comp_ctx, func_ctx, lane_vector_type);
+}
+
+/*
+ * Shared body of the unsigned rounding-average opcodes:
+ * (lhs + rhs + 1) >> 1 per lane.
+ *
+ * Both operands are popped as vector_type and zero-extended to a vector
+ * of lane_width i32 lanes (lane_width is the number of lanes) so the
+ * sum is computed in 32 bits. The shifted sum is truncated back to
+ * vector_type, bitcast to the i64x2 form and pushed. element_type is
+ * not used. Returns true on success, false on failure.
+ */
+static bool
+simd_v128_avg(AOTCompContext *comp_ctx,
+              AOTFuncContext *func_ctx,
+              LLVMTypeRef vector_type,
+              LLVMTypeRef element_type,
+              unsigned lane_width)
+{
+    LLVMValueRef left, right, undef_vec, zero_mask, one_scalar, one_vec, sum;
+    LLVMTypeRef wide_type;
+
+    /* operands come off the stack in reverse order */
+    right = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "rhs");
+    if (!right) {
+        goto fail;
+    }
+
+    left = simd_pop_v128_and_bitcast(comp_ctx, func_ctx, vector_type, "lhs");
+    if (!left) {
+        goto fail;
+    }
+
+    wide_type = LLVMVectorType(I32_TYPE, lane_width);
+    if (!wide_type) {
+        HANDLE_FAILURE("LLVMVectorType");
+        goto fail;
+    }
+
+    left = LLVMBuildZExt(comp_ctx->builder, left, wide_type, "left_ext");
+    if (!left) {
+        HANDLE_FAILURE("LLVMBuildZExt");
+        goto fail;
+    }
+
+    right = LLVMBuildZExt(comp_ctx->builder, right, wide_type, "right_ext");
+    if (!right) {
+        HANDLE_FAILURE("LLVMBuildZExt");
+        goto fail;
+    }
+
+    undef_vec = LLVMGetUndef(wide_type);
+    if (!undef_vec) {
+        HANDLE_FAILURE("LLVMGetUndef");
+        goto fail;
+    }
+
+    /* all-zero vector, used below as the shuffle mask */
+    zero_mask = LLVMConstNull(wide_type);
+    if (!zero_mask) {
+        HANDLE_FAILURE("LLVMConstNull");
+        goto fail;
+    }
+
+    one_scalar = LLVMConstInt(I32_TYPE, 1, true);
+    if (!one_scalar) {
+        HANDLE_FAILURE("LLVMConstInt");
+        goto fail;
+    }
+
+    /* splat the constant 1: insert into lane 0, then shuffle with zeros */
+    one_vec = LLVMBuildInsertElement(comp_ctx->builder, undef_vec, one_scalar,
+                                     I32_ZERO, "base_ones");
+    if (!one_vec) {
+        HANDLE_FAILURE("LLVMBuildInsertElement");
+        goto fail;
+    }
+
+    one_vec = LLVMBuildShuffleVector(comp_ctx->builder, one_vec, undef_vec,
+                                     zero_mask, "ones");
+    if (!one_vec) {
+        HANDLE_FAILURE("LLVMBuildShuffleVector");
+        goto fail;
+    }
+
+    sum = LLVMBuildAdd(comp_ctx->builder, left, right, "a_add_b");
+    if (!sum) {
+        HANDLE_FAILURE("LLVMBuildAdd");
+        goto fail;
+    }
+
+    sum = LLVMBuildAdd(comp_ctx->builder, sum, one_vec, "plus_1");
+    if (!sum) {
+        HANDLE_FAILURE("LLVMBuildAdd");
+        goto fail;
+    }
+
+    /* the vector of ones doubles as the shift amount: divide by two */
+    sum = LLVMBuildLShr(comp_ctx->builder, sum, one_vec, "avg");
+    if (!sum) {
+        HANDLE_FAILURE("LLVMBuildLShr");
+        goto fail;
+    }
+
+    sum = LLVMBuildTrunc(comp_ctx->builder, sum, vector_type, "avg_trunc");
+    if (!sum) {
+        HANDLE_FAILURE("LLVMBuildTrunc");
+        goto fail;
+    }
+
+    sum = LLVMBuildBitCast(comp_ctx->builder, sum, V128_i64x2_TYPE, "ret");
+    if (!sum) {
+        HANDLE_FAILURE("LLVMBuildBitCast");
+        goto fail;
+    }
+
+    /* push result into the stack */
+    PUSH_V128(sum);
+    return true;
+fail:
+    return false;
+}
+/* i8x16.avgr_u: forwards to simd_v128_avg() with 16 lanes of INT8_TYPE. */
+bool
+aot_compile_simd_i8x16_avgr_u(AOTCompContext *comp_ctx,
+                              AOTFuncContext *func_ctx)
+{
+    unsigned lanes = 16;
+
+    return simd_v128_avg(comp_ctx, func_ctx, V128_i8x16_TYPE, INT8_TYPE,
+                         lanes);
+}
+
+/* i16x8.avgr_u: forwards to simd_v128_avg() with 8 lanes of INT16_TYPE. */
+bool
+aot_compile_simd_i16x8_avgr_u(AOTCompContext *comp_ctx,
+                              AOTFuncContext *func_ctx)
+{
+    unsigned lanes = 8;
+
+    return simd_v128_avg(comp_ctx, func_ctx, V128_i16x8_TYPE, INT16_TYPE,
+                         lanes);
+}

+ 66 - 0
core/iwasm/compilation/simd/simd_sat_int_arith.h

@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _SIMD_SAT_INT_ARITH_H_
+#define _SIMD_SAT_INT_ARITH_H_
+
+#include "../aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Saturating add/sub. arith_op is V128_ADD or V128_SUB; is_signed picks
+ * the signed or unsigned form.
+ */
+bool
+aot_compile_simd_i8x16_saturate(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx,
+                                V128Arithmetic arith_op, bool is_signed);
+
+bool
+aot_compile_simd_i16x8_saturate(AOTCompContext *comp_ctx,
+                                AOTFuncContext *func_ctx,
+                                V128Arithmetic arith_op, bool is_signed);
+
+/*
+ * Lane-wise min/max. arith_op V128_MIN selects the smaller lane, any
+ * other value the larger one; is_signed picks the comparison's
+ * signedness.
+ */
+bool
+aot_compile_simd_i8x16_cmp(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                           V128Arithmetic arith_op, bool is_signed);
+
+bool
+aot_compile_simd_i16x8_cmp(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                           V128Arithmetic arith_op, bool is_signed);
+
+bool
+aot_compile_simd_i32x4_cmp(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                           V128Arithmetic arith_op, bool is_signed);
+
+/* Lane-wise absolute value. */
+bool
+aot_compile_simd_i8x16_abs(AOTCompContext *comp_ctx,
+                           AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_i16x8_abs(AOTCompContext *comp_ctx,
+                           AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_i32x4_abs(AOTCompContext *comp_ctx,
+                           AOTFuncContext *func_ctx);
+
+/* Lane-wise unsigned rounding average: (a + b + 1) >> 1. */
+bool
+aot_compile_simd_i8x16_avgr_u(AOTCompContext *comp_ctx,
+                              AOTFuncContext *func_ctx);
+
+bool
+aot_compile_simd_i16x8_avgr_u(AOTCompContext *comp_ctx,
+                              AOTFuncContext *func_ctx);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* end of _SIMD_SAT_INT_ARITH_H_ */

+ 1 - 0
core/iwasm/include/aot_export.h

@@ -42,6 +42,7 @@ typedef struct AOTCompOption{
     bool enable_bulk_memory;
     bool enable_thread_mgr;
     bool enable_tail_call;
+    bool enable_simd;
     bool is_sgx_platform;
     uint32_t opt_level;
     uint32_t size_level;

+ 21 - 0
core/iwasm/interpreter/wasm.h

@@ -19,6 +19,7 @@ extern "C" {
 #define VALUE_TYPE_I64 0X7E
 #define VALUE_TYPE_F32 0x7D
 #define VALUE_TYPE_F64 0x7C
+#define VALUE_TYPE_V128 0x7B
 #define VALUE_TYPE_VOID 0x40
 /* Used by AOT */
 #define VALUE_TYPE_I1  0x41
@@ -34,6 +35,7 @@ extern "C" {
 #define INIT_EXPR_TYPE_I64_CONST 0x42
 #define INIT_EXPR_TYPE_F32_CONST 0x43
 #define INIT_EXPR_TYPE_F64_CONST 0x44
+#define INIT_EXPR_TYPE_V128_CONST 0xFD
 #define INIT_EXPR_TYPE_GET_GLOBAL 0x23
 #define INIT_EXPR_TYPE_ERROR 0xff
 
@@ -79,6 +81,15 @@ typedef struct WASMModule WASMModule;
 typedef struct WASMFunction WASMFunction;
 typedef struct WASMGlobal WASMGlobal;
 
+/* A SIMD value, viewable through any of the lane layouts below. */
+typedef union V128 {
+    int8 i8x16[16];
+    int16 i16x8[8];
+    int32 i32x4[4];
+    /* Misnamed alias of i32x4 (the array has four lanes, not eight);
+       kept so code written against the old name still compiles. */
+    int32 i32x8[4];
+    int64 i64x2[2];
+    float32 f32x4[4];
+    float64 f64x2[2];
+} V128;
+
 typedef union WASMValue {
     int32 i32;
     uint32 u32;
@@ -87,6 +98,7 @@ typedef union WASMValue {
     float32 f32;
     float64 f64;
     uintptr_t addr;
+    V128 v128;
 } WASMValue;
 
 typedef struct InitializerExpression {
@@ -98,6 +110,7 @@ typedef struct InitializerExpression {
         float32 f32;
         float64 f64;
         uint32 global_index;
+        V128 v128;
     } u;
 } InitializerExpression;
 
@@ -448,6 +461,10 @@ wasm_value_type_size(uint8 value_type)
         case VALUE_TYPE_I64:
         case VALUE_TYPE_F64:
             return sizeof(int64);
+#if WASM_ENABLE_SIMD != 0
+        case VALUE_TYPE_V128:
+            return sizeof(int64) * 2;
+#endif
         default:
             bh_assert(0);
     }
@@ -465,6 +482,10 @@ wasm_value_type_cell_num(uint8 value_type)
     else if (value_type == VALUE_TYPE_I64
              || value_type == VALUE_TYPE_F64)
         return 2;
+#if WASM_ENABLE_SIMD != 0
+    else if (value_type == VALUE_TYPE_V128)
+        return 4;
+#endif
     else {
         bh_assert(0);
     }

+ 674 - 12
core/iwasm/interpreter/wasm_loader.c

@@ -231,6 +231,23 @@ fail:
   res = (int32)res64;                               \
 } while (0)
 
+#if WASM_ENABLE_SIMD != 0
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+/*
+ * Build a V128 by filling its sixteen i8x16 lanes, in order, with
+ * successive read_uint8(p_buf) results. error_buf and error_buf_size
+ * are not used.
+ */
+static V128
+read_i8x16(uint8 *p_buf, char* error_buf, uint32 error_buf_size)
+{
+    V128 vec;
+    uint8 lane = 0;
+
+    while (lane != 16) {
+        vec.i8x16[lane] = read_uint8(p_buf);
+        ++lane;
+    }
+
+    return vec;
+}
+#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */
+#endif /* end of WASM_ENABLE_SIMD */
+
 static void *
 loader_malloc(uint64 size, char *error_buf, uint32 error_buf_size)
 {
@@ -412,6 +429,29 @@ load_init_expr(const uint8 **p_buf, const uint8 *buf_end,
             for (i = 0; i < sizeof(float64); i++)
                 *p_float++ = *p++;
             break;
+#if WASM_ENABLE_SIMD != 0
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+        case INIT_EXPR_TYPE_V128_CONST:
+        {
+            uint8 flag;
+            uint64 high, low;
+
+            if (type != VALUE_TYPE_V128)
+                goto fail;
+
+            flag = read_uint8(p);
+            (void)flag;
+
+            CHECK_BUF(p, p_end, 16);
+            wasm_runtime_read_v128(p, &high,  &low);
+            p += 16;
+
+            init_expr->u.v128.i64x2[0] = high;
+            init_expr->u.v128.i64x2[1] = low;
+            break;
+        }
+#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */
+#endif /* end of WASM_ENABLE_SIMD */
         /* get_global */
         case INIT_EXPR_TYPE_GET_GLOBAL:
             read_leb_uint32(p, p_end, init_expr->u.global_index);
@@ -1794,7 +1834,13 @@ load_function_section(const uint8 *buf, const uint8 *buf_end,
                 CHECK_BUF(p_code, buf_code_end, 1);
                 /* 0x7F/0x7E/0x7D/0x7C */
                 type = read_uint8(p_code);
-                if (type < VALUE_TYPE_F64 || type > VALUE_TYPE_I32) {
+                if ((type < VALUE_TYPE_F64 || type > VALUE_TYPE_I32)
+#if WASM_ENABLE_SIMD != 0
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+                    && type != VALUE_TYPE_V128
+#endif
+#endif
+                        ) {
                     set_error_buf(error_buf, error_buf_size,
                                   "invalid local type");
                     return false;
@@ -2031,6 +2077,12 @@ load_export_section(const uint8 *buf, const uint8 *buf_end, WASMModule *module,
                                       "unknown function");
                         return false;
                     }
+#if WASM_ENABLE_SIMD != 0
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+                    /* TODO: check func type, if it has v128 param or result,
+                             report error */
+#endif
+#endif
                     break;
                 /*table index*/
                 case EXPORT_KIND_TABLE:
@@ -3529,6 +3581,81 @@ wasm_loader_find_block_addr(BlockAddr *block_addr_cache,
                 }
                 break;
             }
+
+#if WASM_ENABLE_SIMD != 0
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+            case WASM_OP_SIMD_PREFIX:
+            {
+                opcode = read_uint8(p);
+                if (SIMD_i8x16_eq <= opcode
+                    && opcode <= SIMD_f32x4_convert_i32x4_u) {
+                    break;
+                }
+
+                switch (opcode) {
+                    case SIMD_v128_load:
+                    case SIMD_i16x8_load8x8_s:
+                    case SIMD_i16x8_load8x8_u:
+                    case SIMD_i32x4_load16x4_s:
+                    case SIMD_i32x4_load16x4_u:
+                    case SIMD_i64x2_load32x2_s:
+                    case SIMD_i64x2_load32x2_u:
+                    case SIMD_v8x16_load_splat:
+                    case SIMD_v16x8_load_splat:
+                    case SIMD_v32x4_load_splat:
+                    case SIMD_v64x2_load_splat:
+                    case SIMD_v128_store:
+                        skip_leb_uint32(p, p_end); /* align */
+                        skip_leb_uint32(p, p_end); /* offset */
+                        break;
+
+                    case SIMD_v128_const:
+                    case SIMD_v8x16_shuffle:
+                        CHECK_BUF1(p, p_end, 16);
+                        p += 16;
+                        break;
+
+                    case SIMD_v8x16_swizzle:
+                    case SIMD_i8x16_splat:
+                    case SIMD_i16x8_splat:
+                    case SIMD_i32x4_splat:
+                    case SIMD_i64x2_splat:
+                    case SIMD_f32x4_splat:
+                    case SIMD_f64x2_splat:
+                        break;
+
+                    case SIMD_i8x16_extract_lane_s:
+                    case SIMD_i8x16_extract_lane_u:
+                    case SIMD_i8x16_replace_lane:
+                    case SIMD_i16x8_extract_lane_s:
+                    case SIMD_i16x8_extract_lane_u:
+                    case SIMD_i16x8_replace_lane:
+                    case SIMD_i32x4_extract_lane:
+                    case SIMD_i32x4_replace_lane:
+                    case SIMD_i64x2_extract_lane:
+                    case SIMD_i64x2_replace_lane:
+                    case SIMD_f32x4_extract_lane:
+                    case SIMD_f32x4_replace_lane:
+                    case SIMD_f64x2_extract_lane:
+                    case SIMD_f64x2_replace_lane:
+                        CHECK_BUF(p, p_end, 1);
+                        p++;
+                        break;
+
+                    default:
+                        LOG_WARNING("WASM loader find block addr failed: "
+                                    "invalid opcode fd 0x%02x.", opcode);
+                        if (error_buf)
+                            snprintf(error_buf, error_buf_size,
+                                     "WASM loader find block addr failed: "
+                                     "invalid opcode fd %02x.", opcode);
+                        return false;
+                }
+                break;
+            }
+#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */
+#endif /* end of WASM_ENABLE_SIMD */
+
 #if WASM_ENABLE_SHARED_MEMORY != 0
             case WASM_OP_ATOMIC_PREFIX:
             {
@@ -3545,6 +3672,7 @@ wasm_loader_find_block_addr(BlockAddr *block_addr_cache,
                 break;
             }
 #endif
+
             default:
                 set_error_buf_v(error_buf, error_buf_size,
                                 "%s %02x",
@@ -3565,6 +3693,10 @@ fail:
 #define REF_I64_2 VALUE_TYPE_I64
 #define REF_F64_1 VALUE_TYPE_F64
 #define REF_F64_2 VALUE_TYPE_F64
+#define REF_V128_1 VALUE_TYPE_V128
+#define REF_V128_2 VALUE_TYPE_V128
+#define REF_V128_3 VALUE_TYPE_V128
+#define REF_V128_4 VALUE_TYPE_V128
 #define REF_ANY   VALUE_TYPE_ANY
 
 #if WASM_ENABLE_FAST_INTERP != 0
@@ -3775,12 +3907,18 @@ static bool
 check_stack_top_values(uint8 *frame_ref, int32 stack_cell_num, uint8 type,
                        char *error_buf, uint32 error_buf_size)
 {
-    char *type_str[] = { "f64", "f32", "i64", "i32" };
+    char *type_str[] = { "v128", "f64", "f32", "i64", "i32" };
 
     if (((type == VALUE_TYPE_I32 || type == VALUE_TYPE_F32)
          && stack_cell_num < 1)
         || ((type == VALUE_TYPE_I64 || type == VALUE_TYPE_F64)
-            && stack_cell_num < 2)) {
+            && stack_cell_num < 2)
+#if WASM_ENABLE_SIMD != 0
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+        || (type == VALUE_TYPE_V128 && stack_cell_num < 4)
+#endif
+#endif
+        ) {
         set_error_buf(error_buf, error_buf_size,
                       "type mismatch: expect data but stack was empty");
         return false;
@@ -3793,10 +3931,20 @@ check_stack_top_values(uint8 *frame_ref, int32 stack_cell_num, uint8 type,
                 || *(frame_ref - 1) != REF_I64_2))
         || (type == VALUE_TYPE_F64
             && (*(frame_ref - 2) != REF_F64_1
-                || *(frame_ref - 1) != REF_F64_2))) {
+                || *(frame_ref - 1) != REF_F64_2))
+#if WASM_ENABLE_SIMD != 0
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+        || (type == VALUE_TYPE_V128
+            && (*(frame_ref - 4) != REF_V128_1
+                || *(frame_ref - 3) != REF_V128_2
+                || *(frame_ref - 2) != REF_V128_3
+                || *(frame_ref - 1) != REF_V128_4))
+#endif
+#endif
+        ) {
         set_error_buf_v(error_buf, error_buf_size, "%s%s%s",
                         "type mismatch: expect ",
-                        type_str[type - VALUE_TYPE_F64],
+                        type_str[type - VALUE_TYPE_V128],
                         " but got other");
         return false;
     }
@@ -3922,6 +4070,23 @@ wasm_loader_push_frame_ref(WASMLoaderContext *ctx, uint8 type,
     ctx->stack_cell_num++;
     if (ctx->stack_cell_num > ctx->max_stack_cell_num)
         ctx->max_stack_cell_num = ctx->stack_cell_num;
+
+#if WASM_ENABLE_SIMD != 0
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+    if (type == VALUE_TYPE_V128) {
+        if (!check_stack_push(ctx, error_buf, error_buf_size))
+            return false;
+        *ctx->frame_ref++ = type;
+        ctx->stack_cell_num++;
+        if (!check_stack_push(ctx, error_buf, error_buf_size))
+            return false;
+        *ctx->frame_ref++ = type;
+        ctx->stack_cell_num++;
+        if (ctx->stack_cell_num > ctx->max_stack_cell_num)
+            ctx->max_stack_cell_num = ctx->stack_cell_num;
+    }
+#endif
+#endif
     return true;
 }
 
@@ -3954,6 +4119,15 @@ wasm_loader_pop_frame_ref(WASMLoaderContext *ctx, uint8 type,
 
     ctx->frame_ref--;
     ctx->stack_cell_num--;
+
+#if WASM_ENABLE_SIMD != 0
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+    if (type == VALUE_TYPE_V128) {
+        ctx->frame_ref -= 2;
+        ctx->stack_cell_num -= 2;
+    }
+#endif
+#endif
     return true;
 }
 
@@ -4713,6 +4887,13 @@ fail:
         goto fail;                                                      \
   } while (0)
 
+/* Push a v128 operand through wasm_loader_push_frame_ref_offset() and
+   jump to the enclosing `fail` label if that call fails. Expects
+   loader_ctx, disable_emit, operand_offset, error_buf and
+   error_buf_size to be in scope at the expansion site.
+   NOTE(review): presumably the fast-interpreter variant of this macro,
+   as a second PUSH_V128 is defined further below - confirm the
+   surrounding #if. */
+#define PUSH_V128() do {                                                \
+    if (!(wasm_loader_push_frame_ref_offset(loader_ctx, VALUE_TYPE_V128,\
+                                            disable_emit, operand_offset,\
+                                            error_buf, error_buf_size)))\
+        goto fail;                                                      \
+  } while (0)
+
 #define POP_I32() do {                                                  \
     if (!wasm_loader_pop_frame_ref_offset(loader_ctx, VALUE_TYPE_I32,   \
                                           error_buf, error_buf_size))   \
@@ -4737,6 +4918,12 @@ fail:
         goto fail;                                                      \
   } while (0)
 
+/* Pop a v128 operand through wasm_loader_pop_frame_ref_offset() and
+   jump to the enclosing `fail` label if that call fails. Expects
+   loader_ctx, error_buf and error_buf_size to be in scope at the
+   expansion site. */
+#define POP_V128() do {                                                 \
+    if (!wasm_loader_pop_frame_ref_offset(loader_ctx, VALUE_TYPE_V128,  \
+                                          error_buf, error_buf_size))   \
+        goto fail;                                                      \
+  } while (0)
+
 #define PUSH_OFFSET_TYPE(type) do {                                     \
     if (!(wasm_loader_push_frame_offset(loader_ctx, type,               \
                                         disable_emit, operand_offset,   \
@@ -4793,6 +4980,12 @@ fail:
         goto fail;                                                  \
   } while (0)
 
+/* Push a v128 operand through wasm_loader_push_frame_ref() and jump to
+   the enclosing `fail` label if that call fails. Expects loader_ctx,
+   error_buf and error_buf_size to be in scope at the expansion site. */
+#define PUSH_V128() do {                                            \
+    if (!(wasm_loader_push_frame_ref(loader_ctx, VALUE_TYPE_V128,   \
+                                     error_buf, error_buf_size)))   \
+        goto fail;                                                  \
+  } while (0)
+
 #define POP_I32() do {                                              \
     if (!(wasm_loader_pop_frame_ref(loader_ctx, VALUE_TYPE_I32,     \
                                     error_buf, error_buf_size)))    \
@@ -4817,6 +5010,12 @@ fail:
         goto fail;                                                  \
   } while (0)
 
+/* Pop a v128 operand through wasm_loader_pop_frame_ref() and jump to
+   the enclosing `fail` label if that call fails. Expects loader_ctx,
+   error_buf and error_buf_size to be in scope at the expansion site. */
+#define POP_V128() do {                                             \
+    if (!(wasm_loader_pop_frame_ref(loader_ctx, VALUE_TYPE_V128,    \
+                                    error_buf, error_buf_size)))    \
+        goto fail;                                                  \
+  } while (0)
+
 #define POP_AND_PUSH(type_pop, type_push) do {                           \
     if (!(wasm_loader_push_pop_frame_ref(loader_ctx, 1,                  \
                                          type_push, type_pop,            \
@@ -5054,8 +5253,8 @@ check_memory_access_align(uint8 opcode, uint32 align,
                           char *error_buf, uint32 error_buf_size)
 {
     uint8 mem_access_aligns[] = {
-       2, 3, 2, 3, 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, /* loads */
-       2, 3, 2, 3, 0, 1, 0, 1, 2                 /* stores */
+        2, 3, 2, 3, 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, /* loads */
+        2, 3, 2, 3, 0, 1, 0, 1, 2                 /* stores */
     };
     bh_assert(opcode >= WASM_OP_I32_LOAD
               && opcode <= WASM_OP_I64_STORE32);
@@ -5067,6 +5266,92 @@ check_memory_access_align(uint8 opcode, uint32 align,
     return true;
 }
 
+#if WASM_ENABLE_SIMD != 0
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+/*
+ * Validate the alignment immediate of a SIMD load/store instruction.
+ *
+ * @param opcode the opcode that follows the SIMD prefix; must lie in the
+ *               range SIMD_v128_load .. SIMD_v128_store
+ * @param align the alignment immediate decoded from the bytecode
+ * @param error_buf buffer handed to set_error_buf() on failure
+ * @param error_buf_size size of error_buf
+ *
+ * @return true if align does not exceed the limit recorded for opcode,
+ *         false otherwise (an error message has been set)
+ */
+static bool
+check_simd_memory_access_align(uint8 opcode, uint32 align,
+                               char *error_buf, uint32 error_buf_size)
+{
+    /* Largest accepted align value per opcode, indexed by
+       (opcode - SIMD_v128_load). The table is read-only, so it is kept
+       static const instead of being rebuilt on the stack on every call.
+       NOTE(review): the values look like log2 of the access size in
+       bytes - confirm against the SIMD proposal. */
+    static const uint8 mem_access_aligns[] = {
+        4,  /* load */
+        3, 3, 3, 3, 3, 3,  /* load and extend */
+        0, 1, 2, 3, /* load and splat */
+        4, /* store */
+    };
+
+    /* Only the memory access opcodes have an entry in the table */
+    bh_assert(opcode <= SIMD_v128_store);
+
+    if (align > mem_access_aligns[opcode - SIMD_v128_load]) {
+        set_error_buf(error_buf, error_buf_size,
+                      "alignment must not be larger than natural");
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Validate the lane immediate of an extract_lane/replace_lane instruction.
+ *
+ * @param opcode the opcode that follows the SIMD prefix
+ * @param lane the lane index immediate decoded from the bytecode
+ * @param error_buf buffer handed to set_error_buf() on failure
+ * @param error_buf_size size of error_buf
+ *
+ * @return true if opcode is a lane access instruction and lane is below
+ *         the lane count of its vector shape, false otherwise (the error
+ *         "invalid lane index" has been set)
+ */
+static bool
+check_simd_access_lane(uint8 opcode, uint8 lane,
+                       char *error_buf, uint32 error_buf_size)
+{
+    /* Lane count of the shape addressed by opcode. It stays 0 for any
+       opcode that is not a lane access, which makes the bounds check
+       below reject every lane value for such opcodes. */
+    uint8 lane_count = 0;
+
+    switch (opcode) {
+        case SIMD_i8x16_extract_lane_s:
+        case SIMD_i8x16_extract_lane_u:
+        case SIMD_i8x16_replace_lane:
+            lane_count = 16;
+            break;
+        case SIMD_i16x8_extract_lane_s:
+        case SIMD_i16x8_extract_lane_u:
+        case SIMD_i16x8_replace_lane:
+            lane_count = 8;
+            break;
+        case SIMD_i32x4_extract_lane:
+        case SIMD_i32x4_replace_lane:
+        case SIMD_f32x4_extract_lane:
+        case SIMD_f32x4_replace_lane:
+            lane_count = 4;
+            break;
+        case SIMD_i64x2_extract_lane:
+        case SIMD_i64x2_replace_lane:
+        case SIMD_f64x2_extract_lane:
+        case SIMD_f64x2_replace_lane:
+            lane_count = 2;
+            break;
+        default:
+            break;
+    }
+
+    if (lane >= lane_count) {
+        set_error_buf(error_buf, error_buf_size, "invalid lane index");
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Validate the 16 lane selectors carried by a shuffle mask.
+ *
+ * @param mask the 16-byte immediate, inspected through its i8x16 view
+ * @param error_buf buffer handed to set_error_buf() on failure
+ * @param error_buf_size size of error_buf
+ *
+ * @return true if every selector lies in [0, 32), false otherwise (the
+ *         error "invalid lane index" has been set)
+ */
+static bool
+check_simd_shuffle_mask(V128 mask,
+                       char *error_buf,
+                       uint32 error_buf_size)
+{
+    uint8 lane_idx = 0;
+
+    while (lane_idx < 16) {
+        /* NOTE(review): the upper bound of 32 presumably covers the
+           lanes of the two 16-lane source vectors - confirm */
+        if (!(mask.i8x16[lane_idx] >= 0 && mask.i8x16[lane_idx] < 32)) {
+            set_error_buf(error_buf, error_buf_size, "invalid lane index");
+            return false;
+        }
+        lane_idx++;
+    }
+
+    return true;
+}
+#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */
+#endif /* end of WASM_ENABLE_SIMD */
+
 #if WASM_ENABLE_SHARED_MEMORY != 0
 static bool
 check_memory_align_equal(uint8 opcode, uint32 align,
@@ -5104,6 +5389,7 @@ is_value_type(uint8 type)
            type == VALUE_TYPE_I64 ||
            type == VALUE_TYPE_F32 ||
            type == VALUE_TYPE_F64 ||
+           type == VALUE_TYPE_V128 ||
            type == VALUE_TYPE_VOID;
 }
 
@@ -5892,7 +6178,7 @@ handle_op_block_and_loop:
 #if WASM_ENABLE_TAIL_CALL != 0
                 }
                 else {
-                    char *type_str[] = { "f64", "f32", "i64", "i32" };
+                    char *type_str[] = { "v128", "f64", "f32", "i64", "i32" };
                     uint8 type;
                     if (func_type->result_count != func->func_type->result_count) {
                         set_error_buf_v(error_buf, error_buf_size,
@@ -5906,7 +6192,7 @@ handle_op_block_and_loop:
                         if (func_type->types[func_type->param_count + i] != type) {
                             set_error_buf_v(error_buf, error_buf_size,
                                             "%s%s%s", "type mismatch: expect ",
-                                            type_str[type - VALUE_TYPE_F64],
+                                            type_str[type - VALUE_TYPE_V128],
                                             " but got other");
                             goto fail;
                         }
@@ -5982,7 +6268,7 @@ handle_op_block_and_loop:
 #if WASM_ENABLE_TAIL_CALL != 0
                 }
                 else {
-                    char *type_str[] = { "f64", "f32", "i64", "i32" };
+                    char *type_str[] = { "v128", "f64", "f32", "i64", "i32" };
                     uint8 type;
                     if (func_type->result_count != func->func_type->result_count) {
                         set_error_buf_v(error_buf, error_buf_size,
@@ -5996,7 +6282,7 @@ handle_op_block_and_loop:
                         if (func_type->types[func_type->param_count + i] != type) {
                             set_error_buf_v(error_buf, error_buf_size,
                                             "%s%s%s", "type mismatch: expect ",
-                                            type_str[type - VALUE_TYPE_F64],
+                                            type_str[type - VALUE_TYPE_V128],
                                             " but got other");
                             goto fail;
                         }
@@ -6037,7 +6323,8 @@ handle_op_block_and_loop:
                             loader_ctx->dynamic_offset --;
 #endif
                     }
-                    else {
+                    else if (*(loader_ctx->frame_ref - 1) == REF_I64_1
+                             || *(loader_ctx->frame_ref - 1) == REF_F64_1) {
                         loader_ctx->frame_ref -= 2;
                         loader_ctx->stack_cell_num -= 2;
 #if (WASM_ENABLE_FAST_INTERP == 0) || (WASM_ENABLE_JIT != 0)
@@ -6051,6 +6338,10 @@ handle_op_block_and_loop:
                             loader_ctx->dynamic_offset -= 2;
 #endif
                     }
+                    else { /* V128 */
+                        loader_ctx->frame_ref -= 4;
+                        loader_ctx->stack_cell_num -= 4;
+                    }
                 }
                 else {
 #if WASM_ENABLE_FAST_INTERP != 0
@@ -6889,6 +7180,376 @@ fail_data_cnt_sec_require:
                 }
                 break;
             }
+
+#if WASM_ENABLE_SIMD != 0
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+            case WASM_OP_SIMD_PREFIX:
+            {
+                uint8 lane;
+
+                opcode = read_uint8(p);
+                switch (opcode) {
+                    case SIMD_v128_load:
+                    case SIMD_i16x8_load8x8_s:
+                    case SIMD_i16x8_load8x8_u:
+                    case SIMD_i32x4_load16x4_s:
+                    case SIMD_i32x4_load16x4_u:
+                    case SIMD_i64x2_load32x2_s:
+                    case SIMD_i64x2_load32x2_u:
+                    case SIMD_v8x16_load_splat:
+                    case SIMD_v16x8_load_splat:
+                    case SIMD_v32x4_load_splat:
+                    case SIMD_v64x2_load_splat:
+                    {
+                        CHECK_MEMORY();
+
+                        read_leb_uint32(p, p_end, align); /* align */
+                        if (!check_simd_memory_access_align(
+                              opcode, align, error_buf, error_buf_size)) {
+                            goto fail;
+                        }
+
+                        read_leb_uint32(p, p_end, mem_offset); /* offset */
+
+                        /* pop(i32 %i), push(v128 *result) */
+                        POP_AND_PUSH(VALUE_TYPE_I32, VALUE_TYPE_V128);
+                        break;
+                    }
+
+                    case SIMD_v128_store:
+                    {
+                        CHECK_MEMORY();
+
+                        read_leb_uint32(p, p_end, align); /* align */
+                        if (!check_simd_memory_access_align(
+                              opcode, align, error_buf, error_buf_size)) {
+                            goto fail;
+                        }
+
+                        read_leb_uint32(p, p_end, mem_offset); /* offset */
+
+                        /* pop(v128 %value) */
+                        POP_V128();
+                        /* pop(i32 %i) */
+                        POP_I32();
+                        break;
+                    }
+
+                    case SIMD_v128_const:
+                        CHECK_BUF1(p, p_end, 16);
+                        p += 16;
+                        PUSH_V128();
+                        break;
+
+                    case SIMD_v8x16_shuffle:
+                    {
+                        V128 mask;
+
+                        CHECK_BUF1(p, p_end, 16);
+                        mask = read_i8x16(p, error_buf, error_buf_size);
+                        p += 16;
+                        if (!check_simd_shuffle_mask(mask, error_buf,
+                                                     error_buf_size)) {
+                            goto fail;
+                        }
+
+                        POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+                    }
+
+                    case SIMD_v8x16_swizzle:
+                        POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+
+                    case SIMD_i8x16_splat:
+                    case SIMD_i16x8_splat:
+                    case SIMD_i32x4_splat:
+                        POP_AND_PUSH(VALUE_TYPE_I32, VALUE_TYPE_V128);
+                        break;
+                    case SIMD_i64x2_splat:
+                        POP_AND_PUSH(VALUE_TYPE_I64, VALUE_TYPE_V128);
+                        break;
+                    case SIMD_f32x4_splat:
+                        POP_AND_PUSH(VALUE_TYPE_F32, VALUE_TYPE_V128);
+                        break;
+                    case SIMD_f64x2_splat:
+                        POP_AND_PUSH(VALUE_TYPE_F64, VALUE_TYPE_V128);
+                        break;
+
+                    case SIMD_i8x16_extract_lane_s:
+                    case SIMD_i8x16_extract_lane_u:
+                    case SIMD_i16x8_extract_lane_s:
+                    case SIMD_i16x8_extract_lane_u:
+                    case SIMD_i32x4_extract_lane:
+                        CHECK_BUF(p, p_end, 1);
+                        lane = read_uint8(p);
+
+                        if (!check_simd_access_lane(opcode, lane, error_buf,
+                                                    error_buf_size)) {
+                            goto fail;
+                        }
+
+                        POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_I32);
+                        break;
+                    case SIMD_i64x2_extract_lane:
+                        CHECK_BUF(p, p_end, 1);
+                        lane = read_uint8(p);
+
+                        if (!check_simd_access_lane(opcode, lane, error_buf,
+                                                    error_buf_size)) {
+                            goto fail;
+                        }
+
+                        POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_I64);
+                        break;
+                    case SIMD_f32x4_extract_lane:
+                        CHECK_BUF(p, p_end, 1);
+                        lane = read_uint8(p);
+
+                        if (!check_simd_access_lane(opcode, lane, error_buf,
+                                                    error_buf_size)) {
+                            goto fail;
+                        }
+
+                        POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_F32);
+                        break;
+                    case SIMD_f64x2_extract_lane:
+                        CHECK_BUF(p, p_end, 1);
+                        lane = read_uint8(p);
+
+                        if (!check_simd_access_lane(opcode, lane, error_buf,
+                                                    error_buf_size)) {
+                            goto fail;
+                        }
+
+                        POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_F64);
+                        break;
+                    case SIMD_i8x16_replace_lane:
+                    case SIMD_i16x8_replace_lane:
+                    case SIMD_i32x4_replace_lane:
+                        CHECK_BUF(p, p_end, 1);
+                        lane = read_uint8(p);
+
+                        if (!check_simd_access_lane(opcode, lane, error_buf,
+                                                    error_buf_size)) {
+                            goto fail;
+                        }
+
+                        POP_I32();
+                        POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+                    case SIMD_i64x2_replace_lane:
+                        CHECK_BUF(p, p_end, 1);
+                        lane = read_uint8(p);
+
+                        if (!check_simd_access_lane(opcode, lane, error_buf,
+                                                    error_buf_size)) {
+                            goto fail;
+                        }
+
+                        POP_I64();
+                        POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+                    case SIMD_f32x4_replace_lane:
+                        CHECK_BUF(p, p_end, 1);
+                        lane = read_uint8(p);
+
+                        if (!check_simd_access_lane(opcode, lane, error_buf,
+                                                    error_buf_size)) {
+                            goto fail;
+                        }
+
+                        POP_F32();
+                        POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+                    case SIMD_f64x2_replace_lane:
+                        CHECK_BUF(p, p_end, 1);
+                        lane = read_uint8(p);
+
+                        if (!check_simd_access_lane(opcode, lane, error_buf,
+                                                    error_buf_size)) {
+                            goto fail;
+                        }
+
+                        POP_F64();
+                        POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+                    case SIMD_i8x16_eq:
+                    case SIMD_i8x16_ne:
+                    case SIMD_i8x16_lt_s:
+                    case SIMD_i8x16_lt_u:
+                    case SIMD_i8x16_gt_s:
+                    case SIMD_i8x16_gt_u:
+                    case SIMD_i8x16_le_s:
+                    case SIMD_i8x16_le_u:
+                    case SIMD_i8x16_ge_s:
+                    case SIMD_i8x16_ge_u:
+                    case SIMD_i16x8_eq:
+                    case SIMD_i16x8_ne:
+                    case SIMD_i16x8_lt_s:
+                    case SIMD_i16x8_lt_u:
+                    case SIMD_i16x8_gt_s:
+                    case SIMD_i16x8_gt_u:
+                    case SIMD_i16x8_le_s:
+                    case SIMD_i16x8_le_u:
+                    case SIMD_i16x8_ge_s:
+                    case SIMD_i16x8_ge_u:
+                    case SIMD_i32x4_eq:
+                    case SIMD_i32x4_ne:
+                    case SIMD_i32x4_lt_s:
+                    case SIMD_i32x4_lt_u:
+                    case SIMD_i32x4_gt_s:
+                    case SIMD_i32x4_gt_u:
+                    case SIMD_i32x4_le_s:
+                    case SIMD_i32x4_le_u:
+                    case SIMD_i32x4_ge_s:
+                    case SIMD_i32x4_ge_u:
+                    case SIMD_f32x4_eq:
+                    case SIMD_f32x4_ne:
+                    case SIMD_f32x4_lt:
+                    case SIMD_f32x4_gt:
+                    case SIMD_f32x4_le:
+                    case SIMD_f32x4_ge:
+                    case SIMD_f64x2_eq:
+                    case SIMD_f64x2_ne:
+                    case SIMD_f64x2_lt:
+                    case SIMD_f64x2_gt:
+                    case SIMD_f64x2_le:
+                    case SIMD_f64x2_ge:
+                        POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+
+                    case SIMD_v128_not:
+                    case SIMD_i8x16_abs:
+                    case SIMD_i8x16_neg:
+                    case SIMD_i16x8_abs:
+                    case SIMD_i16x8_neg:
+                    case SIMD_i32x4_abs:
+                    case SIMD_i32x4_neg:
+                    case SIMD_i64x2_neg:
+                    case SIMD_f32x4_abs:
+                    case SIMD_f32x4_neg:
+                    case SIMD_f32x4_sqrt:
+                    case SIMD_f64x2_abs:
+                    case SIMD_f64x2_neg:
+                    case SIMD_f64x2_sqrt:
+                    case SIMD_i16x8_widen_low_i8x16_s:
+                    case SIMD_i16x8_widen_high_i8x16_s:
+                    case SIMD_i16x8_widen_low_i8x16_u:
+                    case SIMD_i16x8_widen_high_i8x16_u:
+                    case SIMD_i32x4_widen_low_i16x8_s:
+                    case SIMD_i32x4_widen_high_i16x8_s:
+                    case SIMD_i32x4_widen_low_i16x8_u:
+                    case SIMD_i32x4_widen_high_i16x8_u:
+                    case SIMD_i32x4_trunc_sat_f32x4_s:
+                    case SIMD_i32x4_trunc_sat_f32x4_u:
+                    case SIMD_f32x4_convert_i32x4_s:
+                    case SIMD_f32x4_convert_i32x4_u:
+                        POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+
+                    case SIMD_v128_bitselect:
+                        POP_V128();
+                        POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+
+                    case SIMD_i8x16_any_true:
+                    case SIMD_i8x16_all_true:
+                    case SIMD_i8x16_bitmask:
+                    case SIMD_i16x8_any_true:
+                    case SIMD_i16x8_all_true:
+                    case SIMD_i16x8_bitmask:
+                    case SIMD_i32x4_any_true:
+                    case SIMD_i32x4_all_true:
+                    case SIMD_i32x4_bitmask:
+                        POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_I32);
+                        break;
+
+                    case SIMD_i8x16_shl:
+                    case SIMD_i8x16_shr_s:
+                    case SIMD_i8x16_shr_u:
+                    case SIMD_i16x8_shl:
+                    case SIMD_i16x8_shr_s:
+                    case SIMD_i16x8_shr_u:
+                    case SIMD_i32x4_shl:
+                    case SIMD_i32x4_shr_s:
+                    case SIMD_i32x4_shr_u:
+                    case SIMD_i64x2_shl:
+                    case SIMD_i64x2_shr_s:
+                    case SIMD_i64x2_shr_u:
+                        POP_I32();
+                        POP_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+
+                    case SIMD_i8x16_narrow_i16x8_s:
+                    case SIMD_i8x16_narrow_i16x8_u:
+                    case SIMD_i16x8_narrow_i32x4_s:
+                    case SIMD_i16x8_narrow_i32x4_u:
+                    case SIMD_v128_and:
+                    case SIMD_v128_andnot:
+                    case SIMD_v128_or:
+                    case SIMD_v128_xor:
+                    case SIMD_i8x16_add:
+                    case SIMD_i8x16_add_saturate_s:
+                    case SIMD_i8x16_add_saturate_u:
+                    case SIMD_i8x16_sub:
+                    case SIMD_i8x16_sub_saturate_s:
+                    case SIMD_i8x16_sub_saturate_u:
+                    case SIMD_i8x16_min_s:
+                    case SIMD_i8x16_min_u:
+                    case SIMD_i8x16_max_s:
+                    case SIMD_i8x16_max_u:
+                    case SIMD_i8x16_avgr_u:
+                    case SIMD_i16x8_add:
+                    case SIMD_i16x8_add_saturate_s:
+                    case SIMD_i16x8_add_saturate_u:
+                    case SIMD_i16x8_sub:
+                    case SIMD_i16x8_sub_saturate_s:
+                    case SIMD_i16x8_sub_saturate_u:
+                    case SIMD_i16x8_mul:
+                    case SIMD_i16x8_min_s:
+                    case SIMD_i16x8_min_u:
+                    case SIMD_i16x8_max_s:
+                    case SIMD_i16x8_max_u:
+                    case SIMD_i16x8_avgr_u:
+                    case SIMD_i32x4_add:
+                    case SIMD_i32x4_sub:
+                    case SIMD_i32x4_mul:
+                    case SIMD_i32x4_min_s:
+                    case SIMD_i32x4_min_u:
+                    case SIMD_i32x4_max_s:
+                    case SIMD_i32x4_max_u:
+                    case SIMD_i64x2_add:
+                    case SIMD_i64x2_sub:
+                    case SIMD_i64x2_mul:
+                    case SIMD_f32x4_add:
+                    case SIMD_f32x4_sub:
+                    case SIMD_f32x4_mul:
+                    case SIMD_f32x4_div:
+                    case SIMD_f32x4_min:
+                    case SIMD_f32x4_max:
+                    case SIMD_f64x2_add:
+                    case SIMD_f64x2_sub:
+                    case SIMD_f64x2_mul:
+                    case SIMD_f64x2_div:
+                    case SIMD_f64x2_min:
+                    case SIMD_f64x2_max:
+                        POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
+                        break;
+
+                    default:
+                        if (error_buf != NULL) {
+                            snprintf(error_buf, error_buf_size,
+                                    "WASM module load failed: "
+                                    "invalid opcode 0xfd %02x.", opcode);
+                        }
+                        goto fail;
+                }
+                break;
+            }
+#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */
+#endif /* end of WASM_ENABLE_SIMD */
+
 #if WASM_ENABLE_SHARED_MEMORY != 0
             case WASM_OP_ATOMIC_PREFIX:
             {
@@ -7031,6 +7692,7 @@ fail_data_cnt_sec_require:
                 break;
             }
 #endif /* end of WASM_ENABLE_SHARED_MEMORY */
+
             default:
                 set_error_buf_v(error_buf, error_buf_size,
                                 "%s %02x",

+ 215 - 0
core/iwasm/interpreter/wasm_opcode.h

@@ -263,6 +263,7 @@ typedef enum WASMOpcode {
 
     /* Post-MVP extend op prefix */
     WASM_OP_MISC_PREFIX           = 0xfc,
+    WASM_OP_SIMD_PREFIX           = 0xfd,
     WASM_OP_ATOMIC_PREFIX         = 0xfe,
 } WASMOpcode;
 
@@ -286,6 +287,220 @@ typedef enum WASMMiscEXTOpcode {
 #endif
 } WASMMiscEXTOpcode;
 
+/* Opcodes of the SIMD instructions, i.e. the byte that follows the
+   WASM_OP_SIMD_PREFIX (0xfd) byte in the bytecode. The values are not
+   contiguous: gaps in the numbering are opcodes not defined by this
+   enum. */
+typedef enum WASMSimdEXTOpcode {
+    /* memory instruction */
+    SIMD_v128_load        = 0x00,
+    SIMD_i16x8_load8x8_s  = 0x01,
+    SIMD_i16x8_load8x8_u  = 0x02,
+    SIMD_i32x4_load16x4_s = 0x03,
+    SIMD_i32x4_load16x4_u = 0x04,
+    SIMD_i64x2_load32x2_s = 0x05,
+    SIMD_i64x2_load32x2_u = 0x06,
+    SIMD_v8x16_load_splat = 0x07,
+    SIMD_v16x8_load_splat = 0x08,
+    SIMD_v32x4_load_splat = 0x09,
+    SIMD_v64x2_load_splat = 0x0a,
+    SIMD_v128_store       = 0x0b,
+
+    /* basic operation */
+    SIMD_v128_const       = 0x0c,
+    SIMD_v8x16_shuffle    = 0x0d,
+    SIMD_v8x16_swizzle    = 0x0e,
+
+    /* splat operation */
+    SIMD_i8x16_splat      = 0x0f,
+    SIMD_i16x8_splat      = 0x10,
+    SIMD_i32x4_splat      = 0x11,
+    SIMD_i64x2_splat      = 0x12,
+    SIMD_f32x4_splat      = 0x13,
+    SIMD_f64x2_splat      = 0x14,
+
+    /* lane operation */
+    SIMD_i8x16_extract_lane_s = 0x15,
+    SIMD_i8x16_extract_lane_u = 0x16,
+    SIMD_i8x16_replace_lane   = 0x17,
+    SIMD_i16x8_extract_lane_s = 0x18,
+    SIMD_i16x8_extract_lane_u = 0x19,
+    SIMD_i16x8_replace_lane   = 0x1a,
+    SIMD_i32x4_extract_lane   = 0x1b,
+    SIMD_i32x4_replace_lane   = 0x1c,
+    SIMD_i64x2_extract_lane   = 0x1d,
+    SIMD_i64x2_replace_lane   = 0x1e,
+    SIMD_f32x4_extract_lane   = 0x1f,
+    SIMD_f32x4_replace_lane   = 0x20,
+    SIMD_f64x2_extract_lane   = 0x21,
+    SIMD_f64x2_replace_lane   = 0x22,
+
+    /* i8x16 compare operation */
+    SIMD_i8x16_eq    = 0x23,
+    SIMD_i8x16_ne    = 0x24,
+    SIMD_i8x16_lt_s  = 0x25,
+    SIMD_i8x16_lt_u  = 0x26,
+    SIMD_i8x16_gt_s  = 0x27,
+    SIMD_i8x16_gt_u  = 0x28,
+    SIMD_i8x16_le_s  = 0x29,
+    SIMD_i8x16_le_u  = 0x2a,
+    SIMD_i8x16_ge_s  = 0x2b,
+    SIMD_i8x16_ge_u  = 0x2c,
+
+    /* i16x8 compare operation */
+    SIMD_i16x8_eq    = 0x2d,
+    SIMD_i16x8_ne    = 0x2e,
+    SIMD_i16x8_lt_s  = 0x2f,
+    SIMD_i16x8_lt_u  = 0x30,
+    SIMD_i16x8_gt_s  = 0x31,
+    SIMD_i16x8_gt_u  = 0x32,
+    SIMD_i16x8_le_s  = 0x33,
+    SIMD_i16x8_le_u  = 0x34,
+    SIMD_i16x8_ge_s  = 0x35,
+    SIMD_i16x8_ge_u  = 0x36,
+
+    /* i32x4 compare operation */
+    SIMD_i32x4_eq    = 0x37,
+    SIMD_i32x4_ne    = 0x38,
+    SIMD_i32x4_lt_s  = 0x39,
+    SIMD_i32x4_lt_u  = 0x3a,
+    SIMD_i32x4_gt_s  = 0x3b,
+    SIMD_i32x4_gt_u  = 0x3c,
+    SIMD_i32x4_le_s  = 0x3d,
+    SIMD_i32x4_le_u  = 0x3e,
+    SIMD_i32x4_ge_s  = 0x3f,
+    SIMD_i32x4_ge_u  = 0x40,
+
+    /* f32x4 compare operation */
+    SIMD_f32x4_eq    = 0x41,
+    SIMD_f32x4_ne    = 0x42,
+    SIMD_f32x4_lt    = 0x43,
+    SIMD_f32x4_gt    = 0x44,
+    SIMD_f32x4_le    = 0x45,
+    SIMD_f32x4_ge    = 0x46,
+
+    /* f64x2 compare operation */
+    SIMD_f64x2_eq    = 0x47,
+    SIMD_f64x2_ne    = 0x48,
+    SIMD_f64x2_lt    = 0x49,
+    SIMD_f64x2_gt    = 0x4a,
+    SIMD_f64x2_le    = 0x4b,
+    SIMD_f64x2_ge    = 0x4c,
+
+    /* v128 operation */
+    SIMD_v128_not    = 0x4d,
+    SIMD_v128_and    = 0x4e,
+    SIMD_v128_andnot = 0x4f,
+    SIMD_v128_or     = 0x50,
+    SIMD_v128_xor    = 0x51,
+    SIMD_v128_bitselect = 0x52,
+
+    /* i8x16 Operation */
+    SIMD_i8x16_abs            = 0x60,
+    SIMD_i8x16_neg            = 0x61,
+    SIMD_i8x16_any_true       = 0x62,
+    SIMD_i8x16_all_true       = 0x63,
+    SIMD_i8x16_bitmask        = 0x64,
+    SIMD_i8x16_narrow_i16x8_s = 0x65,
+    SIMD_i8x16_narrow_i16x8_u = 0x66,
+    SIMD_i8x16_shl            = 0x6b,
+    SIMD_i8x16_shr_s          = 0x6c,
+    SIMD_i8x16_shr_u          = 0x6d,
+    SIMD_i8x16_add            = 0x6e,
+    SIMD_i8x16_add_saturate_s = 0x6f,
+    SIMD_i8x16_add_saturate_u = 0x70,
+    SIMD_i8x16_sub            = 0x71,
+    SIMD_i8x16_sub_saturate_s = 0x72,
+    SIMD_i8x16_sub_saturate_u = 0x73,
+    SIMD_i8x16_min_s          = 0x76,
+    SIMD_i8x16_min_u          = 0x77,
+    SIMD_i8x16_max_s          = 0x78,
+    SIMD_i8x16_max_u          = 0x79,
+    SIMD_i8x16_avgr_u         = 0x7b,
+
+    /* i16x8 operation */
+    SIMD_i16x8_abs            = 0x80,
+    SIMD_i16x8_neg            = 0x81,
+    SIMD_i16x8_any_true       = 0x82,
+    SIMD_i16x8_all_true       = 0x83,
+    SIMD_i16x8_bitmask        = 0x84,
+    SIMD_i16x8_narrow_i32x4_s = 0x85,
+    SIMD_i16x8_narrow_i32x4_u = 0x86,
+    SIMD_i16x8_widen_low_i8x16_s  = 0x87,
+    SIMD_i16x8_widen_high_i8x16_s = 0x88,
+    SIMD_i16x8_widen_low_i8x16_u  = 0x89,
+    SIMD_i16x8_widen_high_i8x16_u = 0x8a,
+    SIMD_i16x8_shl            = 0x8b,
+    SIMD_i16x8_shr_s          = 0x8c,
+    SIMD_i16x8_shr_u          = 0x8d,
+    SIMD_i16x8_add            = 0x8e,
+    SIMD_i16x8_add_saturate_s = 0x8f,
+    SIMD_i16x8_add_saturate_u = 0x90,
+    SIMD_i16x8_sub            = 0x91,
+    SIMD_i16x8_sub_saturate_s = 0x92,
+    SIMD_i16x8_sub_saturate_u = 0x93,
+    SIMD_i16x8_mul            = 0x95,
+    SIMD_i16x8_min_s          = 0x96,
+    SIMD_i16x8_min_u          = 0x97,
+    SIMD_i16x8_max_s          = 0x98,
+    SIMD_i16x8_max_u          = 0x99,
+    SIMD_i16x8_avgr_u         = 0x9b,
+
+    /* i32x4 operation */
+    SIMD_i32x4_abs            = 0xa0,
+    SIMD_i32x4_neg            = 0xa1,
+    SIMD_i32x4_any_true       = 0xa2,
+    SIMD_i32x4_all_true       = 0xa3,
+    SIMD_i32x4_bitmask        = 0xa4,
+    SIMD_i32x4_widen_low_i16x8_s  = 0xa7,
+    SIMD_i32x4_widen_high_i16x8_s = 0xa8,
+    SIMD_i32x4_widen_low_i16x8_u  = 0xa9,
+    SIMD_i32x4_widen_high_i16x8_u = 0xaa,
+    SIMD_i32x4_shl            = 0xab,
+    SIMD_i32x4_shr_s          = 0xac,
+    SIMD_i32x4_shr_u          = 0xad,
+    SIMD_i32x4_add            = 0xae,
+    SIMD_i32x4_sub            = 0xb1,
+    SIMD_i32x4_mul            = 0xb5,
+    SIMD_i32x4_min_s          = 0xb6,
+    SIMD_i32x4_min_u          = 0xb7,
+    SIMD_i32x4_max_s          = 0xb8,
+    SIMD_i32x4_max_u          = 0xb9,
+
+    /* i64x2 operation */
+    SIMD_i64x2_neg    = 0xc1,
+    SIMD_i64x2_shl    = 0xcb,
+    SIMD_i64x2_shr_s  = 0xcc,
+    SIMD_i64x2_shr_u  = 0xcd,
+    SIMD_i64x2_add    = 0xce,
+    SIMD_i64x2_sub    = 0xd1,
+    SIMD_i64x2_mul    = 0xd5,
+
+    /* f32x4 operation */
+    SIMD_f32x4_abs    = 0xe0,
+    SIMD_f32x4_neg    = 0xe1,
+    SIMD_f32x4_sqrt   = 0xe3,
+    SIMD_f32x4_add    = 0xe4,
+    SIMD_f32x4_sub    = 0xe5,
+    SIMD_f32x4_mul    = 0xe6,
+    SIMD_f32x4_div    = 0xe7,
+    SIMD_f32x4_min    = 0xe8,
+    SIMD_f32x4_max    = 0xe9,
+
+    /* f64x2 operation */
+    SIMD_f64x2_abs    = 0xec,
+    SIMD_f64x2_neg    = 0xed,
+    SIMD_f64x2_sqrt   = 0xef,
+    SIMD_f64x2_add    = 0xf0,
+    SIMD_f64x2_sub    = 0xf1,
+    SIMD_f64x2_mul    = 0xf2,
+    SIMD_f64x2_div    = 0xf3,
+    SIMD_f64x2_min    = 0xf4,
+    SIMD_f64x2_max    = 0xf5,
+
+    /* conversion operation */
+    SIMD_i32x4_trunc_sat_f32x4_s = 0xf8,
+    SIMD_i32x4_trunc_sat_f32x4_u = 0xf9,
+    SIMD_f32x4_convert_i32x4_s   = 0xfa,
+    SIMD_f32x4_convert_i32x4_u   = 0xfb,
+} WASMSimdEXTOpcode;
+
 typedef enum WASMAtomicEXTOpcode {
     /* atomic wait and notify */
     WASM_OP_ATOMIC_NOTIFY               = 0x00,

+ 192 - 0
core/iwasm/libraries/libc-emcc/libc_emcc_wrapper.c

@@ -23,6 +23,90 @@
 #define REG_NATIVE_FUNC(func_name, signature)  \
     { #func_name, func_name##_wrapper, signature, NULL }
 
+extern bool
+wasm_runtime_call_indirect(wasm_exec_env_t exec_env,
+                           uint32 element_idx,
+                           uint32 argc, uint32 argv[]);
+
+/**
+ * Native symbol "invoke_viiii": packs four int arguments into a uint32
+ * argument array and hands them, together with elem_idx as the element
+ * index, to wasm_runtime_call_indirect().
+ *
+ * The wrapper returns nothing, so the success flag of the indirect call
+ * is deliberately discarded.
+ */
+static void
+invoke_viiii_wrapper(wasm_exec_env_t exec_env, uint32 elem_idx,
+                     int arg0, int arg1, int arg2, int arg3)
+{
+    uint32 call_args[4] = { (uint32)arg0, (uint32)arg1,
+                            (uint32)arg2, (uint32)arg3 };
+
+    (void)wasm_runtime_call_indirect(exec_env, elem_idx, 4, call_args);
+}
+
+/**
+ * Native symbol "invoke_viii": packs three int arguments into a uint32
+ * argument array and hands them, together with elem_idx as the element
+ * index, to wasm_runtime_call_indirect().
+ *
+ * The wrapper returns nothing, so the success flag of the indirect call
+ * is deliberately discarded.
+ */
+static void
+invoke_viii_wrapper(wasm_exec_env_t exec_env, uint32 elem_idx,
+                    int arg0, int arg1, int arg2)
+{
+    /* four cells are reserved, only the first three carry arguments */
+    uint32 call_args[4] = { (uint32)arg0, (uint32)arg1, (uint32)arg2 };
+
+    (void)wasm_runtime_call_indirect(exec_env, elem_idx, 3, call_args);
+}
+
+/**
+ * Native symbol "invoke_vii": packs two int arguments into a uint32
+ * argument array and hands them, together with elem_idx as the element
+ * index, to wasm_runtime_call_indirect().
+ *
+ * The wrapper returns nothing, so the success flag of the indirect call
+ * is deliberately discarded.
+ */
+static void
+invoke_vii_wrapper(wasm_exec_env_t exec_env,
+                   uint32 elem_idx, int arg0, int arg1)
+{
+    /* four cells are reserved, only the first two carry arguments */
+    uint32 call_args[4] = { (uint32)arg0, (uint32)arg1 };
+
+    (void)wasm_runtime_call_indirect(exec_env, elem_idx, 2, call_args);
+}
+
+/**
+ * Native symbol "invoke_vi": places one int argument into a uint32
+ * argument array and hands it, together with elem_idx as the element
+ * index, to wasm_runtime_call_indirect().
+ *
+ * The wrapper returns nothing, so the success flag of the indirect call
+ * is deliberately discarded.
+ */
+static void
+invoke_vi_wrapper(wasm_exec_env_t exec_env,
+                  uint32 elem_idx, int arg0)
+{
+    /* four cells are reserved, only the first one carries an argument */
+    uint32 call_args[4] = { (uint32)arg0 };
+
+    (void)wasm_runtime_call_indirect(exec_env, elem_idx, 1, call_args);
+}
+
+/**
+ * Native symbol "invoke_iii": passes two int arguments to
+ * wasm_runtime_call_indirect() with elem_idx as the element index.
+ *
+ * @return the first cell of the argument array after a successful call,
+ *         or 0 when the indirect call reports failure
+ */
+static int
+invoke_iii_wrapper(wasm_exec_env_t exec_env,
+                   uint32 elem_idx, int arg0, int arg1)
+{
+    /* four cells are reserved, only the first two carry arguments */
+    uint32 call_args[4] = { (uint32)arg0, (uint32)arg1 };
+
+    if (!wasm_runtime_call_indirect(exec_env, elem_idx, 2, call_args))
+        return 0;
+
+    /* the result is read back from the first cell of the same array */
+    return (int)call_args[0];
+}
+
+/**
+ * Native symbol "invoke_ii": passes one int argument to
+ * wasm_runtime_call_indirect() with elem_idx as the element index.
+ *
+ * @return the first cell of the argument array after a successful call,
+ *         or 0 when the indirect call reports failure
+ */
+static int
+invoke_ii_wrapper(wasm_exec_env_t exec_env,
+                  uint32 elem_idx, int arg0)
+{
+    /* four cells are reserved, only the first one carries an argument */
+    uint32 call_args[4] = { (uint32)arg0 };
+
+    if (!wasm_runtime_call_indirect(exec_env, elem_idx, 1, call_args))
+        return 0;
+
+    /* the result is read back from the first cell of the same array */
+    return (int)call_args[0];
+}
+
 struct timespec_emcc {
     int tv_sec;
     int tv_nsec;
@@ -174,10 +258,111 @@ getentropy_wrapper(wasm_exec_env_t exec_env, void *buffer, uint32 length)
     return getentropy(buffer, length);
 }
 
+#if !defined(BH_PLATFORM_LINUX_SGX)
+static FILE *file_list[32] = { 0 };
+
+/**
+ * Find the first unused entry of file_list.
+ *
+ * @return index of the first NULL entry, or -1 when every entry is in use
+ */
+static int
+get_free_file_slot()
+{
+    const unsigned int slot_count =
+        (unsigned int)(sizeof(file_list) / sizeof(file_list[0]));
+    unsigned int slot = 0;
+
+    /* advance past occupied entries */
+    while (slot < slot_count && file_list[slot] != NULL)
+        slot++;
+
+    return slot < slot_count ? (int)slot : -1;
+}
+
+/**
+ * fopen() wrapper: opens a host file and records it in file_list.
+ *
+ * The value returned is not a host pointer but a small integer handle,
+ * namely the file_list index plus one. Handles therefore start at 1,
+ * which keeps 0 available as the failure value, matching fopen()'s
+ * NULL-on-failure convention.
+ *
+ * @param exec_env the execution environment (unused here)
+ * @param pathname path of the file to open
+ * @param mode     fopen() mode string
+ *
+ * @return a handle >= 1 on success; 0 if an argument is NULL, no free
+ *         slot is left, or the host fopen() fails
+ */
+static int
+fopen_wrapper(wasm_exec_env_t exec_env,
+              const char *pathname,
+              const char *mode)
+{
+    FILE *file;
+    int file_id;
+
+    if (pathname == NULL || mode == NULL)
+        return 0;
+
+    if ((file_id = get_free_file_slot()) == -1)
+        return 0;
+
+    /* Do not hand out a handle for a file that could not be opened;
+       the slot stays free because nothing is stored in it. */
+    file = fopen(pathname, mode);
+    if (file == NULL)
+        return 0;
+
+    file_list[file_id] = file;
+    return file_id + 1;
+}
+
+/**
+ * fread() wrapper: reads from the host stream identified by file_id.
+ *
+ * file_id is a handle as returned by fopen_wrapper (file_list index
+ * plus one).
+ *
+ * @return the item count reported by fread(), or 0 when the handle is
+ *         out of range or refers to an empty slot
+ */
+static uint32
+fread_wrapper(wasm_exec_env_t exec_env,
+              void *ptr, uint32 size, uint32 nmemb, int file_id)
+{
+    /* the unsigned conversion also rejects handles below 1 */
+    const unsigned int slot = (unsigned int)(file_id - 1);
+    FILE *stream = NULL;
+
+    if (slot < sizeof(file_list) / sizeof(file_list[0]))
+        stream = file_list[slot];
+
+    return stream != NULL ? (uint32)fread(ptr, size, nmemb, stream) : 0;
+}
+
+/**
+ * fwrite() wrapper: writes to the host stream identified by file_id.
+ *
+ * file_id is a handle as returned by fopen_wrapper (file_list index
+ * plus one).
+ *
+ * @return the item count reported by fwrite(), or 0 when the handle is
+ *         out of range or refers to an empty slot
+ */
+static uint32
+emcc_fwrite_wrapper(wasm_exec_env_t exec_env,
+                    const void *ptr, uint32 size, uint32 nmemb,
+                    int file_id)
+{
+    /* the unsigned conversion also rejects handles below 1 */
+    const unsigned int slot = (unsigned int)(file_id - 1);
+    FILE *stream = NULL;
+
+    if (slot < sizeof(file_list) / sizeof(file_list[0]))
+        stream = file_list[slot];
+
+    return stream != NULL ? (uint32)fwrite(ptr, size, nmemb, stream) : 0;
+}
+
+/**
+ * feof() wrapper for the host stream identified by file_id.
+ *
+ * file_id is a handle as returned by fopen_wrapper (file_list index
+ * plus one).
+ *
+ * @return the result of feof(), or 1 when the handle is out of range or
+ *         refers to an empty slot
+ */
+static int
+feof_wrapper(wasm_exec_env_t exec_env, int file_id)
+{
+    /* the unsigned conversion also rejects handles below 1 */
+    const unsigned int slot = (unsigned int)(file_id - 1);
+    FILE *stream = NULL;
+
+    if (slot < sizeof(file_list) / sizeof(file_list[0]))
+        stream = file_list[slot];
+
+    return stream != NULL ? feof(stream) : 1;
+}
+
+/**
+ * fclose() wrapper: closes the host stream identified by file_id and
+ * frees its file_list slot.
+ *
+ * file_id is a handle as returned by fopen_wrapper (file_list index
+ * plus one).
+ *
+ * @return the result of fclose(), or -1 when the handle is out of range
+ *         or refers to an empty slot
+ */
+static int
+fclose_wrapper(wasm_exec_env_t exec_env, int file_id)
+{
+    /* the unsigned conversion also rejects handles below 1 */
+    const unsigned int slot = (unsigned int)(file_id - 1);
+    FILE *stream = NULL;
+
+    if (slot < sizeof(file_list) / sizeof(file_list[0]))
+        stream = file_list[slot];
+
+    if (stream == NULL)
+        return -1;
+
+    /* mark the slot as free before closing the stream */
+    file_list[slot] = NULL;
+    return fclose(stream);
+}
+#endif /* end of BH_PLATFORM_LINUX_SGX */
+
 #define REG_NATIVE_FUNC(func_name, signature)  \
     { #func_name, func_name##_wrapper, signature, NULL }
 
 static NativeSymbol native_symbols_libc_emcc[] = {
+    REG_NATIVE_FUNC(invoke_viiii, "(iiiii)"),
+    REG_NATIVE_FUNC(invoke_viii, "(iiii)"),
+    REG_NATIVE_FUNC(invoke_vii, "(iii)"),
+    REG_NATIVE_FUNC(invoke_vi, "(ii)"),
+    REG_NATIVE_FUNC(invoke_iii, "(iii)i"),
+    REG_NATIVE_FUNC(invoke_ii, "(ii)i"),
     REG_NATIVE_FUNC(open, "($ii)i"),
     REG_NATIVE_FUNC(__sys_read, "(i*~)i"),
     REG_NATIVE_FUNC(__sys_stat64, "($*)i"),
@@ -186,6 +371,13 @@ static NativeSymbol native_symbols_libc_emcc[] = {
     REG_NATIVE_FUNC(munmap, "(ii)i"),
     REG_NATIVE_FUNC(__munmap, "(ii)i"),
     REG_NATIVE_FUNC(getentropy, "(*~)i"),
+#if !defined(BH_PLATFORM_LINUX_SGX)
+    REG_NATIVE_FUNC(fopen, "($$)i"),
+    REG_NATIVE_FUNC(fread, "(*iii)i"),
+    REG_NATIVE_FUNC(emcc_fwrite, "(*iii)i"),
+    REG_NATIVE_FUNC(feof, "(i)i"),
+    REG_NATIVE_FUNC(fclose, "(i)i"),
+#endif /* end of BH_PLATFORM_LINUX_SGX */
 };
 
 uint32

+ 4 - 0
doc/build_wamr.md

@@ -86,6 +86,10 @@ Currently we only profile the memory consumption of module, module_instance and
 #### **Enable tail call feature**
 - **WAMR_BUILD_TAIL_CALL**=1/0, default to disable if not set
 
+#### **Enable 128-bit SIMD feature**
+- **WAMR_BUILD_SIMD**=1/0, default to disable if not set
+> Note: only supported in AOT mode, and the *--enable-simd* flag should be added for wamrc when generating aot file.
+
 **Combination of configurations:**
 
 We can combine the configurations. For example, if we want to disable interpreter, enable AOT and WASI, we can run command:

+ 5 - 0
product-mini/platforms/linux/CMakeLists.txt

@@ -75,6 +75,11 @@ if (NOT DEFINED WAMR_BUILD_MINI_LOADER)
   set (WAMR_BUILD_MINI_LOADER 0)
 endif ()
 
+if (NOT DEFINED WAMR_BUILD_SIMD)
+  # Disable SIMD by default
+  set (WAMR_BUILD_SIMD 0)
+endif ()
+
 if (COLLECT_CODE_COVERAGE EQUAL 1)
   set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-arcs -ftest-coverage")
 endif ()

+ 5 - 0
samples/basic/build.sh

@@ -1,3 +1,8 @@
+#
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+
 #!/bin/bash
 
 CURR_DIR=$PWD

+ 5 - 0
samples/gui/build.sh

@@ -1,3 +1,8 @@
+#
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+
 #!/bin/bash
 
 PROJECT_DIR=$PWD

+ 5 - 0
samples/littlevgl/build.sh

@@ -1,3 +1,8 @@
+#
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+
 #!/bin/bash
 
 PROJECT_DIR=$PWD

+ 5 - 0
samples/simple/build.sh

@@ -1,3 +1,8 @@
+#
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+
 #!/bin/bash
 
 CURR_DIR=$PWD

+ 34 - 0
samples/workload/README.md

@@ -0,0 +1,34 @@
+All workloads have similar software dependency requirements, which include
+**wasi-sdk**, **clang-11**, **emsdk**, **wabt** and **binaryen**.
+
+> The steps might be slightly different on macOS or on Linux distributions other than Ubuntu. This document only targets
+Ubuntu 18.04 as an example.
+
+## Installation instructions
+
+- **wasi-sdk**. Install
+  [latest release](https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-11/wasi-sdk-11.0-linux.tar.gz)
+  in */opt/wasi-sdk* or */opt/wasi-sdk-11*
+
+- **wabt**. Install
+  [latest release](https://github.com/WebAssembly/wabt/releases/download/1.0.19/wabt-1.0.19-ubuntu.tar.gz)
+  in */opt/wabt* or */opt/wabt-1.0.19*
+
+- **clang-11**. Refer to [the guide](https://apt.llvm.org/).
+
+- **emsdk**. Refer to [the guide](https://emscripten.org/docs/getting_started/downloads.html). Don't forget to activate
+  emsdk and set up environment variables. Verify it with `echo ${EMSDK}`.
+
+- **libclang_rt.builtins-wasm32.a**. *wasi* has its own private runtime library. Put it under the clang search path:
+
+``` shell
+# copy it
+$ cp -r /opt/wasi-sdk-11.0/lib/clang/10.0.0/lib/wasi /usr/lib/llvm-11/lib/clang/11.0.0/lib/
+
+# or just link it
+$ ln -sf /opt/wasi-sdk-11.0/lib/clang/10.0.0/lib/wasi/ /usr/lib/llvm-11/lib/clang/11.0.0/lib/
+```
+
+- **binaryen**. Install
+  [latest release](https://github.com/WebAssembly/binaryen/releases/download/version_97/binaryen-version_97-x86_64-linux.tar.gz)
+  in */opt/binaryen* or */opt/binaryen-version_97*

+ 4 - 0
samples/workload/bwa/.gitignore

@@ -0,0 +1,4 @@
+build
+libz
+bwa
+include

+ 134 - 0
samples/workload/bwa/CMakeLists.bwa_wasm.txt

@@ -0,0 +1,134 @@
+# Copyright (C) 2019 Intel Corporation. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+cmake_minimum_required (VERSION 3.0)
+
+project(bwa_wasm C)
+
+################ LIBZ ################
+set(LIBZ_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../libz)
+add_library(z_wasm STATIC
+  ${LIBZ_SRC_DIR}/adler32.c
+  ${LIBZ_SRC_DIR}/compress.c
+  ${LIBZ_SRC_DIR}/crc32.c
+  ${LIBZ_SRC_DIR}/deflate.c
+  ${LIBZ_SRC_DIR}/gzclose.c
+  ${LIBZ_SRC_DIR}/gzlib.c
+  ${LIBZ_SRC_DIR}/gzread.c
+  ${LIBZ_SRC_DIR}/gzwrite.c
+  ${LIBZ_SRC_DIR}/infback.c
+  ${LIBZ_SRC_DIR}/inffast.c
+  ${LIBZ_SRC_DIR}/inflate.c
+  ${LIBZ_SRC_DIR}/inftrees.c
+  ${LIBZ_SRC_DIR}/trees.c
+  ${LIBZ_SRC_DIR}/uncompr.c
+  ${LIBZ_SRC_DIR}/zutil.c
+)
+
+set_target_properties(z_wasm PROPERTIES LINKER_LANGUAGE C)
+
+target_compile_definitions(z_wasm PRIVATE Z_HAVE_UNISTD_H _LARGEFILE64_SOURCE=1)
+
+target_compile_options(z_wasm
+  PRIVATE
+    -Wno-unused-function
+    -Wno-unused-variable
+)
+
+target_include_directories(z_wasm
+  PUBLIC
+    ${LIBZ_SRC_DIR}
+)
+
+################ BWA_WASM ################
+set(BWA_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(BWA_SOURCE
+  ${BWA_SRC_DIR}/utils.c
+  ${BWA_SRC_DIR}/kthread.c
+  ${BWA_SRC_DIR}/kstring.c
+  ${BWA_SRC_DIR}/ksw.c
+  ${BWA_SRC_DIR}/bwt.c
+  ${BWA_SRC_DIR}/bntseq.c
+  ${BWA_SRC_DIR}/bwa.c
+  ${BWA_SRC_DIR}/bwamem.c
+  ${BWA_SRC_DIR}/bwamem_pair.c
+  ${BWA_SRC_DIR}/bwamem_extra.c
+  ${BWA_SRC_DIR}/malloc_wrap.c
+  ${BWA_SRC_DIR}/QSufSort.c
+  ${BWA_SRC_DIR}/bwt_gen.c
+  ${BWA_SRC_DIR}/rope.c
+  ${BWA_SRC_DIR}/rle.c
+  ${BWA_SRC_DIR}/is.c
+  ${BWA_SRC_DIR}/bwtindex.c
+  ${BWA_SRC_DIR}/bwashm.c
+  ${BWA_SRC_DIR}/bwase.c
+  ${BWA_SRC_DIR}/bwaseqio.c
+  ${BWA_SRC_DIR}/bwtgap.c
+  ${BWA_SRC_DIR}/bwtaln.c
+  ${BWA_SRC_DIR}/bamlite.c
+  ${BWA_SRC_DIR}/bwape.c
+  ${BWA_SRC_DIR}/kopen.c
+  ${BWA_SRC_DIR}/pemerge.c
+  ${BWA_SRC_DIR}/maxk.c
+  ${BWA_SRC_DIR}/bwtsw2_core.c
+  ${BWA_SRC_DIR}/bwtsw2_main.c
+  ${BWA_SRC_DIR}/bwtsw2_aux.c
+  ${BWA_SRC_DIR}/bwt_lite.c
+  ${BWA_SRC_DIR}/bwtsw2_chain.c
+  ${BWA_SRC_DIR}/fastmap.c
+  ${BWA_SRC_DIR}/bwtsw2_pair.c
+  ${BWA_SRC_DIR}/main.c
+)
+
+add_executable(${PROJECT_NAME} ${BWA_SOURCE})
+
+set_target_properties(${PROJECT_NAME} PROPERTIES OUTPUT_NAME bwa.wasm)
+
+target_include_directories(${PROJECT_NAME}
+  PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../include
+    ${CMAKE_CURRENT_SOURCE_DIR}/../include/SSE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../include/pthread
+)
+
+target_compile_definitions(${PROJECT_NAME}
+  PRIVATE
+    USE_MALLOC_WRAPPERS
+    __SSE__ __SSE2__ __SSE4_1__
+    _WASI_EMULATED_MMAN _WASI_EMULATED_SIGNAL
+)
+
+target_compile_options(${PROJECT_NAME}
+  PRIVATE
+    -Wno-unused-function
+    -Wno-unused-variable
+)
+
+target_link_options(${PROJECT_NAME}
+  PRIVATE
+    -Wno-unused-command-line-argument
+    LINKER:--allow-undefined,--export=__heap_base,--export=__data_end
+    LINKER:-z,stack-size=1048576
+)
+
+target_link_libraries(${PROJECT_NAME} z_wasm)
+
+find_program(WASM_OPT
+    NAMES wasm-opt
+    PATHS /opt/binaryen-version_97/bin /opt/binaryen/bin
+)
+
+if (NOT WASM_OPT)
+  message(WARNING "can not find wasm-opt and will not optimize any wasm module")
+endif()
+
+add_custom_target(bwa_wasm_opt ALL
+  COMMAND
+    ${WASM_OPT} -Oz --enable-simd -o bwa.opt.wasm bwa.wasm
+  BYPRODUCTS
+    ${CMAKE_CURRENT_BINARY_DIR}/bwa.opt.wasm
+  WORKING_DIRECTORY
+    ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+add_dependencies(bwa_wasm_opt ${PROJECT_NAME})

+ 91 - 0
samples/workload/bwa/CMakeLists.txt

@@ -0,0 +1,91 @@
+# Copyright (C) 2019 Intel Corporation. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+cmake_minimum_required (VERSION 3.0)
+
+project(bwa_wasm)
+
+################  EMCC ################
+if(NOT DEFINED ENV{EMSDK})
+  message(FATAL_ERROR
+    "can not find emsdk. "
+    "please refer to https://emscripten.org/docs/getting_started/downloads.html "
+    "and install it, "
+    "or active emsdk by 'source ./emsdk_env.sh'"
+  )
+endif()
+
+################  BINARYEN ################
+find_program(WASM_OPT
+    NAMES wasm-opt
+    PATHS /opt/binaryen-version_97/bin /opt/binaryen/bin
+)
+
+if (NOT WASM_OPT)
+  message(FATAL_ERROR
+    "can not find wasm-opt. "
+    "please download it from "
+    "https://github.com/WebAssembly/binaryen/releases/download/version_97/binaryen-version_97-x86_64-linux.tar.gz "
+    "and install it under /opt"
+  )
+endif()
+
+#######################################
+include(ExternalProject)
+
+################ HEADERS ################
+ExternalProject_Add(headers_from_emcc
+   PREFIX headers
+   SOURCE_DIR "$ENV{EMSDK}/upstream/emscripten/system/include/SSE"
+   CONFIGURE_COMMAND ""
+   BUILD_COMMAND  ""
+   INSTALL_COMMAND mkdir -p ${CMAKE_CURRENT_SOURCE_DIR}/include/SSE
+                     && ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_SOURCE_DIR}/include/pthread/sys
+                     && ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_SOURCE_DIR}/include/emscripten
+                     # copy emscripten SSE header files
+                     && ${CMAKE_COMMAND} -E copy $ENV{EMSDK}/upstream/emscripten/system/include/SSE/immintrin.h ${CMAKE_CURRENT_SOURCE_DIR}/include/SSE/
+                     # SSE
+                     && ${CMAKE_COMMAND} -E copy $ENV{EMSDK}/upstream/emscripten/system/include/SSE/xmmintrin.h ${CMAKE_CURRENT_SOURCE_DIR}/include/SSE/
+                     # SSE2
+                     && ${CMAKE_COMMAND} -E copy $ENV{EMSDK}/upstream/emscripten/system/include/SSE/emmintrin.h ${CMAKE_CURRENT_SOURCE_DIR}/include/SSE/
+                     # SSE4.1
+                     && ${CMAKE_COMMAND} -E copy $ENV{EMSDK}/upstream/emscripten/system/include/SSE/smmintrin.h ${CMAKE_CURRENT_SOURCE_DIR}/include/SSE/
+                     # a fake empty header to avoid further dependencies
+                     && ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_SOURCE_DIR}/include/emscripten/emscripten.h
+                     # copy emscripten pthread related header files
+                     && ${CMAKE_COMMAND} -E copy $ENV{EMSDK}/upstream/emscripten/system/include/libc/pthread.h ${CMAKE_CURRENT_SOURCE_DIR}/include/pthread/
+                     && ${CMAKE_COMMAND} -E copy $ENV{EMSDK}/upstream/emscripten/system/include/libc/signal.h ${CMAKE_CURRENT_SOURCE_DIR}/include/pthread/
+                     && ${CMAKE_COMMAND} -E copy $ENV{EMSDK}/upstream/emscripten/system/include/libc/netdb.h ${CMAKE_CURRENT_SOURCE_DIR}/include/pthread/
+                     && ${CMAKE_COMMAND} -E copy $ENV{EMSDK}/upstream/emscripten/system/include/libc/sys/wait.h ${CMAKE_CURRENT_SOURCE_DIR}/include/pthread/sys/
+                     && ${CMAKE_COMMAND} -E copy $ENV{EMSDK}/upstream/emscripten/system/include/libc/sys/socket.h ${CMAKE_CURRENT_SOURCE_DIR}/include/pthread/sys/
+)
+
+################ libz ################
+ExternalProject_Add(libz_src
+  PREFIX libz
+  GIT_REPOSITORY https://github.com/madler/zlib.git
+  GIT_TAG        master
+  GIT_PROGRESS   ON
+  GIT_SHALLOW    ON
+  SOURCE_DIR     ${CMAKE_CURRENT_SOURCE_DIR}/libz
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND  ""
+  INSTALL_COMMAND ""
+)
+
+################ bwa ################
+ExternalProject_Add(bwa
+  PREFIX bwa
+  GIT_REPOSITORY https://github.com/lh3/bwa.git
+  GIT_TAG        master
+  GIT_PROGRESS   ON
+  GIT_SHALLOW    ON
+  SOURCE_DIR     ${CMAKE_CURRENT_SOURCE_DIR}/bwa
+  DEPENDS        libz_src headers_from_emcc
+  UPDATE_COMMAND git clean -fd && git checkout -- *
+                  && ${CMAKE_COMMAND} -E echo "Copying pre-installed CMakeLists.txt"
+                  && ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.bwa_wasm.txt CMakeLists.txt
+  CONFIGURE_COMMAND  ${CMAKE_COMMAND} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/../cmake/toolchain.cmake ${CMAKE_CURRENT_SOURCE_DIR}/bwa
+  BUILD_COMMAND  make bwa_wasm_opt
+  INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ./bwa.opt.wasm ${CMAKE_CURRENT_SOURCE_DIR}/build/bwa.wasm
+)

+ 47 - 0
samples/workload/bwa/README.md

@@ -0,0 +1,47 @@
+"bwa" sample introduction
+==============
+
+This sample demonstrates how to build [bwa](https://github.com/lh3/bwa) into
+WebAssembly with simd support and run it with iwasm.
+
+## Preparation
+
+please refer to [installation instructions](../README.md).
+
+## Build
+
+``` shell
+$ mkdir build && cd build
+$ cmake ..
+$ make
+# to verify
+$ ls bwa.wasm
+```
+
+## Download sample data
+
+Download the bwa-0.7.15 binary package from
+[this address](https://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.15_x64-linux.tar.bz2/download).
+A sample data file named **hs38DH.fa** will be used later.
+
+If you want more data, please refer to http://hgdownload.cse.ucsc.edu/goldenpath/hg19/bigZips/
+
+## Run workload
+
+Firstly please build iwasm with simd support:
+
+``` shell
+$ cd <wamr dir>/product-mini/platforms/linux/
+$ mkdir build && cd build
+$ cmake .. -DWAMR_BUILD_SIMD=1
+$ make
+```
+
+Then compile wasm file to aot file and run:
+
+``` shell
+$ cd <wamr dir>/wamr-compiler/build
+$ ./wamrc --enable-simd -o bwa.aot ./bwa.wasm
+$ cd <wamr dir>/product-mini/platforms/linux/
+$ ./iwasm --dir=. ./bwa.aot index hs38DH.fa
+```

+ 100 - 0
samples/workload/cmake/toolchain.cmake

@@ -0,0 +1,100 @@
+# Copyright (C) 2019 Intel Corporation. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+cmake_minimum_required (VERSION 3.0)
+
+if(DEFINED _WAMR_TOOLCHAIN_CMAKE_)
+  return()
+else()
+  set(_WAMR_TOOLCHAIN_CMAKE_ 1)
+endif()
+
+SET(CMAKE_SYSTEM_NAME Linux)
+
+################  COMPILER  ################
+find_program(CLANG_11 NAMES clang clang-11 REQUIRED)
+find_program(CLANG++_11 NAMES clang++ clang++-11 REQUIRED)
+
+if(NOT CLANG_11)
+  message(FATAL_ERROR "clang not found")
+else()
+  message(STATUS "use ${CLANG_11} as the c compiler")
+endif()
+
+if(NOT CLANG++_11)
+  message(FATAL_ERROR "clang++ not found")
+else()
+  message(STATUS "use ${CLANG++_11} as the c++ compiler")
+endif()
+
+set(CMAKE_C_COMPILER "${CLANG_11}" CACHE STRING "C compiler" FORCE)
+set(CMAKE_C_COMPILER_ID Clang CACHE STRING "C compiler ID" FORCE)
+
+set(CMAKE_CXX_COMPILER "${CLANG++_11}" CACHE STRING "C++ compiler" FORCE)
+set(CMAKE_CXX_COMPILER_ID Clang CACHE STRING "C++ compiler ID" FORCE)
+
+################  WASI AS SYSROOT  ################
+find_path(WASI_SYSROOT
+  wasi-sysroot
+  PATHS /opt/wasi-sdk-11.0/share /opt/wasi-sdk/share
+  REQUIRED
+)
+
+if(NOT WASI_SYSROOT)
+  message(FATAL_ERROR
+    "can not find wasi sysroot. "
+    "please download it from "
+    "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-11/wasi-sdk-11.0-linux.tar.gz "
+    "and install it under /opt"
+  )
+endif()
+
+set(CMAKE_SYSROOT ${WASI_SYSROOT}/wasi-sysroot CACHE STRING "--sysroot to compiler" FORCE)
+
+add_compile_options(
+  --target=wasm32-wasi
+  -msimd128
+  $<IF:$<CONFIG:Debug>,-O0,-O3>
+  $<$<CONFIG:Debug>:-g>
+  $<$<CONFIG:Debug>:-v>
+)
+
+################  AR  ################
+find_program(LLVM_AR NAMES llvm-ar llvm-ar-11 REQUIRED)
+
+if(NOT LLVM_AR)
+  message(FATAL_ERROR "llvm-ar not found")
+else()
+  message(STATUS "use ${LLVM_AR} as the AR")
+endif()
+
+set(CMAKE_AR "${LLVM_AR}" CACHE STRING "AR" FORCE)
+
+################  RANLIB  ################
+find_program(LLVM_RANLIB NAMES llvm-ranlib llvm-ranlib-11 REQUIRED)
+
+if(NOT LLVM_RANLIB)
+  message(FATAL_ERROR "llvm-ranlib not found")
+else()
+  message(STATUS "use ${LLVM_RANLIB} as the ranlib")
+endif()
+
+set(CMAKE_RANLIB "${LLVM_RANLIB}" CACHE STRING "RANLIB" FORCE)
+
+################  LD  ################
+find_program(WASM_LD NAMES wasm-ld wasm-ld-11 REQUIRED)
+
+if(NOT WASM_LD)
+  message(FATAL_ERROR "wasm-ld not found")
+else()
+  message(STATUS "use ${WASM_LD} as the linker")
+endif()
+
+add_link_options(
+  --target=wasm32-wasi
+  -fuse-ld=${WASM_LD}
+  LINKER:--allow-undefined
+  $<IF:$<CONFIG:Debug>,-O0,-O3>
+  $<$<CONFIG:Debug>:-g>
+  $<$<CONFIG:Debug>:-v>
+)

+ 1 - 0
samples/workload/docker/.gitignore

@@ -0,0 +1 @@
+build_scripts

+ 77 - 0
samples/workload/docker/Dockerfile

@@ -0,0 +1,77 @@
+FROM ubuntu:18.04 as builder
+
+#
+# install clang and llvm
+COPY llvm.sh /tmp
+RUN apt update \
+    && apt install -y lsb-release wget software-properties-common build-essential \
+    && cd /tmp \
+    && chmod a+x llvm.sh \
+    && ./llvm.sh 11
+
+ARG WASI_SDK_VER=11.0
+ARG WABT_VER=1.0.19
+ARG CMAKE_VER=3.16.2
+ARG BINARYEN_VER=version_97
+
+#
+# install wasi-sdk
+ARG WASI_SDK_FILE="wasi-sdk-${WASI_SDK_VER}-linux.tar.gz"
+COPY ${WASI_SDK_FILE} /opt
+RUN cd /opt \
+    && tar zxf ${WASI_SDK_FILE} \
+    && rm ${WASI_SDK_FILE} \
+    && ln -sf /opt/wasi-sdk-${WASI_SDK_VER} /opt/wasi-sdk \
+    && ln -sf /opt/wasi-sdk/lib/clang/10.0.0/lib/wasi/ /usr/lib/llvm-11/lib/clang/11.0.0/lib/
+
+#
+# install wabt
+ARG WABT_FILE="wabt-${WABT_VER}-ubuntu.tar.gz"
+COPY ${WABT_FILE} /opt
+RUN cd /opt \
+    && tar zxf ${WABT_FILE} \
+    && rm ${WABT_FILE} \
+    && ln -sf /opt/wabt-${WABT_VER} /opt/wabt
+
+#
+# install cmake
+ARG CMAKE_FILE="cmake-${CMAKE_VER}-Linux-x86_64.sh"
+COPY ${CMAKE_FILE} /tmp
+RUN cd /tmp \
+    && chmod a+x ${CMAKE_FILE} \
+    && mkdir /opt/cmake \
+    && ./${CMAKE_FILE} --prefix=/opt/cmake --skip-license  \
+    && ln -sf /opt/cmake/bin/cmake /usr/local/bin/cmake
+
+#
+# install tools
+RUN apt install -y git tree
+
+#
+# install emsdk
+RUN cd /opt \
+    && git clone https://github.com/emscripten-core/emsdk.git \
+    && cd emsdk \
+    && git pull \
+    && ./emsdk install latest \
+    && ./emsdk activate latest \
+    && echo "source /opt/emsdk/emsdk_env.sh" >> /root/.bashrc
+
+#
+# install binaryen
+ARG BINARYEN_FILE="binaryen-${BINARYEN_VER}-x86_64-linux.tar.gz"
+COPY ${BINARYEN_FILE} /opt
+RUN cd /opt \
+    && tar zxf ${BINARYEN_FILE} \
+    && rm ${BINARYEN_FILE}  \
+    && ln -sf /opt/binaryen-${BINARYEN_VER} /opt/binaryen
+
+#
+# Clean up
+RUN apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/*
+
+VOLUME /data
+WORKDIR /data

+ 48 - 0
samples/workload/docker/build.sh

@@ -0,0 +1,48 @@
+#!/bin/bash
+
+#
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+
+# Download the installers and archives that the Dockerfile copies into the
+# image (skipping any that are already present in build_scripts/), then
+# build the clang_env:0.1 image with build_scripts/ as the build context.
+#
+# The shebang has to be the very first line of the file to take effect;
+# this script relies on bash-only syntax such as [[ ... ]].
+
+if [[ ! -d build_scripts ]]; then
+    mkdir build_scripts
+fi
+
+WASI_SDK_VER=11.0
+WABT_VER=1.0.19
+CMAKE_VER=3.16.2
+BINARYEN_VER=version_97
+
+# do not download into the wrong directory if build_scripts is unusable
+cd build_scripts || exit 1
+if [[ ! -f wasi-sdk-${WASI_SDK_VER}-linux.tar.gz ]]; then
+  wget https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-11/wasi-sdk-${WASI_SDK_VER}-linux.tar.gz
+fi
+
+if [[ ! -f wabt-${WABT_VER}-ubuntu.tar.gz ]]; then
+  wget https://github.com/WebAssembly/wabt/releases/download/${WABT_VER}/wabt-${WABT_VER}-ubuntu.tar.gz
+fi
+
+if [[ ! -f llvm.sh ]]; then
+  wget https://apt.llvm.org/llvm.sh
+fi
+
+if [[ ! -f cmake-${CMAKE_VER}-Linux-x86_64.sh ]]; then
+  wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VER}/cmake-${CMAKE_VER}-Linux-x86_64.sh
+fi
+
+if [[ ! -f binaryen-${BINARYEN_VER}-x86_64-linux.tar.gz ]]; then
+  wget https://github.com/WebAssembly/binaryen/releases/download/${BINARYEN_VER}/binaryen-${BINARYEN_VER}-x86_64-linux.tar.gz
+fi
+cd -
+
+# every tool version is passed from the variables above so that the image
+# always matches the files that were downloaded
+docker build \
+  --build-arg http_proxy=${http_proxy} \
+  --build-arg https_proxy=${https_proxy} \
+  --build-arg HTTP_PROXY=${http_proxy} \
+  --build-arg HTTPS_PROXY=${https_proxy} \
+  --build-arg WASI_SDK_VER=${WASI_SDK_VER} \
+  --build-arg WABT_VER=${WABT_VER} \
+  --build-arg CMAKE_VER=${CMAKE_VER} \
+  --build-arg BINARYEN_VER=${BINARYEN_VER} \
+  -t clang_env:0.1 -f Dockerfile build_scripts

+ 10 - 0
samples/workload/docker/run.sh

@@ -0,0 +1,10 @@
+#!/bin/bash
+
+docker run --rm -it \
+  -e http_proxy=${http_proxy} \
+  -e https_proxy=${https_proxy} \
+  -e HTTP_PROXY=${http_proxy} \
+  -e HTTPS_PROXY=${htpps_proxy} \
+  --name workload_w_clang \
+  --mount type=bind,source=$(pwd)/..,target=/data \
+  clang_env:0.1

+ 2 - 0
samples/workload/meshoptimizer/.gitignore

@@ -0,0 +1,2 @@
+build
+meshoptimizer

+ 39 - 0
samples/workload/meshoptimizer/CMakeLists.txt

@@ -0,0 +1,39 @@
+# Copyright (C) 2019 Intel Corporation. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+cmake_minimum_required (VERSION 3.0)
+
+project(bench-meshoptimizer)
+
+################  BINARYEN ################
+find_program(WASM_OPT
+    NAMES wasm-opt
+    PATHS /opt/binaryen-version_97/bin /opt/binaryen/bin
+)
+
+if (NOT WASM_OPT)
+  message(FATAL_ERROR
+    "can not find wasm-opt. "
+    "please download it from "
+    "https://github.com/WebAssembly/binaryen/releases/download/version_97/binaryen-version_97-x86_64-linux.tar.gz "
+    "and install it under /opt"
+  )
+endif()
+
+################  MESHOPTIMIZER  ################
+include(ExternalProject)
+
+ExternalProject_Add(codecbench
+  PREFIX codecbench
+  GIT_REPOSITORY https://github.com/zeux/meshoptimizer.git
+  GIT_TAG        master
+  GIT_SHALLOW    ON
+  GIT_PROGRESS   ON
+  SOURCE_DIR     ${CMAKE_CURRENT_SOURCE_DIR}/meshoptimizer
+  UPDATE_COMMAND git clean -fd && git checkout -- *
+                   && ${CMAKE_COMMAND} -E echo "Applying patch"
+                   && git apply ${CMAKE_CURRENT_SOURCE_DIR}/codecbench.patch
+  CONFIGURE_COMMAND ${CMAKE_COMMAND} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/../cmake/toolchain.cmake ${CMAKE_CURRENT_SOURCE_DIR}/meshoptimizer
+  BUILD_COMMAND  make codecbench.opt
+  INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ./codecbench.opt.wasm ${CMAKE_CURRENT_SOURCE_DIR}/build/codecbench.wasm
+)

+ 59 - 0
samples/workload/meshoptimizer/README.md

@@ -0,0 +1,59 @@
+"codecbench of meshoptimizer" sample introduction
+==============
+
+This sample demonstrates how to build [codecbench of meshoptimizer](https://github.com/zeux/meshoptimizer) into
+WebAssembly with simd support and run it with iwasm.
+
+## Preparation
+
+please refer to [installation instructions](../README.md).
+
+## Build with clang-11 and wasi-sdk
+
+``` shell
+$ mkdir build && cd build
+$ cmake ..
+$ make
+# to verify
+$ ls codecbench.wasm
+```
+
+## Or build with EMCC
+
+EMCC is another toolchain for compiling C/C++ code to WASM. In this case,
+building with EMCC yields higher performance.
+
+``` shell
+$ git clone https://github.com/zeux/meshoptimizer.git
+$ cd meshoptimizer
+$ emcc tools/codecbench.cpp src/vertexcodec.cpp src/vertexfilter.cpp \
+       src/overdrawanalyzer.cpp src/indexgenerator.cpp src/vcacheoptimizer.cpp \
+       src/clusterizer.cpp src/indexcodec.cpp src/vfetchanalyzer.cpp \
+       src/spatialorder.cpp src/allocator.cpp src/vcacheanalyzer.cpp \
+       src/vfetchoptimizer.cpp src/overdrawoptimizer.cpp src/simplifier.cpp \
+       src/stripifier.cpp -O3 -msimd128 \
+       -s TOTAL_MEMORY=268435456 -s "EXPORTED_FUNCTIONS=['_main']" \
+       -o codecbench.wasm
+$ ls -l codecbench.wasm
+```
+
+## Run workload
+
+Firstly please build iwasm with simd support:
+
+``` shell
+$ cd <wamr dir>/product-mini/platforms/linux/
+$ mkdir build && cd build
+$ cmake .. -DWAMR_BUILD_SIMD=1
+$ make
+```
+
+Then compile wasm file to aot file and run:
+
+``` shell
+$ cd <wamr dir>/wamr-compiler/build
+$ ./wamrc --enable-simd -o codecbench.aot codecbench.wasm
+$ cd <wamr dir>/product-mini/platforms/linux/
+$ ./iwasm codecbench.aot
+```
+

+ 47 - 0
samples/workload/meshoptimizer/codecbench.patch

@@ -0,0 +1,47 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index eccc49e..dac126c 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -127,3 +127,42 @@ install(FILES
+     ${CMAKE_CURRENT_BINARY_DIR}/meshoptimizerConfig.cmake
+     ${CMAKE_CURRENT_BINARY_DIR}/meshoptimizerConfigVersion.cmake
+     DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/meshoptimizer)
++
++##################################################
++# codecbench
++##################################################
++add_executable(codecbench tools/codecbench.cpp ${SOURCES})
++
++set_target_properties(codecbench PROPERTIES OUTPUT_NAME codecbench.wasm)
++
++target_compile_options(codecbench
++  PUBLIC
++    -std=c++11
++    -Wno-unused-function
++    -Wno-unused-variable
++)
++
++target_link_options(codecbench
++  PUBLIC
++    LINKER:-allow-undefined,--demangle
++)
++
++find_program(WASM_OPT
++    NAMES wasm-opt
++    PATHS /opt/binaryen-version_97/bin /opt/binaryen/bin
++)
++
++if (NOT WASM_OPT)
++  message(WARNING "can not find wasm-opt and will not optimize any wasm module")
++endif()
++
++add_custom_target(codecbench.opt ALL
++  COMMAND
++    ${WASM_OPT} -Oz --enable-simd -o codecbench.opt.wasm codecbench.wasm
++  BYPRODUCTS
++    ${CMAKE_CURRENT_BINARY_DIR}/codecbench.opt.wasm
++  WORKING_DIRECTORY
++    ${CMAKE_CURRENT_BINARY_DIR}
++)
++
++add_dependencies(codecbench.opt codecbench)

+ 25 - 7
samples/workload/tensorflow/build.sh

@@ -1,8 +1,20 @@
+#
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+
 #!/bin/bash
 
 ####################################
 #   build tensorflow-lite sample   #
 ####################################
+# Abort early when emsdk has not been activated: EMSDK must name an
+# existing directory.
+if [ ! -d "${EMSDK}" ]; then
+    echo "can not find emsdk. "
+    echo "please refer to https://emscripten.org/docs/getting_started/downloads.html "
+    echo "to install it, or activate it by 'source <emsdk_dir>/emsdk_env.sh'"
+    # report the failure to the caller instead of exiting with status 0
+    exit 1
+fi
+
 set -xe
 
 EMSDK_WASM_DIR="$EM_CACHE/wasm"
@@ -64,7 +76,15 @@ fi
 if [ -d "${TF_LITE_BUILD_DIR}/gen" ]; then
     rm -fr ${TF_LITE_BUILD_DIR}/gen
 fi
-make -j 4 -C "${TENSORFLOW_DIR}" -f ${TF_LITE_BUILD_DIR}/Makefile
+if [[ $1 == '--sgx' ]]; then
+    make -j 4 -C "${TENSORFLOW_DIR}" -f ${TF_LITE_BUILD_DIR}/Makefile
+else
+    export BUILD_WITH_SIMD=true
+    make -j 4 -C "${TENSORFLOW_DIR}" -f ${TF_LITE_BUILD_DIR}/Makefile
+fi
+
+# remove patch file and recover emcc libc.a after building
+Clear_Before_Exit
 
 # 2.5 copy /make/gen target files to out/
 rm -rf ${OUT_DIR}
@@ -84,7 +104,7 @@ cd ${OUT_DIR}
 if [[ $1 == '--sgx' ]]; then
     ${WAMRC_CMD} -sgx -o benchmark_model.aot benchmark_model.wasm
 else
-    ${WAMRC_CMD} -o benchmark_model.aot benchmark_model.wasm
+    ${WAMRC_CMD} --enable-simd -o benchmark_model.aot benchmark_model.wasm
 fi
 
 # 4. build iwasm with pthread and libc_emcc enable
@@ -101,7 +121,7 @@ if [[ $1 == '--sgx' ]]; then
 else
     cd ${WAMR_PLATFORM_DIR}/linux
     rm -fr build && mkdir build
-    cd build && cmake .. -DWAMR_BUILD_LIB_PTHREAD=1 -DWAMR_BUILD_LIBC_EMCC=1
+    cd build && cmake .. -DWAMR_BUILD_SIMD=1 -DWAMR_BUILD_LIB_PTHREAD=1 -DWAMR_BUILD_LIBC_EMCC=1
     make
 fi
 
@@ -122,8 +142,6 @@ else
 fi
 
 ${IWASM_CMD} --heap-size=10475860 \
-                        ${OUT_DIR}/benchmark_model.aot \
-                        --graph=mobilenet_quant_v1_224.tflite --max_secs=300
-
-Clear_Before_Exit
+             ${OUT_DIR}/benchmark_model.aot \
+             --graph=mobilenet_quant_v1_224.tflite --max_secs=300
 

+ 9 - 3
samples/workload/tensorflow/tf_lite.patch

@@ -1,5 +1,5 @@
 diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile
-index c7ddff5844..1082644043 100644
+index c7ddff5844..17146868f7 100644
 --- a/tensorflow/lite/tools/make/Makefile
 +++ b/tensorflow/lite/tools/make/Makefile
 @@ -48,11 +48,7 @@ INCLUDES += -I/usr/local/include
@@ -15,10 +15,16 @@ index c7ddff5844..1082644043 100644
  -ldl
  
  # There are no rules for compiling objects for the host system (since we don't
-@@ -84,14 +80,18 @@ endif # ifeq ($(HOST_ARCH),$(TARGET_ARCH))
+@@ -84,14 +80,24 @@ endif # ifeq ($(HOST_ARCH),$(TARGET_ARCH))
  endif # ifeq ($(HOST_OS),$(TARGET))
  endif
  
++BUILD_WITH_SIMD ?= false
++ifeq ($(BUILD_WITH_SIMD), true)
++CFLAGS+=-msimd128
++CXXFLAGS+=-msimd128
++endif
++
 +LIBFLAGS += -s TOTAL_STACK=1048576 \
 +            -Wl,--export=__data_end -Wl,--export=__heap_base \
 +            -s ERROR_ON_UNDEFINED_SYMBOLS=0
@@ -36,7 +42,7 @@ index c7ddff5844..1082644043 100644
  
  # A small example program that shows how to link against the library.
  MINIMAL_SRCS := \
-@@ -277,12 +277,16 @@ LIB_PATH := $(LIBDIR)$(LIB_NAME)
+@@ -277,12 +283,16 @@ LIB_PATH := $(LIBDIR)$(LIB_NAME)
  BENCHMARK_LIB := $(LIBDIR)$(BENCHMARK_LIB_NAME)
  BENCHMARK_BINARY := $(BINDIR)$(BENCHMARK_BINARY_NAME)
  BENCHMARK_PERF_OPTIONS_BINARY := $(BINDIR)$(BENCHMARK_PERF_OPTIONS_BINARY_NAME)

+ 22 - 0
samples/workload/wasm-av1/README.md

@@ -0,0 +1,22 @@
+"wasm-av1" sample introduction
+==============
+This sample demonstrates how to build [wasm-av1](https://github.com/GoogleChromeLabs/wasm-av1) into WebAssembly with emcc toolchain and run it with iwasm. Please first install [emsdk](https://github.com/emscripten-core/emsdk):
+```bash
+git clone https://github.com/emscripten-core/emsdk.git
+cd emsdk
+./emsdk install latest
+./emsdk activate latest
+```
+And set up the emsdk environment:
+```bash
+source emsdk_env.sh
+```
+Then run
+```bash
+./build.sh
+```
+to build wasm-av1 and run it with iwasm, which basically contains the following steps:
+- hack emcc to delete some objects in libc.a
+- patch wasm-av1 and build it with emcc compiler
+- build wamrc and compile testavx.wasm into testavx.aot with simd enabled
+- build iwasm with simd and libc-emcc support
+- run testavx.aot with iwasm

+ 100 - 0
samples/workload/wasm-av1/build.sh

@@ -0,0 +1,100 @@
+#!/bin/bash
+#
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+
+####################################
+#   build wasm-av1 sample   #
+####################################
+if [ ! -d "${EMSDK}" ]; then
+    echo "can not find emsdk. "
+    echo "please refer to https://emscripten.org/docs/getting_started/downloads.html "
+    echo "to install it, or active it by 'source <emsdk_dir>emsdk_env.sh'"
+    exit
+fi
+
+set -xe
+
+EMSDK_WASM_DIR="$EM_CACHE/wasm"
+BUILD_SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+OUT_DIR="${BUILD_SCRIPT_DIR}/out"
+WASM_AV1_DIR="${BUILD_SCRIPT_DIR}/wasm-av1"
+
+WAMR_PLATFORM_DIR="${BUILD_SCRIPT_DIR}/../../../product-mini/platforms"
+IWASM_CMD="${WAMR_PLATFORM_DIR}/linux/build/iwasm"
+
+WAMRC_DIR="${BUILD_SCRIPT_DIR}/../../../wamr-compiler"
+WAMRC_CMD="${WAMRC_DIR}/build/wamrc"
+
+function Clear_Before_Exit
+{
+    [[ -f ${WASM_AV1_DIR}/wasm-av1.patch ]] &&
+       rm -f ${WASM_AV1_DIR}/wasm-av1.patch
+    # resume the libc.a under EMSDK_WASM_DIR
+    cd ${EMSDK_WASM_DIR}
+    mv libc.a.bak libc.a
+}
+
+# 1.hack emcc
+cd ${EMSDK_WASM_DIR}
+# back up libc.a
+cp libc.a libc.a.bak
+# delete some objects in libc.a
+emar d libc.a fopen.o
+emar d libc.a fread.o
+emar d libc.a feof.o
+emar d libc.a fclose.o
+
+# 2. build wasm-av1
+cd ${BUILD_SCRIPT_DIR}
+# 2.1 clone wasm-av1 repo from Github (skipped when a checkout already exists)
+if [ ! -d "wasm-av1" ]; then
+    git clone https://github.com/GoogleChromeLabs/wasm-av1.git
+fi
+
+# 2.2 copy the wasm-av1.patch to wasm-av1 and apply the patch
+cd ${WASM_AV1_DIR}
+cp -a ${BUILD_SCRIPT_DIR}/wasm-av1.patch .
+# discard local changes to the paths the patch touches, so that re-running
+# the script applies the patch to unmodified files
+git checkout Makefile
+git checkout test.c
+git checkout third_party/aom
+
+if [[ $(git apply wasm-av1.patch 2>&1) =~ "error" ]]; then
+    echo "git apply patch failed, please check wasm-av1 related changes..."
+    Clear_Before_Exit
+    exit 0
+fi
+
+make testavx -j 4
+
+# remove patch file and recover emcc libc.a after building
+Clear_Before_Exit
+
+# 2.3 recreate out/ and copy the built testavx.wasm into it
+rm -rf ${OUT_DIR} && mkdir ${OUT_DIR}
+cp -a ${WASM_AV1_DIR}/testavx.wasm ${OUT_DIR}/
+
+# 3. compile testavx.wasm to testavx.aot with wamrc
+# 3.1 build wamr-compiler
+cd ${WAMRC_DIR}
+./build_llvm.sh
+rm -fr build && mkdir build
+cd build && cmake ..
+make
+# 3.2 compile testavx.wasm to testavx.aot with SIMD enabled
+cd ${OUT_DIR}
+${WAMRC_CMD} --enable-simd -o testavx.aot testavx.wasm
+
+# 4. build iwasm with simd, pthread and libc_emcc enabled
+cd ${WAMR_PLATFORM_DIR}/linux
+rm -fr build && mkdir build
+cd build && cmake .. -DWAMR_BUILD_SIMD=1 -DWAMR_BUILD_LIB_PTHREAD=1 -DWAMR_BUILD_LIBC_EMCC=1
+make
+
+# 5. run wasm-av1 with iwasm
+echo "---> run testav1.aot with iwasm"
+cd ${OUT_DIR}
+${IWASM_CMD} testavx.aot ../wasm-av1/third_party/samples/elephants_dream_480p24.ivf
+

+ 696 - 0
samples/workload/wasm-av1/wasm-av1.patch

@@ -0,0 +1,696 @@
+diff --git a/Makefile b/Makefile
+index c39fff6..4682d43 100644
+--- a/Makefile
++++ b/Makefile
+@@ -59,11 +59,13 @@ $(TARGET): $(DEPS) blob-api.c yuv-to-rgb.c $(EMLIBAV1)
+ 									   ]" \
+ 				blob-api.c yuv-to-rgb.c $(SRCS) $(INC) -L $(LIBDIR) -l$(LIB)
+ 
+-$(TESTTARGET): test.c $(DEPS) $(X86LIBAV1)
+-		cc -o $@ -O3 test.c $(SRCS) $(INC) -L $(X86LIBDIR) -l$(LIB)
++$(TESTTARGET): test.c $(DEPS) $(EMLIBAV1)
++		emcc -o $@.wasm -O3 test.c $(SRCS) $(INC) -L $(LIBDIR) -l$(LIB) \
++			-s TOTAL_MEMORY=104857600 -s ERROR_ON_UNDEFINED_SYMBOLS=0
+ 
+-$(TESTTARGET)g: test.c $(DEPS) $(X86LIBAV1)
+-		cc -o $@ -g test.c $(SRCS) $(INC) -L $(X86LIBDIR) -l$(LIB)
++$(TESTTARGET)g: test.c $(DEPS) $(EMLIBAV1)
++		emcc -o $@.wasm -g test.c $(SRCS) $(INC) -L $(LIBDIR) -l$(LIB) \
++			-s TOTAL_MEMORY=104857600 -s ERROR_ON_UNDEFINED_SYMBOLS=0
+ 
+ clean:
+ 		-rm $(TARGET) $(TESTTARGET) $(TESTTARGET)g
+@@ -80,7 +82,7 @@ $(EMLIBAV1): $(LIBDIR)
+ 		        -DCONFIG_RUNTIME_CPU_DETECT=0 \
+ 		        -DCONFIG_UNIT_TESTS=0 \
+ 		        -DCONFIG_WEBM_IO=0 \
+-		        -DCMAKE_TOOLCHAIN_FILE=`../../get-emcmake.sh`; \
++		        -DCMAKE_TOOLCHAIN_FILE=${EMSDK}/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake; \
+ 			make \
+ 		)
+ 
+diff --git a/test.c b/test.c
+index df2d44b..8e81cdc 100644
+--- a/test.c
++++ b/test.c
+@@ -18,6 +18,9 @@
+ 
+ #include "decode-av1-priv.h"
+ 
++size_t
++emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++
+ static void
+ dump_raw_frame(AVX_Video_Frame *avf, int id) {
+     FILE    *f;
+@@ -26,12 +29,13 @@ dump_raw_frame(AVX_Video_Frame *avf, int id) {
+     void    *buf;
+ 
+     sprintf(name, "frame%04d.yuv", id);
++    printf("writing %s ..\n", name);
+     if ((f = fopen(name, "wb")) == NULL) {
+         return;
+     }
+     buf = AVX_Video_Frame_get_buffer(avf);
+     size = AVX_Video_Frame_get_size(avf);
+-    fwrite(buf, size, 1, f);
++    emcc_fwrite(buf, size, 1, f);
+     fclose(f);
+ }
+ 
+@@ -63,6 +67,7 @@ main(int argc, char *argv[]) {
+                     static int     i = 0;
+                     
+                     ++i;
++                    printf("##decode raw frame %d\n", i);
+                     if (30 <= i && i < 40) {
+                         dump_raw_frame(af, i);
+                     }
+diff --git a/third_party/aom/CMakeLists.txt b/third_party/aom/CMakeLists.txt
+index 9dbe301..20c7be4 100644
+--- a/third_party/aom/CMakeLists.txt
++++ b/third_party/aom/CMakeLists.txt
+@@ -56,6 +56,10 @@ option(BUILD_SHARED_LIBS "CMake should generate a shared library build." OFF)
+ 
+ project(AOM C CXX)
+ 
++set(CMAKE_C_FLAGS "-msimd128 -msse2 -msse3 -msse4.1 -msse4.2 ${CMAKE_C_FLAGS}")
++set(CMAKE_CXX_FLAGS "-msimd128 -msse2 -msse3 -msse4.1 -msse4.2 ${CMAKE_CXX_FLAGS}")
++set(CMAKE_VERBOSE_MAKEFILE on)
++
+ set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
+ set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+ set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/include"
+@@ -347,7 +351,7 @@ if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
+       em_link_post_js(inspect "${AOM_ROOT}/tools/inspect-post.js")
+       # Force generation of Wasm instead of asm.js
+       append_link_flag_to_target("inspect" "-s WASM=1")
+-      append_compiler_flag("-s WASM=1")
++      append_compiler_flag("-O3 -s WASM=1 -s ERROR_ON_UNDEFINED_SYMBOLS=0")
+     endif()
+   endif()
+ 
+diff --git a/third_party/aom/aom/src/aom_codec.c b/third_party/aom/aom/src/aom_codec.c
+index dbd6fa5..a8d2a49 100644
+--- a/third_party/aom/aom/src/aom_codec.c
++++ b/third_party/aom/aom/src/aom_codec.c
+@@ -132,6 +132,7 @@ void aom_internal_error(struct aom_internal_error_info *info,
+     info->detail[sz - 1] = '\0';
+   }
+ 
++  printf("##aom internal error: %s\n", info->detail);
+   if (info->setjmp) longjmp(info->jmp, info->error_code);
+ }
+ 
+diff --git a/third_party/aom/aom_dsp/grain_table.c b/third_party/aom/aom_dsp/grain_table.c
+index 0d6a73f..4b05833 100644
+--- a/third_party/aom/aom_dsp/grain_table.c
++++ b/third_party/aom/aom_dsp/grain_table.c
+@@ -293,6 +293,9 @@ aom_codec_err_t aom_film_grain_table_read(
+   return error_info->error_code;
+ }
+ 
++size_t
++emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++
+ aom_codec_err_t aom_film_grain_table_write(
+     const aom_film_grain_table_t *t, const char *filename,
+     struct aom_internal_error_info *error_info) {
+@@ -305,7 +308,7 @@ aom_codec_err_t aom_film_grain_table_write(
+     return error_info->error_code;
+   }
+ 
+-  if (!fwrite(kFileMagic, 8, 1, file)) {
++  if (!emcc_fwrite(kFileMagic, 8, 1, file)) {
+     aom_internal_error(error_info, AOM_CODEC_ERROR,
+                        "Unable to write file magic");
+     fclose(file);
+diff --git a/third_party/aom/aomdec.c b/third_party/aom/aomdec.c
+index 4addee8..f850147 100644
+--- a/third_party/aom/aomdec.c
++++ b/third_party/aom/aomdec.c
+@@ -274,6 +274,9 @@ static void update_image_md5(const aom_image_t *img, const int planes[3],
+   }
+ }
+ 
++size_t
++emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++
+ static void write_image_file(const aom_image_t *img, const int *planes,
+                              const int num_planes, FILE *file) {
+   int i, y;
+@@ -287,7 +290,7 @@ static void write_image_file(const aom_image_t *img, const int *planes,
+     const int h = aom_img_plane_height(img, plane);
+ 
+     for (y = 0; y < h; ++y) {
+-      fwrite(buf, bytes_per_sample, w, file);
++      emcc_fwrite(buf, bytes_per_sample, w, file);
+       buf += stride;
+     }
+   }
+diff --git a/third_party/aom/aomenc.c b/third_party/aom/aomenc.c
+index 64155b0..3ed5080 100644
+--- a/third_party/aom/aomenc.c
++++ b/third_party/aom/aomenc.c
+@@ -59,9 +59,12 @@ static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
+ }
+ #define fread wrap_fread
+ 
++size_t
++emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++
+ static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,
+                           FILE *stream) {
+-  return fwrite(ptr, size, nmemb, stream);
++  return emcc_fwrite(ptr, size, nmemb, stream);
+ }
+ #define fwrite wrap_fwrite
+ 
+diff --git a/third_party/aom/aomstats.c b/third_party/aom/aomstats.c
+index 0cfeea2..6833776 100644
+--- a/third_party/aom/aomstats.c
++++ b/third_party/aom/aomstats.c
+@@ -80,9 +80,12 @@ void stats_close(stats_io_t *stats, int last_pass) {
+   }
+ }
+ 
++size_t
++emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++
+ void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
+   if (stats->file) {
+-    (void)fwrite(pkt, 1, len, stats->file);
++    (void)emcc_fwrite(pkt, 1, len, stats->file);
+   } else {
+     if (stats->buf.sz + len > stats->buf_alloc_sz) {
+       size_t new_sz = stats->buf_alloc_sz + 64 * 1024;
+diff --git a/third_party/aom/av1/common/debugmodes.c b/third_party/aom/av1/common/debugmodes.c
+index 868f341..c44258c 100644
+--- a/third_party/aom/av1/common/debugmodes.c
++++ b/third_party/aom/av1/common/debugmodes.c
+@@ -89,10 +89,13 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
+   fclose(mvs);
+ }
+ 
++size_t
++emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++
+ void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
+                                          const char *filename) {
+   FILE *hdrFile = fopen(filename, "w");
+-  fwrite(data, size, sizeof(uint8_t), hdrFile);
++  emcc_fwrite(data, size, sizeof(uint8_t), hdrFile);
+   fclose(hdrFile);
+ }
+ 
+diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
+index a557380..d709d26 100644
+--- a/third_party/aom/av1/encoder/encoder.c
++++ b/third_party/aom/av1/encoder/encoder.c
+@@ -2799,6 +2799,9 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
+   snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
+ #endif  // CONFIG_INTERNAL_STATS
+ 
++size_t
++emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++
+ void av1_remove_compressor(AV1_COMP *cpi) {
+   AV1_COMMON *cm;
+   unsigned int i;
+@@ -2814,7 +2817,7 @@ void av1_remove_compressor(AV1_COMP *cpi) {
+     if (cpi->oxcf.pass != 1) {
+       fprintf(stderr, "Writing counts.stt\n");
+       FILE *f = fopen("counts.stt", "wb");
+-      fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f);
++      emcc_fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f);
+       fclose(f);
+     }
+ #endif  // CONFIG_ENTROPY_STATS
+@@ -3013,7 +3016,7 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
+   int h = s->y_height;
+ 
+   do {
+-    fwrite(src, s->y_width, 1, f);
++    emcc_fwrite(src, s->y_width, 1, f);
+     src += s->y_stride;
+   } while (--h);
+ 
+@@ -3021,7 +3024,7 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
+   h = s->uv_height;
+ 
+   do {
+-    fwrite(src, s->uv_width, 1, f);
++    emcc_fwrite(src, s->uv_width, 1, f);
+     src += s->uv_stride;
+   } while (--h);
+ 
+@@ -3029,7 +3032,7 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
+   h = s->uv_height;
+ 
+   do {
+-    fwrite(src, s->uv_width, 1, f);
++    emcc_fwrite(src, s->uv_width, 1, f);
+     src += s->uv_stride;
+   } while (--h);
+ }
+@@ -3121,7 +3124,7 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+     uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
+ 
+     do {
+-      fwrite(src16, s->y_width, 2, yuv_rec_file);
++      emcc_fwrite(src16, s->y_width, 2, yuv_rec_file);
+       src16 += s->y_stride;
+     } while (--h);
+ 
+@@ -3129,7 +3132,7 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+     h = s->uv_height;
+ 
+     do {
+-      fwrite(src16, s->uv_width, 2, yuv_rec_file);
++      emcc_fwrite(src16, s->uv_width, 2, yuv_rec_file);
+       src16 += s->uv_stride;
+     } while (--h);
+ 
+@@ -3137,7 +3140,7 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+     h = s->uv_height;
+ 
+     do {
+-      fwrite(src16, s->uv_width, 2, yuv_rec_file);
++      emcc_fwrite(src16, s->uv_width, 2, yuv_rec_file);
+       src16 += s->uv_stride;
+     } while (--h);
+ 
+@@ -3146,7 +3149,7 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+   }
+ 
+   do {
+-    fwrite(src, s->y_width, 1, yuv_rec_file);
++    emcc_fwrite(src, s->y_width, 1, yuv_rec_file);
+     src += s->y_stride;
+   } while (--h);
+ 
+@@ -3154,7 +3157,7 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+   h = s->uv_height;
+ 
+   do {
+-    fwrite(src, s->uv_width, 1, yuv_rec_file);
++    emcc_fwrite(src, s->uv_width, 1, yuv_rec_file);
+     src += s->uv_stride;
+   } while (--h);
+ 
+@@ -3162,7 +3165,7 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+   h = s->uv_height;
+ 
+   do {
+-    fwrite(src, s->uv_width, 1, yuv_rec_file);
++    emcc_fwrite(src, s->uv_width, 1, yuv_rec_file);
+     src += s->uv_stride;
+   } while (--h);
+ 
+@@ -3241,16 +3244,16 @@ static int dump_one_image(AV1_COMMON *cm,
+ 
+   // --- Y ---
+   for (h = 0; h < cm->height; ++h) {
+-    fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref);
++    emcc_fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref);
+   }
+   // --- U ---
+   for (h = 0; h < (cm->height >> 1); ++h) {
+-    fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
++    emcc_fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+            f_ref);
+   }
+   // --- V ---
+   for (h = 0; h < (cm->height >> 1); ++h) {
+-    fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
++    emcc_fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+            f_ref);
+   }
+ 
+@@ -4692,17 +4695,17 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) {
+ 
+   // --- Y ---
+   for (h = 0; h < cm->height; ++h) {
+-    fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width,
++    emcc_fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width,
+            f_recon);
+   }
+   // --- U ---
+   for (h = 0; h < (cm->height >> 1); ++h) {
+-    fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
++    emcc_fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+            f_recon);
+   }
+   // --- V ---
+   for (h = 0; h < (cm->height >> 1); ++h) {
+-    fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
++    emcc_fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+            f_recon);
+   }
+ 
+diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
+index bb73fde..b963043 100644
+--- a/third_party/aom/av1/encoder/firstpass.c
++++ b/third_party/aom/av1/encoder/firstpass.c
+@@ -476,6 +476,9 @@ static double raw_motion_error_stdev(int *raw_motion_err_list,
+   return raw_err_stdev;
+ }
+ 
++size_t
++emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++
+ #define UL_INTRA_THRESH 50
+ #define INVALID_ROW -1
+ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
+@@ -1077,7 +1080,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
+     else
+       recon_file = fopen(filename, "ab");
+ 
+-    (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
++    (void)emcc_fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
+     fclose(recon_file);
+   }
+ 
+diff --git a/third_party/aom/build/cmake/aom_configure.cmake b/third_party/aom/build/cmake/aom_configure.cmake
+index 9220a32..fb8bf9f 100644
+--- a/third_party/aom/build/cmake/aom_configure.cmake
++++ b/third_party/aom/build/cmake/aom_configure.cmake
+@@ -260,7 +260,7 @@ if(MSVC)
+     add_compiler_flag_if_supported("/WX")
+   endif()
+ else()
+-  require_c_flag("-std=c99" YES)
++  #require_c_flag("-std=c99" YES)
+   add_compiler_flag_if_supported("-Wall")
+   add_compiler_flag_if_supported("-Wdisabled-optimization")
+   add_compiler_flag_if_supported("-Wextra")
+diff --git a/third_party/aom/examples/resize_util.c b/third_party/aom/examples/resize_util.c
+index 5485691..e60ed86 100644
+--- a/third_party/aom/examples/resize_util.c
++++ b/third_party/aom/examples/resize_util.c
+@@ -45,6 +45,9 @@ static int parse_dim(char *v, int *width, int *height) {
+     return 1;
+ }
+ 
++size_t
++emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++
+ int main(int argc, char *argv[]) {
+   char *fin, *fout;
+   FILE *fpin, *fpout;
+@@ -111,7 +114,7 @@ int main(int argc, char *argv[]) {
+     av1_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, height,
+                         width, outbuf, target_width, outbuf_u, outbuf_v,
+                         target_width / 2, target_height, target_width);
+-    fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout);
++    emcc_fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout);
+     f++;
+   }
+   printf("%d frames processed\n", f);
+diff --git a/third_party/aom/examples/scalable_encoder.c b/third_party/aom/examples/scalable_encoder.c
+index 10d647e..fcf31e1 100644
+--- a/third_party/aom/examples/scalable_encoder.c
++++ b/third_party/aom/examples/scalable_encoder.c
+@@ -91,6 +91,9 @@ void usage_exit(void) {
+   exit(EXIT_FAILURE);
+ }
+ 
++size_t
++emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++
+ static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
+                         int frame_index, int flags, FILE *outfile) {
+   int got_pkts = 0;
+@@ -105,7 +108,7 @@ static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
+ 
+     if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+       const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+-      if (fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile) !=
++      if (emcc_fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile) !=
+           pkt->data.frame.sz) {
+         die_codec(codec, "Failed to write compressed frame");
+       }
+diff --git a/third_party/aom/ivfenc.c b/third_party/aom/ivfenc.c
+index 80f4d14..d0e4e34 100644
+--- a/third_party/aom/ivfenc.c
++++ b/third_party/aom/ivfenc.c
+@@ -14,6 +14,9 @@
+ #include "aom/aom_encoder.h"
+ #include "aom_ports/mem_ops.h"
+ 
++size_t
++emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++
+ void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg,
+                            unsigned int fourcc, int frame_cnt) {
+   char header[32];
+@@ -32,7 +35,7 @@ void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg,
+   mem_put_le32(header + 24, frame_cnt);            // length
+   mem_put_le32(header + 28, 0);                    // unused
+ 
+-  fwrite(header, 1, 32, outfile);
++  emcc_fwrite(header, 1, 32, outfile);
+ }
+ 
+ void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size) {
+@@ -41,12 +44,12 @@ void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size) {
+   mem_put_le32(header, (int)frame_size);
+   mem_put_le32(header + 4, (int)(pts & 0xFFFFFFFF));
+   mem_put_le32(header + 8, (int)(pts >> 32));
+-  fwrite(header, 1, 12, outfile);
++  emcc_fwrite(header, 1, 12, outfile);
+ }
+ 
+ void ivf_write_frame_size(FILE *outfile, size_t frame_size) {
+   char header[4];
+ 
+   mem_put_le32(header, (int)frame_size);
+-  fwrite(header, 1, 4, outfile);
++  emcc_fwrite(header, 1, 4, outfile);
+ }
+diff --git a/third_party/aom/test/decode_perf_test.cc b/third_party/aom/test/decode_perf_test.cc
+index 3c93e7d..2d364ae 100644
+--- a/third_party/aom/test/decode_perf_test.cc
++++ b/third_party/aom/test/decode_perf_test.cc
+@@ -24,6 +24,11 @@
+ 
+ using ::testing::make_tuple;
+ 
++extern "C" {
++  size_t
++  emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++}
++
+ namespace {
+ 
+ #define VIDEO_NAME 0
+@@ -153,7 +158,7 @@ class AV1NewEncodeDecodePerfTest
+ 
+     // Write frame header and data.
+     ivf_write_frame_header(outfile_, out_frames_, pkt->data.frame.sz);
+-    ASSERT_EQ(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_),
++    ASSERT_EQ(emcc_fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_),
+               pkt->data.frame.sz);
+   }
+ 
+diff --git a/third_party/aom/test/film_grain_table_test.cc b/third_party/aom/test/film_grain_table_test.cc
+index 0688146..dbb8e6b 100644
+--- a/third_party/aom/test/film_grain_table_test.cc
++++ b/third_party/aom/test/film_grain_table_test.cc
+@@ -5,6 +5,11 @@
+ #include "av1/encoder/grain_test_vectors.h"
+ #include "test/video_source.h"
+ 
++extern "C" {
++  size_t
++  emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++}
++
+ void grain_equal(const aom_film_grain_t *expected,
+                  const aom_film_grain_t *actual) {
+   EXPECT_EQ(expected->apply_grain, actual->apply_grain);
+@@ -168,7 +173,7 @@ TEST_F(FilmGrainTableIOTest, ReadTruncatedFile) {
+ 
+   std::string grain_file;
+   FILE *file = libaom_test::GetTempOutFile(&grain_file);
+-  fwrite("deadbeef", 8, 1, file);
++  emcc_fwrite("deadbeef", 8, 1, file);
+   fclose(file);
+   ASSERT_EQ(AOM_CODEC_ERROR,
+             aom_film_grain_table_read(&table, grain_file.c_str(), &error_));
+diff --git a/third_party/aom/test/resize_test.cc b/third_party/aom/test/resize_test.cc
+index e1c4e9f..9c2bce8 100644
+--- a/third_party/aom/test/resize_test.cc
++++ b/third_party/aom/test/resize_test.cc
+@@ -22,6 +22,11 @@
+ // Enable(1) or Disable(0) writing of the compressed bitstream.
+ #define WRITE_COMPRESSED_STREAM 0
+ 
++extern "C" {
++  size_t
++  emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++}
++
+ namespace {
+ 
+ #if WRITE_COMPRESSED_STREAM
+@@ -55,13 +60,13 @@ static void write_ivf_file_header(const aom_codec_enc_cfg_t *const cfg,
+   mem_put_le32(header + 24, frame_cnt);           /* length */
+   mem_put_le32(header + 28, 0);                   /* unused */
+ 
+-  (void)fwrite(header, 1, 32, outfile);
++  (void)emcc_fwrite(header, 1, 32, outfile);
+ }
+ 
+ static void write_ivf_frame_size(FILE *const outfile, const size_t size) {
+   char header[4];
+   mem_put_le32(header, static_cast<unsigned int>(size));
+-  (void)fwrite(header, 1, 4, outfile);
++  (void)emcc_fwrite(header, 1, 4, outfile);
+ }
+ 
+ static void write_ivf_frame_header(const aom_codec_cx_pkt_t *const pkt,
+@@ -76,7 +81,7 @@ static void write_ivf_frame_header(const aom_codec_cx_pkt_t *const pkt,
+   mem_put_le32(header + 4, pts & 0xFFFFFFFF);
+   mem_put_le32(header + 8, pts >> 32);
+ 
+-  (void)fwrite(header, 1, 12, outfile);
++  (void)emcc_fwrite(header, 1, 12, outfile);
+ }
+ #endif  // WRITE_COMPRESSED_STREAM
+ 
+@@ -309,7 +314,7 @@ class ResizeInternalTestLarge : public ResizeTest {
+ 
+     // Write frame header and data.
+     write_ivf_frame_header(pkt, outfile_);
+-    (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
++    (void)emcc_fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
+   }
+ #endif
+ 
+@@ -608,7 +613,7 @@ class ResizeCspTest : public ResizeTest {
+ 
+     // Write frame header and data.
+     write_ivf_frame_header(pkt, outfile_);
+-    (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
++    (void)emcc_fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
+   }
+ #endif
+ 
+diff --git a/third_party/aom/test/y4m_test.cc b/third_party/aom/test/y4m_test.cc
+index ad901d9..f24093f 100644
+--- a/third_party/aom/test/y4m_test.cc
++++ b/third_party/aom/test/y4m_test.cc
+@@ -19,6 +19,11 @@
+ #include "test/util.h"
+ #include "test/y4m_video_source.h"
+ 
++extern "C" {
++  size_t
++  emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++}
++
+ namespace {
+ 
+ using std::string;
+@@ -68,7 +73,7 @@ static void write_image_file(const aom_image_t *img, FILE *file) {
+         (plane ? (img->d_w + img->x_chroma_shift) >> img->x_chroma_shift
+                : img->d_w);
+     for (y = 0; y < h; ++y) {
+-      fwrite(buf, bytes_per_sample, w, file);
++      emcc_fwrite(buf, bytes_per_sample, w, file);
+       buf += stride;
+     }
+   }
+diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest.cc
+index 5a8932c..ac2c435 100644
+--- a/third_party/aom/third_party/googletest/src/googletest/src/gtest.cc
++++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest.cc
+@@ -146,6 +146,11 @@
+ # define vsnprintf _vsnprintf
+ #endif  // GTEST_OS_WINDOWS
+ 
++extern "C" {
++  size_t
++  emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++}
++
+ namespace testing {
+ 
+ using internal::CountIf;
+@@ -3867,7 +3872,7 @@ class ScopedPrematureExitFile {
+       // errors are ignored as there's nothing better we can do and we
+       // don't want to fail the test because of this.
+       FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
+-      fwrite("0", 1, 1, pfile);
++      emcc_fwrite("0", 1, 1, pfile);
+       fclose(pfile);
+     }
+   }
+diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.cc b/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.cc
+index 84655d8..0004093 100644
+--- a/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.cc
++++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.cc
+@@ -14,6 +14,11 @@
+ #include <share.h>  // for _SH_DENYWR
+ #endif
+ 
++extern "C" {
++  size_t
++  emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++}
++
+ namespace mkvmuxer {
+ 
+ MkvWriter::MkvWriter() : file_(NULL), writer_owns_file_(true) {}
+@@ -32,7 +37,7 @@ int32 MkvWriter::Write(const void* buffer, uint32 length) {
+   if (buffer == NULL)
+     return -1;
+ 
+-  const size_t bytes_written = fwrite(buffer, 1, length, file_);
++  const size_t bytes_written = emcc_fwrite(buffer, 1, length, file_);
+ 
+   return (bytes_written == length) ? 0 : -1;
+ }
+diff --git a/third_party/aom/tools_common.c b/third_party/aom/tools_common.c
+index 7abc20c..fbc30bc 100644
+--- a/third_party/aom/tools_common.c
++++ b/third_party/aom/tools_common.c
+@@ -185,6 +185,9 @@ const AvxInterface *get_aom_decoder_by_fourcc(uint32_t fourcc) {
+ }
+ #endif  // CONFIG_AV1_DECODER
+ 
++size_t
++emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++
+ void aom_img_write(const aom_image_t *img, FILE *file) {
+   int plane;
+ 
+@@ -197,7 +200,7 @@ void aom_img_write(const aom_image_t *img, FILE *file) {
+     int y;
+ 
+     for (y = 0; y < h; ++y) {
+-      fwrite(buf, 1, w, file);
++      emcc_fwrite(buf, 1, w, file);
+       buf += stride;
+     }
+   }
+diff --git a/third_party/aom/video_writer.c b/third_party/aom/video_writer.c
+index 4e072c7..6b1ca54 100644
+--- a/third_party/aom/video_writer.c
++++ b/third_party/aom/video_writer.c
+@@ -66,10 +66,13 @@ void aom_video_writer_close(AvxVideoWriter *writer) {
+   }
+ }
+ 
++size_t
++emcc_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
++
+ int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer,
+                                  size_t size, int64_t pts) {
+   ivf_write_frame_header(writer->file, pts, size);
+-  if (fwrite(buffer, 1, size, writer->file) != size) return 0;
++  if (emcc_fwrite(buffer, 1, size, writer->file) != size) return 0;
+ 
+   ++writer->frame_count;
+ 

+ 1 - 0
wamr-compiler/CMakeLists.txt

@@ -27,6 +27,7 @@ add_definitions(-DWASM_DISABLE_HW_BOUND_CHECK=1)
 add_definitions(-DWASM_ENABLE_SHARED_MEMORY=1)
 add_definitions(-DWASM_ENABLE_THREAD_MGR=1)
 add_definitions(-DWASM_ENABLE_TAIL_CALL=1)
+add_definitions(-DWASM_ENABLE_SIMD=1)
 
 # Set WAMR_BUILD_TARGET, currently values supported:
 # "X86_64", "AMD_64", "X86_32", "ARM_32", "MIPS_32", "XTENSA_32"

+ 7 - 2
wamr-compiler/main.c

@@ -43,6 +43,7 @@ print_help()
   printf("  --enable-multi-thread     Enable multi-thread feature, the dependent features bulk-memory and\n");
   printf("  --enable-tail-call        Enable the post-MVP tail call feature\n");
   printf("                            thread-mgr will be enabled automatically\n");
+  printf("  --enable-simd             Enable the post-MVP 128-bit SIMD feature\n");
   printf("  -v=n                      Set log verbose level (0 to 5, default is 2), larger with more log\n");
   printf("Examples: wamrc -o test.aot test.wasm\n");
   printf("          wamrc --target=i386 -o test.aot test.wasm\n");
@@ -70,6 +71,7 @@ main(int argc, char *argv[])
   option.output_format = AOT_FORMAT_FILE;
   /* default value, enable or disable depends on the platform */
   option.bounds_checks = 2;
+  option.enable_simd = false;
 
   /* Process options.  */
   for (argc--, argv++; argc > 0 && argv[0][0] == '-'; argc--, argv++) {
@@ -150,6 +152,9 @@ main(int argc, char *argv[])
     else if (!strcmp(argv[0], "--enable-tail-call")) {
         option.enable_tail_call = true;
     }
+    else if (!strcmp(argv[0], "--enable-simd")) {
+        option.enable_simd = true;
+    }
     else
       return print_help();
   }
@@ -158,8 +163,8 @@ main(int argc, char *argv[])
     return print_help();
 
   if (sgx_mode) {
-      option.size_level = 1;
-      option.is_sgx_platform = true;
+    option.size_level = 1;
+    option.is_sgx_platform = true;
   }
 
   wasm_file_name = argv[0];