Browse Source

Upgrade XNNPACK workload (#2394)

- Sync source code to b9d4073a6913891ce9cbd8965c8d506075d2a45a, which is
  referred by tensorflow
- Upgrade emscripten to 3.1.44
- CMake outputs are .wasm files and .aot files
liang.he 2 years ago
parent
commit
ecd4fccc96

+ 183 - 132
samples/workload/XNNPACK/CMakeLists.txt

@@ -1,147 +1,198 @@
 # Copyright (C) 2019 Intel Corporation. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-cmake_minimum_required (VERSION 3.0)
+cmake_minimum_required (VERSION 3.14)
 
 project(xnnpack_wasm)
 
 ################  EMCC ################
 include(ExternalProject)
 
-ExternalProject_Add(xnnpack
+# Names of the XNNPACK benchmark / evaluation programs to build. Each entry is
+# turned into the bazel label "//:<name>-wasm" and the output "<name>.wasm" by
+# the foreach() loop further down in this file.
+#
+# The list can be regenerated from XNNPACK's BUILD.bazel with:
+#
+# grep xnnpack_benchmark -A 1 BUILD.bazel \
+#   | grep "name =" \
+#   | awk '{print $3}' \
+#   | sed -e 's/\"//g; s/,//g; s/^/\"/g; s/$/\"/g'
+list(APPEND NATIVE_BENCHMARKS
+  "qs8_dwconv_bench"
+  "qs8_f32_vcvt_bench"
+  "qs8_gemm_bench"
+  "qs8_requantization_bench"
+  "qs8_vadd_bench"
+  "qs8_vaddc_bench"
+  "qs8_vcvt_bench"
+  "qs16_qs8_vcvt_bench"
+  "qs8_vlrelu_bench"
+  "qs8_vmul_bench"
+  "qs8_vmulc_bench"
+  "qu8_f32_vcvt_bench"
+  "qu8_gemm_bench"
+  "qu8_requantization_bench"
+  "qu8_vadd_bench"
+  "qu8_vaddc_bench"
+  "qu8_vcvt_bench"
+  "qu8_vlrelu_bench"
+  "qu8_vmul_bench"
+  "qu8_vmulc_bench"
+  "bf16_gemm_bench"
+  "f16_f32acc_igemm_bench"
+  "f16_igemm_bench"
+  "f16_f32acc_gemm_bench"
+  "f16_gemm_bench"
+  "f16_raddstoreexpminusmax_bench"
+  "f16_spmm_bench"
+  "f16_vsigmoid_bench"
+  "f16_vtanh_bench"
+  "f16_f32_vcvt_bench"
+  "f32_igemm_bench"
+  "f32_conv_hwc_bench"
+  "f16_conv_hwc2chw_bench"
+# NOTE(review): the two gavgpool_cw benchmarks below are disabled; the reason
+# is not recorded here -- TODO confirm before re-enabling.
+#   "f16_gavgpool_cw_bench"
+#   "f32_gavgpool_cw_bench"
+  "f32_conv_hwc2chw_bench"
+  "f16_dwconv_bench"
+  "f32_dwconv_bench"
+  "f32_dwconv2d_chw_bench"
+  "f16_dwconv2d_chw_bench"
+  "f32_f16_vcvt_bench"
+  "xx_transpose_bench"
+  "x8_transpose_bench"
+  "x16_transpose_bench"
+  "x24_transpose_bench"
+  "x32_transpose_bench"
+  "x64_transpose_bench"
+  "f32_bgemm_bench"
+  "f32_gemm_bench"
+  "f32_qs8_vcvt_bench"
+  "f32_qu8_vcvt_bench"
+  "f32_raddexpminusmax_bench"
+  "f32_raddextexp_bench"
+  "f32_raddstoreexpminusmax_bench"
+  "f32_rmax_bench"
+  "f32_spmm_bench"
+  "f32_softmax_bench"
+  "f16_velu_bench"
+  "f32_velu_bench"
+  "f32_vhswish_bench"
+  "f32_vlrelu_bench"
+  "f32_vrelu_bench"
+  "f32_vscaleexpminusmax_bench"
+  "f32_vscaleextexp_bench"
+  "f32_vsigmoid_bench"
+  "f16_vsqrt_bench"
+  "f32_vsqrt_bench"
+  "f32_vtanh_bench"
+  "f32_im2col_gemm_bench"
+  "rounding_bench"
+  "s16_rmaxabs_bench"
+  "s16_window_bench"
+  "u32_filterbank_accumulate_bench"
+  "u32_filterbank_subtract_bench"
+  "u32_vlog_bench"
+  "u64_u32_vsqrtshift_bench"
+  "i16_vlshift_bench"
+  "cs16_vsquareabs_bench"
+  "cs16_bfly4_bench"
+  "cs16_fftr_bench"
+  "x8_lut_bench"
+  "x32_packw_bench"
+  "x16_packw_bench"
+  "abs_bench"
+  "average_pooling_bench"
+  "bankers_rounding_bench"
+  "ceiling_bench"
+  "channel_shuffle_bench"
+  "convert_bench"
+  "convolution_bench"
+  "deconvolution_bench"
+  "elu_bench"
+  "floor_bench"
+  "global_average_pooling_bench"
+  "hardswish_bench"
+  "leaky_relu_bench"
+  "max_pooling_bench"
+  "negate_bench"
+  "prelu_bench"
+  "sigmoid_bench"
+  "softmax_bench"
+  "square_bench"
+  "square_root_bench"
+  "tanh_bench"
+  "truncation_bench"
+  "f16_dwconv_e2e_bench"
+  "f16_gemm_e2e_bench"
+  "f32_dwconv_e2e_bench"
+  "f32_gemm_e2e_bench"
+  "qs8_dwconv_e2e_bench"
+  "qs8_gemm_e2e_bench"
+  "qu8_gemm_e2e_bench"
+  "qu8_dwconv_e2e_bench"
+# NOTE(review): end2end_bench is disabled; reason not recorded here -- TODO confirm.
+#   "end2end_bench"
+  "f16_exp_ulp_eval"
+  "f16_expminus_ulp_eval"
+  "f16_expm1minus_ulp_eval"
+  "f16_sigmoid_ulp_eval"
+  "f16_sqrt_ulp_eval"
+  "f16_tanh_ulp_eval"
+  "f32_exp_ulp_eval"
+  "f32_expminus_ulp_eval"
+  "f32_expm1minus_ulp_eval"
+  "f32_extexp_ulp_eval"
+  "f32_sigmoid_ulp_eval"
+  "f32_sqrt_ulp_eval"
+  "f32_tanh_ulp_eval"
+)
+
+# Only Download
+ExternalProject_Add(xnnpack-download
     PREFIX xnnpack
     GIT_REPOSITORY https://github.com/google/XNNPACK.git
-    GIT_TAG        4570a7151aa4f3e57eca14a575eeff6bb13e26be
+    GIT_TAG        b9d4073a6913891ce9cbd8965c8d506075d2a45a
     GIT_PROGRESS   ON
     SOURCE_DIR     ${CMAKE_CURRENT_SOURCE_DIR}/xnnpack
-    UPDATE_COMMAND git restore .
-                   && cmake -E copy ${CMAKE_CURRENT_SOURCE_DIR}/xnnpack/google3/third_party/XNNPACK/microkernels.bzl
-                      ${CMAKE_CURRENT_SOURCE_DIR}/xnnpack/
-                   && git apply ${CMAKE_CURRENT_SOURCE_DIR}/xnnpack.patch
+    UPDATE_COMMAND ""
+    PATCH_COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/xnnpack.patch
     CONFIGURE_COMMAND ""
-    # grep xnnpack_benchmark -A 1 BUILD.bazel \
-    #   | grep "name =" \
-    #   | awk '{print $3}' \
-    #   | sed -e 's/\"//g' -e 's/,//g' -e 's/^/\/\/:/g'
-    BUILD_COMMAND  cd ${CMAKE_CURRENT_SOURCE_DIR}/xnnpack
-                   && bazel --output_user_root=build-user-output build -c opt --config=wasm
-                       //:qs8_dwconv_bench.wasm
-                       //:qs8_f32_vcvt_bench.wasm
-                       //:qs8_gemm_bench.wasm
-                       //:qs8_requantization_bench.wasm
-                       //:qs8_vadd_bench.wasm
-                       //:qs8_vaddc_bench.wasm
-                       //:qs8_vcvt_bench.wasm
-                       //:qs8_vlrelu_bench.wasm
-                       //:qs8_vmul_bench.wasm
-                       //:qs8_vmulc_bench.wasm
-                       //:qu8_f32_vcvt_bench.wasm
-                       //:qu8_gemm_bench.wasm
-                       //:qu8_requantization_bench.wasm
-                       //:qu8_vadd_bench.wasm
-                       //:qu8_vaddc_bench.wasm
-                       //:qu8_vcvt_bench.wasm
-                       //:qu8_vlrelu_bench.wasm
-                       //:qu8_vmul_bench.wasm
-                       //:qu8_vmulc_bench.wasm
-                       //:bf16_gemm_bench.wasm
-                       //:f16_igemm_bench.wasm
-                       //:f16_gemm_bench.wasm
-                       //:f16_raddstoreexpminusmax_bench.wasm
-                       //:f16_spmm_bench.wasm
-                       //:f16_vsigmoid_bench.wasm
-                       //:f16_f32_vcvt_bench.wasm
-                       //:f32_igemm_bench.wasm
-                       //:f32_conv_hwc_bench.wasm
-                       //:f16_conv_hwc2chw_bench.wasm
-                       //:f16_gavgpool_cw_bench.wasm
-                       //:f32_gavgpool_cw_bench.wasm
-                       //:f32_conv_hwc2chw_bench.wasm
-                       //:f16_dwconv_bench.wasm
-                       //:f32_dwconv_bench.wasm
-                       //:f32_dwconv2d_chw_bench.wasm
-                       //:f16_dwconv2d_chw_bench.wasm
-                       //:f32_f16_vcvt_bench.wasm
-                       //:xx_transpose_bench.wasm
-                       //:x8_transpose_bench.wasm
-                       //:x16_transpose_bench.wasm
-                       //:x24_transpose_bench.wasm
-                       //:x32_transpose_bench.wasm
-                       //:x64_transpose_bench.wasm
-                       //:f32_gemm_bench.wasm
-                       //:f32_qs8_vcvt_bench.wasm
-                       //:f32_qu8_vcvt_bench.wasm
-                       //:f32_raddexpminusmax_bench.wasm
-                       //:f32_raddextexp_bench.wasm
-                       //:f32_raddstoreexpminusmax_bench.wasm
-                       //:f32_rmax_bench.wasm
-                       //:f32_spmm_bench.wasm
-                       //:f32_softmax_bench.wasm
-                       //:f16_velu_bench.wasm
-                       //:f32_velu_bench.wasm
-                       //:f32_vhswish_bench.wasm
-                       //:f32_vlrelu_bench.wasm
-                       //:f32_vrelu_bench.wasm
-                       //:f32_vscaleexpminusmax_bench.wasm
-                       //:f32_vscaleextexp_bench.wasm
-                       //:f32_vsigmoid_bench.wasm
-                       //:f16_vsqrt_bench.wasm
-                       //:f32_vsqrt_bench.wasm
-                       //:f32_im2col_gemm_bench.wasm
-                       //:rounding_bench.wasm
-                       //:s16_rmaxabs_bench.wasm
-                       //:s16_window_bench.wasm
-                       //:u32_filterbank_accumulate_bench.wasm
-                       //:u32_filterbank_subtract_bench.wasm
-                       //:u32_vlog_bench.wasm
-                       //:u64_u32_vsqrtshift_bench.wasm
-                       //:i16_vlshift_bench.wasm
-                       //:cs16_vsquareabs_bench.wasm
-                       //:cs16_bfly4_bench.wasm
-                       //:cs16_fftr_bench.wasm
-                       //:x8_lut_bench.wasm
-                       //:abs_bench.wasm
-                       //:average_pooling_bench.wasm
-                       //:bankers_rounding_bench.wasm
-                       //:ceiling_bench.wasm
-                       //:channel_shuffle_bench.wasm
-                       //:convert_bench.wasm
-                       //:convolution_bench.wasm
-                       //:deconvolution_bench.wasm
-                       //:elu_bench.wasm
-                       //:floor_bench.wasm
-                       //:global_average_pooling_bench.wasm
-                       //:hardswish_bench.wasm
-                       //:leaky_relu_bench.wasm
-                       //:max_pooling_bench.wasm
-                       //:negate_bench.wasm
-                       //:sigmoid_bench.wasm
-                       //:prelu_bench.wasm
-                       //:softmax_bench.wasm
-                       //:square_bench.wasm
-                       //:square_root_bench.wasm
-                       //:truncation_bench.wasm
-                       //:f16_gemm_e2e_bench.wasm
-                       //:f32_dwconv_e2e_bench.wasm
-                       //:f32_gemm_e2e_bench.wasm
-                       //:qs8_dwconv_e2e_bench.wasm
-                       //:qs8_gemm_e2e_bench.wasm
-                       //:qu8_gemm_e2e_bench.wasm
-                       //:qu8_dwconv_e2e_bench.wasm
-                       //:end2end_bench.wasm
-                       //:f16_exp_ulp_eval.wasm
-                       //:f16_expminus_ulp_eval.wasm
-                       //:f16_expm1minus_ulp_eval.wasm
-                       //:f16_sigmoid_ulp_eval.wasm
-                       //:f16_sqrt_ulp_eval.wasm
-                       //:f32_exp_ulp_eval.wasm
-                       //:f32_expminus_ulp_eval.wasm
-                       //:f32_expm1minus_ulp_eval.wasm
-                       //:f32_extexp_ulp_eval.wasm
-                       //:f32_sigmoid_ulp_eval.wasm
-                       //:f32_sqrt_ulp_eval.wasm
-                       //:f32_tanh_ulp_eval.wasm
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory
-                      ${CMAKE_CURRENT_SOURCE_DIR}/xnnpack/bazel-out/wasm-opt/bin/
-                      ${CMAKE_BINARY_DIR}/wasm-opt
+    BUILD_COMMAND ""
+    INSTALL_COMMAND ""
+    TEST_COMMAND ""
 )
+
+# Path where a locally built wamrc is expected. When the binary exists at
+# configure time, every benchmark is additionally compiled from .wasm to .aot;
+# otherwise only the .wasm files are produced.
+set(WAMRC "${CMAKE_CURRENT_SOURCE_DIR}/../../../wamr-compiler/build/wamrc")
+if(EXISTS "${WAMRC}")
+  # STATUS prints the conventional "-- " prefix itself, so both branches
+  # report in the same format.
+  message(STATUS "Will generate .aot")
+else()
+  message(STATUS "Will generate .wasm")
+endif()
+
+# For every benchmark name, define a rule that builds its standalone .wasm with
+# bazel and copies it into the build directory. When wamrc was found, a second
+# rule compiles that .wasm into an .aot file. A target named after the
+# benchmark, included in ALL, depends on the final artifact.
+foreach(BENCHMARK IN LISTS NATIVE_BENCHMARKS)
+  # Bazel label of the wasm target and the file name it produces.
+  set(WASM_BENCHMARK "//:${BENCHMARK}-wasm")
+  set(WASM_OUTPUT "${BENCHMARK}.wasm")
+
+  # Two separate COMMANDs (executed in order, stopping at the first failure)
+  # replace the shell "&&" so the rule does not depend on shell syntax and can
+  # be marked VERBATIM for consistent argument escaping.
+  add_custom_command(
+    OUTPUT "${WASM_OUTPUT}"
+    COMMAND bazel --output_user_root=build-user-output build -c opt --config=wasm "${WASM_BENCHMARK}"
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different "./bazel-bin/${WASM_OUTPUT}" "${CMAKE_CURRENT_BINARY_DIR}/${WASM_OUTPUT}"
+    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/xnnpack"
+    DEPENDS xnnpack-download
+    COMMENT "Generating ${WASM_OUTPUT} ..."
+    VERBATIM
+  )
+
+  # Make the clean step remove the copied .wasm as well.
+  set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_CLEAN_FILES "${CMAKE_CURRENT_BINARY_DIR}/${WASM_OUTPUT}")
+
+  if(EXISTS "${WAMRC}")
+    set(AOT_OUTPUT "${BENCHMARK}.aot")
+
+    # Compile the .wasm into an .aot file with wamrc, inside the build directory.
+    add_custom_command(
+      OUTPUT "${AOT_OUTPUT}"
+      COMMAND "${WAMRC}" -o "${AOT_OUTPUT}" "${WASM_OUTPUT}"
+      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+      DEPENDS "${WASM_OUTPUT}"
+      COMMENT "Generating ${AOT_OUTPUT} ..."
+      VERBATIM
+    )
+
+    add_custom_target(${BENCHMARK} ALL DEPENDS "${AOT_OUTPUT}")
+  else()
+    add_custom_target(${BENCHMARK} ALL DEPENDS "${WASM_OUTPUT}")
+  endif()
+endforeach()
+

+ 21 - 20
samples/workload/XNNPACK/README.md

@@ -9,17 +9,29 @@ please refer to [installation instructions](../README.md).
 
 ## Build XNNPACK
 
-```bash
-cd <wamr-dir>/samples/workload/XNNPACK
-mkdir build
-cd build
+First, please build wamrc:
+
+``` bash
+cd <wamr-dir>/wamr-compiler
+./build_llvm.sh
+mkdir build && cd build
 cmake ..
+make
+```
+
+Then build the XNNPACK standalone wasm files:
+
+```bash
+$ cd <wamr-dir>/samples/workload/XNNPACK
+$ cmake -S . -B build
+$ cmake --build build
 ```
-The wasm files are generated under folder samples/workload/XNNPACK/xnnpack/bazel-bin.
+
+The generated .wasm (and .aot) files are under *samples/workload/XNNPACK/build*.
 
 ## Run benchmarks
 
-Firstly please build iwasm with simd, libc-emcc and lib-pthread support:
+First, please build iwasm with simd, libc-emcc and lib-pthread support:
 
 ``` bash
 $ cd <wamr-dir>/product-mini/platforms/linux/
@@ -28,21 +40,10 @@ $ cmake .. -DWAMR_BUILD_LIBC_EMCC=1 -DWAMR_BUILD_LIB_PTHREAD=1
 $ make
 ```
 
-And please build wamrc:
-
-``` bash
-cd <wamr-dir>/wamr-compiler
-./build_llvm.sh
-mkdir build && cd build
-cmake ..
-make
-```
-
-Then compile wasm file to aot file and run:
+Then run:
 
 ``` shell
-$ cd <wamr-dir>/samples/workload/XNNPACK/xnnpack/bazel-bin
-$ wamrc -o average_pooling_bench.aot average_pooling_bench.wasm  (or other wasm files)
-$ iwasm average_pooling_bench.aot
+$ cd <wamr-dir>/samples/workload/XNNPACK/build
+$ iwasm average_pooling_bench.aot # (or other aot files)
 ```
 

+ 95 - 98
samples/workload/XNNPACK/xnnpack.patch

@@ -1,141 +1,138 @@
 diff --git a/.bazelrc b/.bazelrc
-index 688279da1..376996885 100644
+index fcaff1063..e61d53337 100644
 --- a/.bazelrc
 +++ b/.bazelrc
-@@ -53,4 +53,9 @@ build:ios_fat --watchos_cpus=armv7k
- build:macos --apple_platform_type=macos
+@@ -1,6 +1,7 @@
+ # Basic build settings
+ build --jobs 128
+ build --cxxopt='-std=gnu++14'
++build --incompatible_enable_cc_toolchain_resolution
+ 
+ # Sets the default Apple platform to macOS.
+ build --apple_platform_type=macos
+@@ -55,3 +56,10 @@ build:macos --apple_platform_type=macos
  
  build:macos_arm64 --config=macos
--build:macos_arm64 --cpu=darwin_arm64
-\ No newline at end of file
-+build:macos_arm64 --cpu=darwin_arm64
+ build:macos_arm64 --cpu=darwin_arm64
 +
++# Emscripten configs
++build:wasm --copt="-Wno-unused"
++build:wasm --copt="-Wno-unused-function"
++build:wasm --copt="-Wno-unused-but-set-variable"
 +build:wasm --cpu=wasm
 +build:wasm --features=wasm_simd
-+build:wasm --crosstool_top=@emsdk//emscripten_toolchain:everything
-+build:wasm --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
 diff --git a/WORKSPACE b/WORKSPACE
-index cd8960ffa..787e03ca8 100644
+index 2e568088b..3961371ca 100644
 --- a/WORKSPACE
 +++ b/WORKSPACE
-@@ -29,8 +29,9 @@ http_archive(
- # Google Benchmark library, used in micro-benchmarks.
- http_archive(
-     name = "com_google_benchmark",
--    strip_prefix = "benchmark-main",
--    urls = ["https://github.com/google/benchmark/archive/main.zip"],
-+    sha256 = "1ba14374fddcd9623f126b1a60945e4deac4cdc4fb25a5f25e7f779e36f2db52",
-+    strip_prefix = "benchmark-d2a8a4ee41b923876c034afb939c4fc03598e622",
-+    urls = ["https://github.com/google/benchmark/archive/d2a8a4ee41b923876c034afb939c4fc03598e622.zip"],
+@@ -83,7 +83,23 @@ http_archive(
  )
  
- # FP16 library, used for half-precision conversions
-@@ -92,8 +93,25 @@ http_archive(
-     ],
- )
+ # Android NDK location and version is auto-detected from $ANDROID_NDK_HOME environment variable
+-android_ndk_repository(name = "androidndk")
++# android_ndk_repository(name = "androidndk")
  
-+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+ # Android SDK location and API is auto-detected from $ANDROID_HOME environment variable
+-android_sdk_repository(name = "androidsdk")
++# android_sdk_repository(name = "androidsdk")
++
 +http_archive(
 +    name = "emsdk",
-+    # Use emsdk-3.0.0 since the larger version may:
-+    #   - compress the wasm file into a tar file but not directly generate wasm file
-+    #   - generate incomplete implementation of libc API, e.g. throw exception in getentropy
-+    strip_prefix = "emsdk-3.0.0/bazel",
-+    url = "https://github.com/emscripten-core/emsdk/archive/refs/tags/3.0.0.tar.gz",
-+    sha256 = "a41dccfd15be9e85f923efaa0ac21943cbab77ec8d39e52f25eca1ec61a9ac9e"
++    sha256 = "5fa6f5eb45a4d50264610c4c9e1c155535359b63bfaad69b4e5101d16c1e7e32",
++    strip_prefix = "emsdk-a896e3d066448b3530dbcaa48869fafefd738f57/bazel",
++    url = "https://github.com/emscripten-core/emsdk/archive/a896e3d066448b3530dbcaa48869fafefd738f57.tar.gz",
 +)
 +
 +load("@emsdk//:deps.bzl", emsdk_deps = "deps")
 +emsdk_deps()
 +
 +load("@emsdk//:emscripten_deps.bzl", emsdk_emscripten_deps = "emscripten_deps")
-+emsdk_emscripten_deps()
++emsdk_emscripten_deps(emscripten_version = "3.1.44")
 +
- # Android NDK location and version is auto-detected from $ANDROID_NDK_HOME environment variable
--android_ndk_repository(name = "androidndk")
-+#android_ndk_repository(name = "androidndk")
++load("@emsdk//:toolchains.bzl", "register_emscripten_toolchains")
++register_emscripten_toolchains()
+diff --git a/bench/utils.cc b/bench/utils.cc
+index 3b32503a7..656845336 100644
+--- a/bench/utils.cc
++++ b/bench/utils.cc
+@@ -456,3 +456,13 @@ CodeMemoryHelper::~CodeMemoryHelper() {
  
- # Android SDK location and API is auto-detected from $ANDROID_HOME environment variable
--android_sdk_repository(name = "androidsdk")
-+#android_sdk_repository(name = "androidsdk")
+ }  // namespace utils
+ }  // namespace benchmark
++
++
++extern "C"
++__attribute__((import_module("env"), import_name("getentropy"))) int import_getentropy(void* buffer, size_t length);
++
++extern "C"
++int getentropy(void* buffer, size_t length)
++{
++  return import_getentropy(buffer, length);
++}
 diff --git a/build_defs.bzl b/build_defs.bzl
-index b8217a18d..6f2d1675e 100644
+index 01b436eb7..2738fd50a 100644
 --- a/build_defs.bzl
 +++ b/build_defs.bzl
-@@ -380,7 +380,7 @@ def xnnpack_benchmark(name, srcs, copts = [], deps = [], tags = []):
-             explicitly specified.
-     """
-     native.cc_binary(
--        name = name,
-+        name = name + ".wasm",
-         srcs = srcs,
-         copts = xnnpack_std_cxxopts() + [
-             "-Iinclude",
-@@ -405,5 +405,5 @@ def xnnpack_benchmark(name, srcs, copts = [], deps = [], tags = []):
+@@ -1,6 +1,7 @@
+ """Build definitions and rules for XNNPACK."""
+ 
+-load(":emscripten.bzl", "xnnpack_emscripten_benchmark_linkopts", "xnnpack_emscripten_deps", "xnnpack_emscripten_minimal_linkopts", "xnnpack_emscripten_test_linkopts")
++load(":emscripten.bzl", "xnnpack_emscripten_benchmark_linkopts", "xnnpack_emscripten_deps", "xnnpack_emscripten_minimal_linkopts", "xnnpack_emscripten_test_linkopts", "xnnpack_emscripten_standalone_benchmark_linkopts")
++load("@emsdk//emscripten_toolchain:wasm_rules.bzl", "wasm_cc_binary")
+ 
+ def xnnpack_visibility():
+     """Visibility of :XNNPACK target.
+@@ -393,7 +394,8 @@ def xnnpack_benchmark(name, srcs, copts = [], deps = [], tags = []):
+             "//conditions:default": ["-Wno-unused-function"],
+         }) + copts,
+         linkopts = select({
+-            ":emscripten": xnnpack_emscripten_benchmark_linkopts(),
++            ":emscripten": xnnpack_emscripten_standalone_benchmark_linkopts(),
++            ":emscripten_wasmsimd": xnnpack_emscripten_standalone_benchmark_linkopts(),
+             ":windows_x86_64_mingw": ["-lshlwapi"],
+             ":windows_x86_64_msys": ["-lshlwapi"],
+             "//conditions:default": [],
+@@ -405,5 +407,16 @@ def xnnpack_benchmark(name, srcs, copts = [], deps = [], tags = []):
              ":emscripten": xnnpack_emscripten_deps(),
              "//conditions:default": [],
          }),
 -	tags = tags,
-+	    tags = tags,
++        tags = tags,
++    )
++
++    wasm_cc_binary(
++        name = name + "-wasm",
++        cc_target = ":" + name,
++        threads = "off",
++        simd = True,
++        standalone= True,
++        outputs = [
++            name + ".wasm",
++        ]
      )
 diff --git a/emscripten.bzl b/emscripten.bzl
-index f1557a7b1..7f964a094 100644
+index f1557a7b1..a3c4f93b9 100644
 --- a/emscripten.bzl
 +++ b/emscripten.bzl
-@@ -25,12 +25,19 @@ def xnnpack_emscripten_benchmark_linkopts():
-     """Emscripten-specific linkopts for benchmarks."""
-     return [
-         "-s ASSERTIONS=1",
--        "-s ENVIRONMENT=node,shell,web",
--        "-s ERROR_ON_UNDEFINED_SYMBOLS=1",
--        "-s EXIT_RUNTIME=1",
+@@ -33,6 +33,21 @@ def xnnpack_emscripten_benchmark_linkopts():
+         "--pre-js $(location :preamble.js.lds)",
+     ]
+ 
++def xnnpack_emscripten_standalone_benchmark_linkopts():
++    return [
++        "-s ASSERTIONS=1",
 +        "-s ERROR_ON_UNDEFINED_SYMBOLS=0",
-         "-s ALLOW_MEMORY_GROWTH=1",
-         "-s TOTAL_MEMORY=536870912",  # 512M
--        "--pre-js $(location :preamble.js.lds)",
++        "-s ALLOW_MEMORY_GROWTH=1",
++        "-s TOTAL_MEMORY=536870912",  # 512M
 +        "-s USE_PTHREADS=0",
 +        "-s STANDALONE_WASM=1",
-+        "-Wno-unused",
-+        "-Wno-unused-variable",
-+        "-Wno-unused-command-line-argument",
 +        "-Wl,--export=__heap_base",
 +        "-Wl,--export=__data_end",
 +        "-Wl,--export=malloc",
 +        "-Wl,--export=free",
-+        "--oformat=wasm",
-     ]
- 
++    ]
++
++
  def xnnpack_emscripten_deps():
-diff --git a/src/log.c b/src/log.c
-index 5715f2f85..4b3e4261b 100644
---- a/src/log.c
-+++ b/src/log.c
-@@ -55,7 +55,7 @@
- #endif
- 
- #if XNN_LOG_TO_STDIO
--static void xnn_vlog(int output_handle, const char* prefix, size_t prefix_length, const char* format, va_list args) {
-+void xnn_vlog(int output_handle, const char* prefix, size_t prefix_length, const char* format, va_list args) {
-   char stack_buffer[XNN_LOG_STACK_BUFFER_SIZE];
-   char* heap_buffer = NULL;
-   char* out_buffer = &stack_buffer[0];
-diff --git a/third_party/cpuinfo.BUILD b/third_party/cpuinfo.BUILD
-index 1997f4e3a..5e03c43af 100644
---- a/third_party/cpuinfo.BUILD
-+++ b/third_party/cpuinfo.BUILD
-@@ -150,7 +150,7 @@ cc_library(
-         "src/arm/midr.h",
-     ],
-     deps = [
--        "@clog",
-+        "//deps/clog"
-     ],
- )
- 
-@@ -352,5 +352,5 @@ config_setting(
- 
- config_setting(
-     name = "emscripten",
--    values = {"crosstool_top": "//toolchain:emscripten"},
-+    values = {"crosstool_top": "@emsdk//emscripten_toolchain:everything"},
- )
+     """Emscripten-specific dependencies for unit tests and benchmarks."""
+     return [