Browse Source

Implement AOT static PGO (#2243)

LLVM PGO (Profile-Guided Optimization) allows the compiler to better optimize code
for how it actually runs. This PR implements AOT static PGO, which has been tested
on Linux x86-64 and x86-32. The basic steps are:

1. Use `wamrc --enable-llvm-pgo -o <aot_file_of_pgo> <wasm_file>`
   to generate an instrumented aot file.
2. Compile iwasm with `cmake -DWAMR_BUILD_STATIC_PGO=1` and run
      `iwasm --gen-prof-file=<raw_profile_file> <aot_file_of_pgo>`
    to generate the raw profile file.
3. Run `llvm-profdata merge -output=<profile_file> <raw_profile_file>`
    to merge the raw profile file into the profile file.
4. Run `wamrc --use-prof-file=<profile_file> -o <aot_file> <wasm_file>`
    to generate the optimized aot file.
5. Run the optimized aot_file: `iwasm <aot_file>`.

Test scripts are also added for each benchmark: run `test_pgo.sh` under
each benchmark's folder to test AOT static PGO.
Wenyong Huang 2 years ago
parent
commit
8d88471c46

+ 1 - 1
build-scripts/build_llvm.py

@@ -61,7 +61,7 @@ def build_llvm(llvm_dir, platform, backends, projects, use_clang=False, extra_fl
         "-DLLVM_ENABLE_IDE:BOOL=OFF",
         "-DLLVM_ENABLE_LIBEDIT=OFF",
         "-DLLVM_ENABLE_TERMINFO:BOOL=OFF",
-        "-DLLVM_ENABLE_ZLIB:BOOL=OFF",
+        "-DLLVM_ENABLE_ZLIB:BOOL=ON",
         "-DLLVM_INCLUDE_BENCHMARKS:BOOL=OFF",
         "-DLLVM_INCLUDE_DOCS:BOOL=OFF",
         "-DLLVM_INCLUDE_EXAMPLES:BOOL=OFF",

+ 4 - 0
build-scripts/config_common.cmake

@@ -388,3 +388,7 @@ if ("$ENV{COLLECT_CODE_COVERAGE}" STREQUAL "1" OR COLLECT_CODE_COVERAGE EQUAL 1)
   add_definitions (-DCOLLECT_CODE_COVERAGE)
   message ("     Collect code coverage enabled")
 endif ()
+# Opt-in switch for AOT static PGO: when WAMR_BUILD_STATIC_PGO is set to 1,
+# define WASM_ENABLE_STATIC_PGO=1 for the build and report it in the
+# configure output.
+if (WAMR_BUILD_STATIC_PGO EQUAL 1)
+  add_definitions (-DWASM_ENABLE_STATIC_PGO=1)
+  message ("     AOT static PGO enabled")
+endif ()

+ 4 - 0
core/config.h

@@ -445,4 +445,8 @@
 #define WASM_ENABLE_WASM_CACHE 0
 #endif
 
+/* AOT static PGO support; defaults to 0 (disabled) unless the build
+   already defined WASM_ENABLE_STATIC_PGO */
+#ifndef WASM_ENABLE_STATIC_PGO
+#define WASM_ENABLE_STATIC_PGO 0
+#endif
+
 #endif /* end of _CONFIG_H_ */

+ 286 - 7
core/iwasm/aot/aot_loader.c

@@ -1430,8 +1430,28 @@ destroy_object_data_sections(AOTObjectDataSection *data_sections,
     uint32 i;
     AOTObjectDataSection *data_section = data_sections;
     for (i = 0; i < data_section_count; i++, data_section++)
-        if (data_section->data)
+        if (data_section->data) {
+#if WASM_ENABLE_STATIC_PGO != 0
+            if (!strncmp(data_section->name, "__llvm_prf_data", 15)) {
+                LLVMProfileData *data = (LLVMProfileData *)data_section->data;
+                if (data->values) {
+                    uint32 num_value_sites =
+                        data->num_value_sites[0] + data->num_value_sites[1];
+                    uint32 j;
+                    for (j = 0; j < num_value_sites; j++) {
+                        ValueProfNode *node = data->values[j], *node_next;
+                        while (node) {
+                            node_next = node->next;
+                            wasm_runtime_free(node);
+                            node = node_next;
+                        }
+                    }
+                    wasm_runtime_free(data->values);
+                }
+            }
+#endif
             os_munmap(data_section->data, data_section->size);
+        }
     wasm_runtime_free(data_sections);
 }
 
@@ -1900,6 +1920,8 @@ str2uint64(const char *buf, uint64 *p_res)
     return true;
 }
 
+#define R_X86_64_GOTPCREL 9 /* 32 bit signed PC relative offset to GOT */
+
 static bool
 do_text_relocation(AOTModule *module, AOTRelocationGroup *group,
                    char *error_buf, uint32 error_buf_size)
@@ -1937,6 +1959,14 @@ do_text_relocation(AOTModule *module, AOTRelocationGroup *group,
         bh_memcpy_s(symbol, symbol_len, relocation->symbol_name, symbol_len);
         symbol[symbol_len] = '\0';
 
+#if WASM_ENABLE_STATIC_PGO != 0
+        if (!strcmp(symbol, "__llvm_profile_runtime")
+            || !strcmp(symbol, "__llvm_profile_register_function")
+            || !strcmp(symbol, "__llvm_profile_register_names_function")) {
+            continue;
+        }
+#endif
+
         if (!strncmp(symbol, AOT_FUNC_PREFIX, strlen(AOT_FUNC_PREFIX))) {
             p = symbol + strlen(AOT_FUNC_PREFIX);
             if (*p == '\0'
@@ -1945,7 +1975,26 @@ do_text_relocation(AOTModule *module, AOTRelocationGroup *group,
                                 "invalid import symbol %s", symbol);
                 goto check_symbol_fail;
             }
+#if (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) \
+    && !defined(BH_PLATFORM_WINDOWS)
+            if (relocation->relocation_type == R_X86_64_GOTPCREL) {
+                GOTItem *got_item = module->got_item_list;
+                uint32 got_item_idx = 0;
+
+                while (got_item) {
+                    if (got_item->func_idx == func_index)
+                        break;
+                    got_item_idx++;
+                    got_item = got_item->next;
+                }
+                /* Calculate `GOT + G` */
+                symbol_addr = module->got_func_ptrs + got_item_idx;
+            }
+            else
+                symbol_addr = module->func_ptrs[func_index];
+#else
             symbol_addr = module->func_ptrs[func_index];
+#endif
         }
         else if (!strcmp(symbol, ".text")) {
             symbol_addr = module->code;
@@ -1956,7 +2005,13 @@ do_text_relocation(AOTModule *module, AOTRelocationGroup *group,
                  /* ".rodata.cst4/8/16/.." */
                  || !strncmp(symbol, ".rodata.cst", strlen(".rodata.cst"))
                  /* ".rodata.strn.m" */
-                 || !strncmp(symbol, ".rodata.str", strlen(".rodata.str"))) {
+                 || !strncmp(symbol, ".rodata.str", strlen(".rodata.str"))
+#if WASM_ENABLE_STATIC_PGO != 0
+                 || !strncmp(symbol, "__llvm_prf_cnts", 15)
+                 || !strncmp(symbol, "__llvm_prf_data", 15)
+                 || !strncmp(symbol, "__llvm_prf_names", 16)
+#endif
+        ) {
             symbol_addr = get_data_section_addr(module, symbol, NULL);
             if (!symbol_addr) {
                 set_error_buf_v(error_buf, error_buf_size,
@@ -2088,6 +2143,14 @@ do_data_relocation(AOTModule *module, AOTRelocationGroup *group,
     else if (!strcmp(group->section_name, ".rdata")) {
         data_section_name = group->section_name;
     }
+#if WASM_ENABLE_STATIC_PGO != 0
+    else if (!strncmp(group->section_name, ".rel__llvm_prf_data", 19)) {
+        data_section_name = group->section_name + strlen(".rel");
+    }
+    else if (!strncmp(group->section_name, ".rela__llvm_prf_data", 20)) {
+        data_section_name = group->section_name + strlen(".rela");
+    }
+#endif
     else {
         set_error_buf(error_buf, error_buf_size,
                       "invalid data relocation section name");
@@ -2107,6 +2170,49 @@ do_data_relocation(AOTModule *module, AOTRelocationGroup *group,
         if (!strcmp(symbol, ".text")) {
             symbol_addr = module->code;
         }
+#if WASM_ENABLE_STATIC_PGO != 0
+        else if (!strncmp(symbol, AOT_FUNC_PREFIX, strlen(AOT_FUNC_PREFIX))) {
+            char *p = symbol + strlen(AOT_FUNC_PREFIX);
+            uint32 func_index;
+            if (*p == '\0'
+                || (func_index = (uint32)atoi(p)) > module->func_count) {
+                set_error_buf_v(error_buf, error_buf_size,
+                                "invalid relocation symbol %s", symbol);
+                return false;
+            }
+            symbol_addr = module->func_ptrs[func_index];
+        }
+        else if (!strcmp(symbol, "__llvm_prf_cnts")) {
+            uint32 j;
+            for (j = 0; j < module->data_section_count; j++) {
+                if (!strncmp(module->data_sections[j].name, symbol, 15)) {
+                    bh_assert(relocation->relocation_addend + sizeof(uint64)
+                              <= module->data_sections[j].size);
+                    symbol_addr = module->data_sections[j].data;
+                    break;
+                }
+            }
+            if (j == module->data_section_count) {
+                set_error_buf_v(error_buf, error_buf_size,
+                                "invalid relocation symbol %s", symbol);
+                return false;
+            }
+        }
+        else if (!strncmp(symbol, "__llvm_prf_cnts", 15)) {
+            uint32 j;
+            for (j = 0; j < module->data_section_count; j++) {
+                if (!strcmp(module->data_sections[j].name, symbol)) {
+                    symbol_addr = module->data_sections[j].data;
+                    break;
+                }
+            }
+            if (j == module->data_section_count) {
+                set_error_buf_v(error_buf, error_buf_size,
+                                "invalid relocation symbol %s", symbol);
+                return false;
+            }
+        }
+#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */
         else {
             set_error_buf_v(error_buf, error_buf_size,
                             "invalid relocation symbol %s", symbol);
@@ -2154,7 +2260,7 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end,
 {
     AOTRelocationGroup *groups = NULL, *group;
     uint32 symbol_count = 0;
-    uint32 group_count = 0, i, j;
+    uint32 group_count = 0, i, j, got_item_count = 0;
     uint64 size;
     uint32 *symbol_offsets, total_string_len;
     uint8 *symbol_buf, *symbol_buf_end;
@@ -2216,6 +2322,8 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end,
 
         for (j = 0; j < relocation_count; j++) {
             AOTRelocation relocation = { 0 };
+            char group_name_buf[128] = { 0 };
+            char symbol_name_buf[128] = { 0 };
             uint32 symbol_index, offset32;
             int32 addend32;
             uint16 symbol_name_len;
@@ -2244,10 +2352,10 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end,
             symbol_name_len = *(uint16 *)symbol_name;
             symbol_name += sizeof(uint16);
 
-            char group_name_buf[128] = { 0 };
-            char symbol_name_buf[128] = { 0 };
-            memcpy(group_name_buf, group_name, group_name_len);
-            memcpy(symbol_name_buf, symbol_name, symbol_name_len);
+            bh_memcpy_s(group_name_buf, (uint32)sizeof(group_name_buf),
+                        group_name, group_name_len);
+            bh_memcpy_s(symbol_name_buf, (uint32)sizeof(symbol_name_buf),
+                        symbol_name, symbol_name_len);
 
             if ((group_name_len == strlen(".text")
                  || (module->is_indirect_mode
@@ -2309,6 +2417,139 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end,
     }
 #endif /* end of defined(BH_PLATFORM_WINDOWS) */
 
+#if (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) \
+    && !defined(BH_PLATFORM_WINDOWS)
+    buf = symbol_buf_end;
+    read_uint32(buf, buf_end, group_count);
+
+    /* Resolve the relocations of type R_X86_64_GOTPCREL */
+    for (i = 0; i < group_count; i++) {
+        uint32 name_index, relocation_count;
+        uint16 group_name_len;
+        uint8 *group_name;
+
+        /* section name address is 4 bytes aligned. */
+        buf = (uint8 *)align_ptr(buf, sizeof(uint32));
+        read_uint32(buf, buf_end, name_index);
+
+        if (name_index >= symbol_count) {
+            set_error_buf(error_buf, error_buf_size,
+                          "symbol index out of range");
+            goto fail;
+        }
+
+        group_name = symbol_buf + symbol_offsets[name_index];
+        group_name_len = *(uint16 *)group_name;
+        group_name += sizeof(uint16);
+
+        read_uint32(buf, buf_end, relocation_count);
+
+        for (j = 0; j < relocation_count; j++) {
+            AOTRelocation relocation = { 0 };
+            char group_name_buf[128] = { 0 };
+            char symbol_name_buf[128] = { 0 };
+            uint32 symbol_index;
+            uint16 symbol_name_len;
+            uint8 *symbol_name;
+
+            /* relocation offset and addend */
+            buf += sizeof(void *) * 2;
+
+            read_uint32(buf, buf_end, relocation.relocation_type);
+            read_uint32(buf, buf_end, symbol_index);
+
+            if (symbol_index >= symbol_count) {
+                set_error_buf(error_buf, error_buf_size,
+                              "symbol index out of range");
+                goto fail;
+            }
+
+            symbol_name = symbol_buf + symbol_offsets[symbol_index];
+            symbol_name_len = *(uint16 *)symbol_name;
+            symbol_name += sizeof(uint16);
+
+            bh_memcpy_s(group_name_buf, (uint32)sizeof(group_name_buf),
+                        group_name, group_name_len);
+            bh_memcpy_s(symbol_name_buf, (uint32)sizeof(symbol_name_buf),
+                        symbol_name, symbol_name_len);
+
+            if (relocation.relocation_type == R_X86_64_GOTPCREL
+                && !strncmp(symbol_name_buf, AOT_FUNC_PREFIX,
+                            strlen(AOT_FUNC_PREFIX))) {
+                uint32 func_idx =
+                    atoi(symbol_name_buf + strlen(AOT_FUNC_PREFIX));
+                GOTItem *got_item = module->got_item_list;
+
+                if (func_idx >= module->func_count) {
+                    set_error_buf(error_buf, error_buf_size,
+                                  "func index out of range");
+                    goto fail;
+                }
+
+                while (got_item) {
+                    if (got_item->func_idx == func_idx)
+                        break;
+                    got_item = got_item->next;
+                }
+
+                if (!got_item) {
+                    /* Create the got item and append to the list */
+                    got_item = wasm_runtime_malloc(sizeof(GOTItem));
+                    if (!got_item) {
+                        set_error_buf(error_buf, error_buf_size,
+                                      "allocate memory failed");
+                        goto fail;
+                    }
+
+                    got_item->func_idx = func_idx;
+                    got_item->next = NULL;
+                    if (!module->got_item_list) {
+                        module->got_item_list = module->got_item_list_end =
+                            got_item;
+                    }
+                    else {
+                        module->got_item_list_end->next = got_item;
+                        module->got_item_list_end = got_item;
+                    }
+
+                    got_item_count++;
+                }
+            }
+        }
+    }
+
+    if (got_item_count) {
+        GOTItem *got_item = module->got_item_list;
+        uint32 got_item_idx = 0;
+
+        map_prot = MMAP_PROT_READ | MMAP_PROT_WRITE;
+        /* aot code and data in x86_64 must be in range 0 to 2G due to
+           relocation for R_X86_64_32/32S/PC32 */
+        map_flags = MMAP_MAP_32BIT;
+
+        /* Create the GOT for func_ptrs, note that it is different from
+           the .got section of a dynamic object file */
+        size = (uint64)sizeof(void *) * got_item_count;
+        if (size > UINT32_MAX
+            || !(module->got_func_ptrs =
+                     os_mmap(NULL, (uint32)size, map_prot, map_flags))) {
+            set_error_buf(error_buf, error_buf_size, "mmap memory failed");
+            goto fail;
+        }
+
+        while (got_item) {
+            module->got_func_ptrs[got_item_idx++] =
+                module->func_ptrs[got_item->func_idx];
+            got_item = got_item->next;
+        }
+
+        module->got_item_count = got_item_count;
+    }
+#else
+    (void)got_item_count;
+#endif /* (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) && \
+          !defined(BH_PLATFORM_WINDOWS) */
+
     buf = symbol_buf_end;
     read_uint32(buf, buf_end, group_count);
 
@@ -2994,9 +3235,27 @@ aot_unload(AOTModule *module)
     }
 #endif
 
+#if (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) \
+    && !defined(BH_PLATFORM_WINDOWS)
+    {
+        GOTItem *got_item = module->got_item_list, *got_item_next;
+
+        if (module->got_func_ptrs) {
+            os_munmap(module->got_func_ptrs,
+                      sizeof(void *) * module->got_item_count);
+        }
+        while (got_item) {
+            got_item_next = got_item->next;
+            wasm_runtime_free(got_item);
+            got_item = got_item_next;
+        }
+    }
+#endif
+
     if (module->data_sections)
         destroy_object_data_sections(module->data_sections,
                                      module->data_section_count);
+
 #if WASM_ENABLE_DEBUG_AOT != 0
     jit_code_entry_destroy(module->elf_hdr);
 #endif
@@ -3043,3 +3302,23 @@ aot_get_custom_section(const AOTModule *module, const char *name, uint32 *len)
     return NULL;
 }
 #endif /* end of WASM_ENABLE_LOAD_CUSTOM_SECTION */
+
+#if WASM_ENABLE_STATIC_PGO != 0
+/* Forward the 2-byte value at p_data to exchange_uint16().
+   NOTE(review): exchange_uint16() is defined outside the visible diff;
+   presumably it swaps the byte order in place - confirm. */
+void
+aot_exchange_uint16(uint8 *p_data)
+{
+    /* no `return <expr>;` here: this function returns void */
+    exchange_uint16(p_data);
+}
+
+/* Forward the 4-byte value at p_data to exchange_uint32().
+   NOTE(review): exchange_uint32() is defined outside the visible diff;
+   presumably it swaps the byte order in place - confirm. */
+void
+aot_exchange_uint32(uint8 *p_data)
+{
+    /* no `return <expr>;` here: this function returns void */
+    exchange_uint32(p_data);
+}
+
+/* Forward the 8-byte value at p_data to exchange_uint64().
+   NOTE(review): exchange_uint64() is defined outside the visible diff;
+   presumably it swaps the byte order in place - confirm. */
+void
+aot_exchange_uint64(uint8 *p_data)
+{
+    /* no `return <expr>;` here: this function returns void */
+    exchange_uint64(p_data);
+}
+#endif

+ 9 - 0
core/iwasm/aot/aot_reloc.h

@@ -121,6 +121,14 @@ typedef struct {
     REG_SYM(aot_intrinsic_i32_rem_s),     \
     REG_SYM(aot_intrinsic_i32_rem_u),     \
 
+/* Symbol-table entries mapping the `__llvm_profile_instrument_*` names to
+   the runtime's llvm_profile_instrument_* functions; expands to nothing
+   when static PGO is not built in */
+#if WASM_ENABLE_STATIC_PGO != 0
+#define REG_LLVM_PGO_SYM()               \
+    { "__llvm_profile_instrument_target", llvm_profile_instrument_target }, \
+    { "__llvm_profile_instrument_memop", llvm_profile_instrument_memop },
+#else
+#define REG_LLVM_PGO_SYM()
+#endif
+
 #define REG_COMMON_SYMBOLS                \
     REG_SYM(aot_set_exception_with_id),   \
     REG_SYM(aot_invoke_native),           \
@@ -150,6 +158,7 @@ typedef struct {
     REG_REF_TYPES_SYM()                   \
     REG_AOT_TRACE_SYM()                   \
     REG_INTRINSIC_SYM()                   \
+    REG_LLVM_PGO_SYM()                    \
 
 #define CHECK_RELOC_OFFSET(data_size) do {              \
     if (!check_reloc_offset(target_section_size,        \

+ 517 - 0
core/iwasm/aot/aot_runtime.c

@@ -2852,3 +2852,520 @@ aot_dump_perf_profiling(const AOTModuleInstance *module_inst)
     }
 }
 #endif /* end of WASM_ENABLE_PERF_PROFILING */
+
+#if WASM_ENABLE_STATIC_PGO != 0
+
+/* Value-profile kind: indirect call target */
+#define IPVK_IndirectCallTarget 0
+/* Value-profile kind: memory intrinsic functions size */
+#define IPVK_MemOPSize 1
+/* Inclusive range of kinds, used to iterate num_value_sites[] */
+#define IPVK_First IPVK_IndirectCallTarget
+#define IPVK_Last IPVK_MemOPSize
+
+/* Initial and maximum values for the per-site node cap below */
+#define INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE 24
+#define INSTR_PROF_MAX_NUM_VAL_PER_SITE 255
+
+/* Not assigned anywhere in the code visible here; while it is 0,
+   allocateValueProfileCounters() raises VPMaxNumValsPerSite to
+   INSTR_PROF_MAX_NUM_VAL_PER_SITE */
+static int hasNonDefaultValsPerSite = 0;
+/* Current cap on the number of value nodes kept per value site */
+static uint32 VPMaxNumValsPerSite = INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE;
+
+/* Compare-and-swap on a pointer slot: store new_val into *ptr only when the
+   slot currently holds old_val. Returns true if the slot was updated.
+   Uses os_atomic_cmpxchg when the platform defines it, otherwise a plain
+   (non-atomic) compare and store. */
+static bool
+cmpxchg_ptr(void **ptr, void *old_val, void *new_val)
+{
+#ifdef os_atomic_cmpxchg
+    return os_atomic_cmpxchg(ptr, &old_val, new_val);
+#else
+    /* TODO: add lock when thread-manager is enabled */
+    if (*ptr != old_val)
+        return false;
+
+    *ptr = new_val;
+    return true;
+#endif
+}
+
+/* Create the table of per-site list heads for one profile data record.
+   One NULL-initialised ValueProfNode* slot is allocated for every value
+   site of every kind and published through Data->values.
+   Returns 1 on success; 0 if the allocation fails or Data->values is no
+   longer NULL when the table is published. */
+static int
+allocateValueProfileCounters(LLVMProfileData *Data)
+{
+    ValueProfNode **table;
+    uint64 NumVSites = 0, table_bytes;
+    uint32 kind;
+
+    /* When dynamic allocation is enabled, allow tracking the max number of
+       values allowed. */
+    if (hasNonDefaultValsPerSite == 0)
+        VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE;
+
+    kind = IPVK_First;
+    while (kind <= IPVK_Last) {
+        NumVSites += Data->num_value_sites[kind];
+        ++kind;
+    }
+
+    /* If NumVSites = 0, calloc is allowed to return a non-null pointer. */
+    bh_assert(NumVSites > 0 && "NumVSites can't be zero");
+
+    table_bytes = (uint64)sizeof(ValueProfNode *) * NumVSites;
+    if (table_bytes > UINT32_MAX)
+        return 0;
+
+    table = (ValueProfNode **)wasm_runtime_malloc((uint32)table_bytes);
+    if (!table)
+        return 0;
+    memset(table, 0, (uint32)table_bytes);
+
+    if (cmpxchg_ptr((void **)&Data->values, NULL, table))
+        return 1;
+
+    /* Someone else installed a table first: drop ours */
+    wasm_runtime_free(table);
+    return 0;
+}
+
+/* Allocate one zero-filled ValueProfNode; returns NULL if allocation fails */
+static ValueProfNode *
+allocateOneNode(void)
+{
+    ValueProfNode *fresh = wasm_runtime_malloc((uint32)sizeof(ValueProfNode));
+
+    if (!fresh)
+        return NULL;
+
+    memset(fresh, 0, sizeof(ValueProfNode));
+    return fresh;
+}
+
+/* Add CountValue observations of TargetValue to the value list of site
+   CounterIndex in the profile data record Data.
+   Does nothing if Data is NULL, CountValue is 0, or a needed allocation
+   fails.
+   NOTE(review): CounterIndex is not range-checked against the number of
+   value sites here; presumably the instrumented code guarantees it -
+   confirm. */
+static void
+instrumentTargetValueImpl(uint64 TargetValue, void *Data, uint32 CounterIndex,
+                          uint64 CountValue)
+{
+    ValueProfNode **ValueCounters;
+    ValueProfNode *PrevVNode = NULL, *MinCountVNode = NULL, *CurVNode;
+    LLVMProfileData *PData = (LLVMProfileData *)Data;
+    uint64 MinCount = UINT64_MAX;
+    uint8 VDataCount = 0;
+    bool success = false;
+
+    if (!PData)
+        return;
+    if (!CountValue)
+        return;
+    /* The per-site table is created on first use */
+    if (!PData->values) {
+        if (!allocateValueProfileCounters(PData))
+            return;
+    }
+
+    ValueCounters = (ValueProfNode **)PData->values;
+    CurVNode = ValueCounters[CounterIndex];
+
+    /* Walk the site's list: bump the matching node if there is one,
+       otherwise remember the tail and the node with the smallest count */
+    while (CurVNode) {
+        if (TargetValue == CurVNode->value) {
+            CurVNode->count += CountValue;
+            return;
+        }
+        if (CurVNode->count < MinCount) {
+            MinCount = CurVNode->count;
+            MinCountVNode = CurVNode;
+        }
+        PrevVNode = CurVNode;
+        CurVNode = CurVNode->next;
+        ++VDataCount;
+    }
+
+    /* List is full: reuse the least-counted node for the new value when its
+       count does not exceed CountValue, otherwise only decay its count */
+    if (VDataCount >= VPMaxNumValsPerSite) {
+        if (MinCountVNode->count <= CountValue) {
+            CurVNode = MinCountVNode;
+            CurVNode->value = TargetValue;
+            CurVNode->count = CountValue;
+        }
+        else
+            MinCountVNode->count -= CountValue;
+
+        return;
+    }
+
+    CurVNode = allocateOneNode();
+    if (!CurVNode)
+        return;
+    CurVNode->value = TargetValue;
+    CurVNode->count += CountValue;
+
+    /* Link the new node as list head (empty list) or after the tail; the
+       compare-and-swap fails if that slot is no longer NULL */
+    if (!ValueCounters[CounterIndex]) {
+        success =
+            cmpxchg_ptr((void **)&ValueCounters[CounterIndex], NULL, CurVNode);
+    }
+    else if (PrevVNode && !PrevVNode->next) {
+        success = cmpxchg_ptr((void **)&PrevVNode->next, 0, CurVNode);
+    }
+
+    /* Not linked: free the node, the observation is dropped */
+    if (!success) {
+        wasm_runtime_free(CurVNode);
+    }
+}
+
+/* Record a single observation of target_value at value site counter_idx of
+   the profile data record `data` */
+void
+llvm_profile_instrument_target(uint64 target_value, void *data,
+                               uint32 counter_idx)
+{
+    const uint64 single_hit = 1;
+
+    instrumentTargetValueImpl(target_value, data, counter_idx, single_hit);
+}
+
+/* Count the set bits of u by clearing the lowest set bit until none remain */
+static inline uint32
+popcount64(uint64 u)
+{
+    uint32 bits;
+
+    for (bits = 0; u != 0; bits++)
+        u &= u - 1;
+
+    return bits;
+}
+
+/* Number of leading zero bits of a 64-bit value; 64 when the value is 0 */
+static inline uint32
+clz64(uint64 type)
+{
+    uint32 zeros = 0;
+
+    if (!type)
+        return 64;
+
+    /* Shift left until the most significant bit is set */
+    for (; (type >> 63) == 0; type <<= 1)
+        zeros++;
+
+    return zeros;
+}
+
+/* Map an (observed) memop size value to the representative value of its
+   range: values up to 8 and exact powers of two map to themselves, values
+   from 513 upwards map to 513, and anything else maps to the previous power
+   of two plus one.
+   For example, 5 -> 5, 22 -> 17, 99 -> 65, 256 -> 256, 1001 -> 513. */
+static uint64
+InstrProfGetRangeRepValue(uint64 Value)
+{
+    uint32 top_bit;
+
+    /* The first ranges are individually tracked. Use the value as is. */
+    if (Value <= 8)
+        return Value;
+
+    /* The last range is mapped to its lowest value. */
+    if (Value >= 513)
+        return 513;
+
+    /* If it's a power of two, use it as is. */
+    if (popcount64(Value) == 1)
+        return Value;
+
+    /* Otherwise, take to the previous power of two + 1. */
+    top_bit = 64 - clz64(Value) - 1;
+    return (((uint64)1) << top_bit) + 1;
+}
+
+/* Record a single observation of a memop size at value site counter_idx of
+   `data`; the size is first reduced to the representative value of its
+   range */
+void
+llvm_profile_instrument_memop(uint64 target_value, void *data,
+                              uint32 counter_idx)
+{
+    instrumentTargetValueImpl(InstrProfGetRangeRepValue(target_value), data,
+                              counter_idx, 1);
+}
+
+/* Compute the size in bytes of the raw profile built from the module's
+   "__llvm_prf_data", "__llvm_prf_cnts" and "__llvm_prf_names" data sections:
+   header + one LLVMProfileData_64 per prof data section + counters + names
+   (padded to a multiple of 8 bytes) + value profile data.
+
+   Every p_* argument is optional (may be NULL) and receives the matching
+   intermediate result. *p_padding_size is sizeof(uint64), not 0, when the
+   names size is already a multiple of 8.
+
+   Returns the total size, or 0 - without writing any output argument - when
+   the counter sections' size does not equal num_prof_counters * 8. */
+static uint32
+get_pgo_prof_data_size(AOTModuleInstance *module_inst, uint32 *p_num_prof_data,
+                       uint32 *p_num_prof_counters, uint32 *p_padding_size,
+                       uint32 *p_prof_counters_size, uint32 *p_prof_names_size,
+                       uint32 *p_value_counters_size, uint8 **p_prof_names)
+{
+    AOTModule *module = (AOTModule *)module_inst->module;
+    LLVMProfileData *prof_data;
+    uint8 *prof_names = NULL;
+    uint32 num_prof_data = 0, num_prof_counters = 0, padding_size, i;
+    uint32 prof_counters_size = 0, prof_names_size = 0;
+    uint32 total_size, total_size_wo_value_counters;
+
+    /* First pass: count prof data records and counters, sum the counter
+       section sizes and locate the names section */
+    for (i = 0; i < module->data_section_count; i++) {
+        if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) {
+            bh_assert(module->data_sections[i].size == sizeof(LLVMProfileData));
+            num_prof_data++;
+            prof_data = (LLVMProfileData *)module->data_sections[i].data;
+            num_prof_counters += prof_data->num_counters;
+        }
+        else if (!strncmp(module->data_sections[i].name, "__llvm_prf_cnts",
+                          15)) {
+            prof_counters_size += module->data_sections[i].size;
+        }
+        else if (!strncmp(module->data_sections[i].name, "__llvm_prf_names",
+                          16)) {
+            prof_names_size = module->data_sections[i].size;
+            prof_names = module->data_sections[i].data;
+        }
+    }
+
+    /* Each counter is a uint64; reject inconsistent sections */
+    if (prof_counters_size != num_prof_counters * sizeof(uint64))
+        return 0;
+
+    total_size = sizeof(LLVMProfileRawHeader)
+                 + num_prof_data * sizeof(LLVMProfileData_64)
+                 + prof_counters_size + prof_names_size;
+    /* Bytes needed to round the names up to 8; equals 8 when none needed */
+    padding_size = sizeof(uint64) - (prof_names_size % sizeof(uint64));
+    if (padding_size != sizeof(uint64))
+        total_size += padding_size;
+
+    /* Total size excluding value counters */
+    total_size_wo_value_counters = total_size;
+
+    /* Second pass: add the value profile data of every prof data record
+       that has at least one value site */
+    for (i = 0; i < module->data_section_count; i++) {
+        if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) {
+            uint32 j, k, num_value_sites, num_value_nodes;
+            ValueProfNode **values, *value_node;
+
+            prof_data = (LLVMProfileData *)module->data_sections[i].data;
+            values = prof_data->values;
+
+            if (prof_data->num_value_sites[0] > 0
+                || prof_data->num_value_sites[1] > 0) {
+                /* TotalSize (uint32) and NumValueKinds (uint32) */
+                total_size += 8;
+                for (j = 0; j < 2; j++) {
+                    if ((num_value_sites = prof_data->num_value_sites[j]) > 0) {
+                        /* ValueKind (uint32) and NumValueSites (uint32) */
+                        total_size += 8;
+                        /* (Value + Counter) group counts of each value site,
+                           each count is one byte */
+                        total_size += align_uint(num_value_sites, 8);
+
+                        /* `values` keeps advancing across both kinds, so
+                           kind 1 continues after the sites of kind 0 */
+                        if (values) {
+                            for (k = 0; k < num_value_sites; k++) {
+                                num_value_nodes = 0;
+                                value_node = *values;
+                                while (value_node) {
+                                    num_value_nodes++;
+                                    value_node = value_node->next;
+                                }
+                                if (num_value_nodes) {
+                                    /* (Value + Counter) groups */
+                                    total_size += num_value_nodes * 8 * 2;
+                                }
+                                values++;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if (p_num_prof_data)
+        *p_num_prof_data = num_prof_data;
+    if (p_num_prof_counters)
+        *p_num_prof_counters = num_prof_counters;
+    if (p_padding_size)
+        *p_padding_size = padding_size;
+    if (p_prof_counters_size)
+        *p_prof_counters_size = prof_counters_size;
+    if (p_prof_names_size)
+        *p_prof_names_size = prof_names_size;
+    if (p_value_counters_size)
+        *p_value_counters_size = total_size - total_size_wo_value_counters;
+    if (p_prof_names)
+        *p_prof_names = prof_names;
+
+    return total_size;
+}
+
+/* Size in bytes of the raw PGO profile of module_inst, or 0 when the size
+   of its counter sections does not match its profile data records */
+uint32
+aot_get_pgo_prof_data_size(AOTModuleInstance *module_inst)
+{
+    uint32 raw_profile_size = get_pgo_prof_data_size(
+        module_inst, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+
+    return raw_profile_size;
+}
+
+/* Host byte-order probe: `b` overlays the first byte of `a`, so it reads 1
+   only when the least significant byte of the int is stored first */
+static union {
+    int a;
+    char b;
+} __ue = { .a = 1 };
+
+/* Non-zero when the host is little-endian */
+#define is_little_endian() (__ue.b == 1)
+
+uint32
+aot_dump_pgo_prof_data_to_buf(AOTModuleInstance *module_inst, char *buf,
+                              uint32 len)
+{
+    AOTModule *module = (AOTModule *)module_inst->module;
+    LLVMProfileRawHeader prof_header = { 0 };
+    LLVMProfileData *prof_data;
+    uint8 *prof_names = NULL;
+    uint32 num_prof_data = 0, num_prof_counters = 0, padding_size, i;
+    uint32 prof_counters_size = 0, prof_names_size = 0;
+    uint32 value_counters_size = 0, value_counters_size_backup = 0;
+    uint32 total_size, size;
+    int64 counters_delta, offset_counters;
+
+    total_size = get_pgo_prof_data_size(module_inst, &num_prof_data,
+                                        &num_prof_counters, &padding_size,
+                                        &prof_counters_size, &prof_names_size,
+                                        &value_counters_size, &prof_names);
+    if (len < total_size)
+        return 0;
+
+    value_counters_size_backup = value_counters_size;
+    value_counters_size = 0;
+
+    prof_header.counters_delta = counters_delta =
+        sizeof(LLVMProfileData_64) * num_prof_data;
+    offset_counters = 0;
+    for (i = 0; i < module->data_section_count; i++) {
+        if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) {
+            prof_data = (LLVMProfileData *)module->data_sections[i].data;
+            prof_data->offset_counters = counters_delta + offset_counters;
+            offset_counters += prof_data->num_counters * sizeof(uint64);
+            counters_delta -= sizeof(LLVMProfileData_64);
+        }
+    }
+
+    prof_header.magic = 0xFF6C70726F667281LL;
+    /* Version 8 */
+    prof_header.version = 0x0000000000000008LL;
+    /* with VARIANT_MASK_IR_PROF (IR Instrumentation) */
+    prof_header.version |= 0x1ULL << 56;
+    /* with VARIANT_MASK_MEMPROF (Memory Profile) */
+    prof_header.version |= 0x1ULL << 62;
+    prof_header.num_prof_data = num_prof_data;
+    prof_header.num_prof_counters = num_prof_counters;
+    prof_header.names_size = prof_names_size;
+    prof_header.value_kind_last = 1;
+
+    if (!is_little_endian()) {
+        aot_exchange_uint64((uint8 *)&prof_header.magic);
+        aot_exchange_uint64((uint8 *)&prof_header.version);
+        aot_exchange_uint64((uint8 *)&prof_header.num_prof_data);
+        aot_exchange_uint64((uint8 *)&prof_header.num_prof_counters);
+        aot_exchange_uint64((uint8 *)&prof_header.names_size);
+        aot_exchange_uint64((uint8 *)&prof_header.counters_delta);
+        aot_exchange_uint64((uint8 *)&prof_header.value_kind_last);
+    }
+
+    size = sizeof(LLVMProfileRawHeader);
+    bh_memcpy_s(buf, size, &prof_header, size);
+    buf += size;
+
+    for (i = 0; i < module->data_section_count; i++) {
+        if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) {
+            LLVMProfileData_64 *prof_data_64 = (LLVMProfileData_64 *)buf;
+
+            /* Convert LLVMProfileData to LLVMProfileData_64, the pointer width
+               in the output file is alawys 8 bytes */
+            prof_data = (LLVMProfileData *)module->data_sections[i].data;
+            prof_data_64->func_md5 = prof_data->func_md5;
+            prof_data_64->func_hash = prof_data->func_hash;
+            prof_data_64->offset_counters = prof_data->offset_counters;
+            prof_data_64->func_ptr = prof_data->func_ptr;
+            prof_data_64->values = (uint64)(uintptr_t)prof_data->values;
+            prof_data_64->num_counters = prof_data->num_counters;
+            prof_data_64->num_value_sites[0] = prof_data->num_value_sites[0];
+            prof_data_64->num_value_sites[1] = prof_data->num_value_sites[1];
+
+            if (!is_little_endian()) {
+                aot_exchange_uint64((uint8 *)&prof_data_64->func_hash);
+                aot_exchange_uint64((uint8 *)&prof_data_64->offset_counters);
+                aot_exchange_uint64((uint8 *)&prof_data_64->offset_counters);
+                aot_exchange_uint64((uint8 *)&prof_data_64->func_ptr);
+                aot_exchange_uint64((uint8 *)&prof_data_64->values);
+                aot_exchange_uint32((uint8 *)&prof_data_64->num_counters);
+                aot_exchange_uint16((uint8 *)&prof_data_64->num_value_sites[0]);
+                aot_exchange_uint16((uint8 *)&prof_data_64->num_value_sites[1]);
+            }
+            buf += sizeof(LLVMProfileData_64);
+        }
+    }
+
+    for (i = 0; i < module->data_section_count; i++) {
+        if (!strncmp(module->data_sections[i].name, "__llvm_prf_cnts", 15)) {
+            size = module->data_sections[i].size;
+            bh_memcpy_s(buf, size, module->data_sections[i].data, size);
+            buf += size;
+        }
+    }
+
+    if (prof_names && prof_names_size > 0) {
+        size = prof_names_size;
+        bh_memcpy_s(buf, size, prof_names, size);
+        buf += size;
+        padding_size = sizeof(uint64) - (prof_names_size % sizeof(uint64));
+        if (padding_size != sizeof(uint64)) {
+            char padding_buf[8] = { 0 };
+            bh_memcpy_s(buf, padding_size, padding_buf, padding_size);
+            buf += padding_size;
+        }
+    }
+
+    for (i = 0; i < module->data_section_count; i++) {
+        if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) {
+            uint32 j, k, num_value_sites, num_value_nodes;
+            ValueProfNode **values, **values_tmp, *value_node;
+
+            prof_data = (LLVMProfileData *)module->data_sections[i].data;
+            values = values_tmp = prof_data->values;
+
+            if (prof_data->num_value_sites[0] > 0
+                || prof_data->num_value_sites[1] > 0) {
+                uint32 *buf_total_size = (uint32 *)buf;
+
+                buf += 4; /* emit TotalSize later */
+                *(uint32 *)buf = (prof_data->num_value_sites[0] > 0
+                                  && prof_data->num_value_sites[1] > 0)
+                                     ? 2
+                                     : 1;
+                if (!is_little_endian())
+                    aot_exchange_uint32((uint8 *)buf);
+                buf += 4;
+
+                for (j = 0; j < 2; j++) {
+                    if ((num_value_sites = prof_data->num_value_sites[j]) > 0) {
+                        /* ValueKind */
+                        *(uint32 *)buf = j;
+                        if (!is_little_endian())
+                            aot_exchange_uint32((uint8 *)buf);
+                        buf += 4;
+                        /* NumValueSites */
+                        *(uint32 *)buf = num_value_sites;
+                        if (!is_little_endian())
+                            aot_exchange_uint32((uint8 *)buf);
+                        buf += 4;
+
+                        for (k = 0; k < num_value_sites; k++) {
+                            num_value_nodes = 0;
+                            if (values_tmp) {
+                                value_node = *values_tmp;
+                                while (value_node) {
+                                    num_value_nodes++;
+                                    value_node = value_node->next;
+                                }
+                                values_tmp++;
+                            }
+                            bh_assert(num_value_nodes < 255);
+                            *(uint8 *)buf++ = (uint8)num_value_nodes;
+                        }
+                        if (num_value_sites % 8) {
+                            buf += 8 - (num_value_sites % 8);
+                        }
+
+                        for (k = 0; k < num_value_sites; k++) {
+                            if (values) {
+                                value_node = *values;
+                                while (value_node) {
+                                    *(uint64 *)buf = value_node->value;
+                                    if (!is_little_endian())
+                                        aot_exchange_uint64((uint8 *)buf);
+                                    buf += 8;
+                                    *(uint64 *)buf = value_node->count;
+                                    if (!is_little_endian())
+                                        aot_exchange_uint64((uint8 *)buf);
+                                    buf += 8;
+                                    value_node = value_node->next;
+                                }
+                                values++;
+                            }
+                        }
+                    }
+                }
+
+                /* TotalSize */
+                *(uint32 *)buf_total_size =
+                    (uint8 *)buf - (uint8 *)buf_total_size;
+                if (!is_little_endian())
+                    aot_exchange_uint64((uint8 *)buf_total_size);
+                value_counters_size += (uint8 *)buf - (uint8 *)buf_total_size;
+            }
+        }
+    }
+
+    bh_assert(value_counters_size == value_counters_size_backup);
+    (void)value_counters_size_backup;
+
+    return total_size;
+}
+#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */

+ 98 - 0
core/iwasm/aot/aot_runtime.h

@@ -41,6 +41,10 @@ typedef struct AOTObjectDataSection {
     char *name;
     uint8 *data;
     uint32 size;
+#if WASM_ENABLE_WAMR_COMPILER != 0 || WASM_ENABLE_JIT != 0
+    bool is_name_allocated;
+    bool is_data_allocated;
+#endif
 } AOTObjectDataSection;
 
 /* Relocation info */
@@ -51,6 +55,9 @@ typedef struct AOTRelocation {
     char *symbol_name;
     /* index in the symbol offset field */
     uint32 symbol_index;
+#if WASM_ENABLE_WAMR_COMPILER != 0 || WASM_ENABLE_JIT != 0
+    bool is_symbol_name_allocated;
+#endif
 } AOTRelocation;
 
 /* Relocation Group */
@@ -60,6 +67,9 @@ typedef struct AOTRelocationGroup {
     uint32 name_index;
     uint32 relocation_count;
     AOTRelocation *relocations;
+#if WASM_ENABLE_WAMR_COMPILER != 0 || WASM_ENABLE_JIT != 0
+    bool is_section_name_allocated;
+#endif
 } AOTRelocationGroup;
 
 /* AOT function instance */
@@ -108,6 +118,13 @@ typedef struct AOTUnwindInfo {
 #define PLT_ITEM_SIZE 12
 #endif
 
+#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)
+/* Node of a singly linked list (linked through `next`); each node holds
+   one function index. GOTItemList is a pointer to such a node.
+   NOTE(review): presumably one node is created per function that is
+   referenced through a GOT relocation (R_X86_64_GOTPCREL) -- confirm
+   against the AOT loader. */
+typedef struct GOTItem {
+    uint32 func_idx;
+    struct GOTItem *next;
+} GOTItem, *GOTItemList;
+#endif
+
 typedef struct AOTModule {
     uint32 module_type;
 
@@ -204,6 +221,13 @@ typedef struct AOTModule {
     bool rtl_func_table_registered;
 #endif
 
+#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)
+    uint32 got_item_count;
+    GOTItemList got_item_list;
+    GOTItemList got_item_list_end;
+    void **got_func_ptrs;
+#endif
+
     /* data sections in AOT object file, including .data, .rodata
        and .rodata.cstN. */
     AOTObjectDataSection *data_sections;
@@ -294,6 +318,54 @@ typedef struct AOTFrame {
 #endif
 } AOTFrame;
 
+#if WASM_ENABLE_STATIC_PGO != 0
+/* Header of the dumped PGO profile data: the dump code fills it in and
+   copies it into the output buffer ahead of the profile data records.
+   Every field is 64 bits wide.
+   NOTE(review): the layout presumably mirrors the header of LLVM's raw
+   profile format, version 8 -- confirm against the LLVM version used by
+   wamrc and llvm-profdata. */
+typedef struct LLVMProfileRawHeader {
+    uint64 magic;
+    /* format version number, OR'ed with variant mask bits in the
+       most significant byte */
+    uint64 version;
+    uint64 binary_ids_size;
+    /* number of profile data records */
+    uint64 num_prof_data;
+    uint64 padding_bytes_before_counters;
+    /* number of profile counters */
+    uint64 num_prof_counters;
+    uint64 padding_bytes_after_counters;
+    /* size of the function names data, without padding */
+    uint64 names_size;
+    uint64 counters_delta;
+    uint64 names_delta;
+    uint64 value_kind_last;
+} LLVMProfileRawHeader;
+
+/* Node of a singly linked list of value-profile entries: a profiled
+   value, the count recorded for it, and a link to the next node
+   (a NULL `next` ends the list) */
+typedef struct ValueProfNode {
+    uint64 value;
+    uint64 count;
+    struct ValueProfNode *next;
+} ValueProfNode;
+
+/* The profile data record held in the "__llvm_prf_data" data sections,
+   which are created by the aot compiler and used when profiling; the
+   width of its pointer fields can be 8 bytes (64-bit) or 4 bytes
+   (32-bit) */
+typedef struct LLVMProfileData {
+    uint64 func_md5;
+    uint64 func_hash;
+    uint64 offset_counters;
+    uintptr_t func_ptr;
+    /* array of value-profile list heads, one entry per value site;
+       may be NULL */
+    ValueProfNode **values;
+    uint32 num_counters;
+    /* number of value sites, indexed by value kind (0 or 1) */
+    uint16 num_value_sites[2];
+} LLVMProfileData;
+
+/* The profile data record as written to the output file: the width of
+   its pointer-sized fields is always 8 bytes, on the assumption that
+   wamrc and llvm-profdata are always used in 64-bit mode. The dump code
+   converts each LLVMProfileData record to this layout. */
+typedef struct LLVMProfileData_64 {
+    uint64 func_md5;
+    uint64 func_hash;
+    uint64 offset_counters;
+    uint64 func_ptr;
+    uint64 values;
+    uint32 num_counters;
+    uint16 num_value_sites[2];
+} LLVMProfileData_64;
+#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */
+
 /**
  * Load a AOT module from aot file buffer
  * @param buf the byte buffer which contains the AOT file data
@@ -564,6 +636,32 @@ aot_dump_perf_profiling(const AOTModuleInstance *module_inst);
 const uint8 *
 aot_get_custom_section(const AOTModule *module, const char *name, uint32 *len);
 
+#if WASM_ENABLE_STATIC_PGO != 0
+/**
+ * Value-profiling hook taking a profiled value, an opaque data pointer
+ * and a counter index.
+ * NOTE(review): the implementation is not visible from this header;
+ * presumably it is called from PGO-instrumented AOT code and `data`
+ * points to the function's LLVMProfileData record -- confirm.
+ *
+ * @param target_value the value to record
+ * @param data opaque pointer to the profile data
+ * @param counter_idx index of the counter (value site) to update
+ */
+void
+llvm_profile_instrument_target(uint64 target_value, void *data,
+                               uint32 counter_idx);
+
+/**
+ * Value-profiling hook with the same parameters as
+ * llvm_profile_instrument_target.
+ * NOTE(review): the implementation is not visible from this header;
+ * presumably it is the variant used for memory operation (memop) value
+ * profiling -- confirm.
+ *
+ * @param target_value the value to record
+ * @param data opaque pointer to the profile data
+ * @param counter_idx index of the counter (value site) to update
+ */
+void
+llvm_profile_instrument_memop(uint64 target_value, void *data,
+                              uint32 counter_idx);
+
+/**
+ * Get the size of the PGO profile data of an AOT module instance.
+ * NOTE(review): presumably this is the number of bytes needed by the
+ * buffer passed to aot_dump_pgo_prof_data_to_buf -- confirm.
+ *
+ * @param module_inst the AOT module instance
+ *
+ * @return the size of the profile data
+ */
+uint32
+aot_get_pgo_prof_data_size(AOTModuleInstance *module_inst);
+
+/**
+ * Dump the PGO profile data of an AOT module instance into a buffer.
+ * The data is written in this order: an LLVMProfileRawHeader, the
+ * profile data records converted to LLVMProfileData_64, the contents
+ * of the counter sections, the function names (zero-padded to a
+ * multiple of 8 bytes) and the value profile data. Multi-byte fields
+ * are passed through aot_exchange_uintN when the host is not little
+ * endian.
+ * NOTE(review): how `len` is checked and what is returned on failure
+ * are not visible here -- confirm against the implementation.
+ *
+ * @param module_inst the AOT module instance
+ * @param buf the buffer to write the profile data to
+ * @param len the length of buf
+ *
+ * @return the total size of the profile data (presumably the number
+ *         of bytes written)
+ */
+uint32
+aot_dump_pgo_prof_data_to_buf(AOTModuleInstance *module_inst, char *buf,
+                              uint32 len);
+
+/**
+ * Exchange the bytes of the 16-bit value at p_data; the profile dump
+ * code applies it to 16-bit fields when the host is not little endian.
+ * NOTE(review): presumably an in-place byte-order reversal -- the
+ * implementation is not visible here, confirm.
+ *
+ * @param p_data pointer to the 16-bit value
+ */
+void
+aot_exchange_uint16(uint8 *p_data);
+
+/**
+ * Exchange the bytes of the 32-bit value at p_data; the profile dump
+ * code applies it to 32-bit fields when the host is not little endian.
+ * NOTE(review): presumably an in-place byte-order reversal -- the
+ * implementation is not visible here, confirm.
+ *
+ * @param p_data pointer to the 32-bit value
+ */
+void
+aot_exchange_uint32(uint8 *p_data);
+
+/**
+ * Exchange the bytes of the 64-bit value at p_data; the profile dump
+ * code applies it to 64-bit fields when the host is not little endian.
+ * NOTE(review): presumably an in-place byte-order reversal -- the
+ * implementation is not visible here, confirm.
+ *
+ * @param p_data pointer to the 64-bit value
+ */
+void
+aot_exchange_uint64(uint8 *p_data);
+#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */
+
 #ifdef __cplusplus
 } /* end of extern "C" */
 #endif

+ 6 - 0
core/iwasm/aot/arch/aot_reloc_x86_32.c

@@ -8,6 +8,9 @@
 #define R_386_32 1    /* Direct 32 bit  */
 #define R_386_PC32 2  /* PC relative 32 bit */
 #define R_386_PLT32 4 /* 32-bit address ProcedureLinkageTable */
+#define R_386_TLS_GD_32                      \
+    24 /*  Direct 32 bit for general dynamic \
+           thread local data */
 
 #if !defined(_WIN32) && !defined(_WIN32_)
 /* clang-format off */
@@ -110,6 +113,9 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr,
 {
     switch (reloc_type) {
         case R_386_32:
+#if WASM_ENABLE_STATIC_PGO != 0
+        case R_386_TLS_GD_32:
+#endif
         {
             intptr_t value;
 

+ 18 - 5
core/iwasm/aot/arch/aot_reloc_x86_64.c

@@ -6,11 +6,13 @@
 #include "aot_reloc.h"
 
 #if !defined(BH_PLATFORM_WINDOWS)
-#define R_X86_64_64 1    /* Direct 64 bit  */
-#define R_X86_64_PC32 2  /* PC relative 32 bit signed */
-#define R_X86_64_PLT32 4 /* 32 bit PLT address */
-#define R_X86_64_32 10   /* Direct 32 bit zero extended */
-#define R_X86_64_32S 11  /* Direct 32 bit sign extended */
+#define R_X86_64_64 1       /* Direct 64 bit  */
+#define R_X86_64_PC32 2     /* PC relative 32 bit signed */
+#define R_X86_64_PLT32 4    /* 32 bit PLT address */
+#define R_X86_64_GOTPCREL 9 /* 32 bit signed PC relative offset to GOT */
+#define R_X86_64_32 10      /* Direct 32 bit zero extended */
+#define R_X86_64_32S 11     /* Direct 32 bit sign extended */
+#define R_X86_64_PC64 24    /* PC relative 64 bit */
 #else
 #ifndef IMAGE_REL_AMD64_ADDR64
 #define IMAGE_REL_AMD64_ADDR64 1 /* The 64-bit VA of the relocation target */
@@ -164,6 +166,7 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr,
 #endif
 #if !defined(BH_PLATFORM_WINDOWS)
         case R_X86_64_PC32:
+        case R_X86_64_GOTPCREL: /* GOT + G has been calculated as symbol_addr */
         {
             intptr_t target_addr = (intptr_t) /* S + A - P */
                 ((uintptr_t)symbol_addr + reloc_addend
@@ -182,6 +185,16 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr,
             *(int32 *)(target_section_addr + reloc_offset) = (int32)target_addr;
             break;
         }
+        case R_X86_64_PC64:
+        {
+            intptr_t target_addr = (intptr_t) /* S + A - P */
+                ((uintptr_t)symbol_addr + reloc_addend
+                 - (uintptr_t)(target_section_addr + reloc_offset));
+
+            CHECK_RELOC_OFFSET(sizeof(int64));
+            *(int64 *)(target_section_addr + reloc_offset) = (int64)target_addr;
+            break;
+        }
         case R_X86_64_32:
         case R_X86_64_32S:
         {

+ 27 - 0
core/iwasm/common/wasm_runtime_common.c

@@ -5033,6 +5033,33 @@ wasm_runtime_dump_call_stack_to_buf(wasm_exec_env_t exec_env, char *buf,
 }
 #endif /* end of WASM_ENABLE_DUMP_CALL_STACK */
 
+#if WASM_ENABLE_STATIC_PGO != 0
+/* Get the size of the static PGO profile data of a module instance:
+   for an AOT instance the result of aot_get_pgo_prof_data_size() is
+   returned, for anything else (or when AOT is disabled) 0 is returned */
+uint32
+wasm_runtime_get_pgo_prof_data_size(WASMModuleInstanceCommon *module_inst)
+{
+#if WASM_ENABLE_AOT != 0
+    /* only AOT module instances are handled */
+    if (module_inst->module_type != Wasm_Module_AoT)
+        return 0;
+
+    return aot_get_pgo_prof_data_size((AOTModuleInstance *)module_inst);
+#else
+    return 0;
+#endif
+}
+
+/* Dump the static PGO profile data of a module instance into buf:
+   for an AOT instance the result of aot_dump_pgo_prof_data_to_buf()
+   is returned, for anything else (or when AOT is disabled) 0 is
+   returned */
+uint32
+wasm_runtime_dump_pgo_prof_data_to_buf(WASMModuleInstanceCommon *module_inst,
+                                       char *buf, uint32 len)
+{
+#if WASM_ENABLE_AOT != 0
+    /* only AOT module instances are handled */
+    if (module_inst->module_type != Wasm_Module_AoT)
+        return 0;
+
+    return aot_dump_pgo_prof_data_to_buf((AOTModuleInstance *)module_inst,
+                                         buf, len);
+#else
+    return 0;
+#endif
+}
+#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */
+
 bool
 wasm_runtime_get_table_elem_type(const WASMModuleCommon *module_comm,
                                  uint32 table_idx, uint8 *out_elem_type,

+ 284 - 32
core/iwasm/compilation/aot_emit_aot_file.c

@@ -111,6 +111,8 @@ typedef struct AOTSymbolList {
 
 /* AOT object data */
 typedef struct AOTObjectData {
+    AOTCompContext *comp_ctx;
+
     LLVMMemoryBufferRef mem_buf;
     LLVMBinaryRef binary;
 
@@ -119,6 +121,12 @@ typedef struct AOTObjectData {
     void *text;
     uint32 text_size;
 
+    void *text_unlikely;
+    uint32 text_unlikely_size;
+
+    void *text_hot;
+    uint32 text_hot_size;
+
     /* literal data and size */
     void *literal;
     uint32 literal_size;
@@ -558,8 +566,10 @@ get_init_data_section_size(AOTCompContext *comp_ctx, AOTCompData *comp_data,
 static uint32
 get_text_section_size(AOTObjectData *obj_data)
 {
-    return (sizeof(uint32) + obj_data->literal_size + obj_data->text_size + 3)
-           & ~3;
+    return sizeof(uint32) + align_uint(obj_data->literal_size, 4)
+           + align_uint(obj_data->text_size, 4)
+           + align_uint(obj_data->text_unlikely_size, 4)
+           + align_uint(obj_data->text_hot_size, 4);
 }
 
 static uint32
@@ -1702,12 +1712,28 @@ aot_emit_text_section(uint8 *buf, uint8 *buf_end, uint32 *p_offset,
     EMIT_U32(AOT_SECTION_TYPE_TEXT);
     EMIT_U32(section_size);
     EMIT_U32(obj_data->literal_size);
-    if (obj_data->literal_size > 0)
+
+    if (obj_data->literal_size > 0) {
         EMIT_BUF(obj_data->literal, obj_data->literal_size);
-    EMIT_BUF(obj_data->text, obj_data->text_size);
+        while (offset & 3)
+            EMIT_BUF(&placeholder, 1);
+    }
 
-    while (offset & 3)
-        EMIT_BUF(&placeholder, 1);
+    if (obj_data->text_size > 0) {
+        EMIT_BUF(obj_data->text, obj_data->text_size);
+        while (offset & 3)
+            EMIT_BUF(&placeholder, 1);
+    }
+    if (obj_data->text_unlikely_size > 0) {
+        EMIT_BUF(obj_data->text_unlikely, obj_data->text_unlikely_size);
+        while (offset & 3)
+            EMIT_BUF(&placeholder, 1);
+    }
+    if (obj_data->text_hot_size > 0) {
+        EMIT_BUF(obj_data->text_hot, obj_data->text_hot_size);
+        while (offset & 3)
+            EMIT_BUF(&placeholder, 1);
+    }
 
     if (offset - *p_offset != section_size + sizeof(uint32) * 2) {
         aot_set_last_error("emit text section failed.");
@@ -2211,11 +2237,23 @@ aot_resolve_text(AOTObjectData *obj_data)
         }
         while (
             !LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) {
-            if ((name = (char *)LLVMGetSectionName(sec_itr))
-                && !strcmp(name, ".text")) {
-                obj_data->text = (char *)LLVMGetSectionContents(sec_itr);
-                obj_data->text_size = (uint32)LLVMGetSectionSize(sec_itr);
-                break;
+            if ((name = (char *)LLVMGetSectionName(sec_itr))) {
+                if (!strcmp(name, ".text")) {
+                    obj_data->text = (char *)LLVMGetSectionContents(sec_itr);
+                    obj_data->text_size = (uint32)LLVMGetSectionSize(sec_itr);
+                }
+                else if (!strcmp(name, ".text.unlikely.")) {
+                    obj_data->text_unlikely =
+                        (char *)LLVMGetSectionContents(sec_itr);
+                    obj_data->text_unlikely_size =
+                        (uint32)LLVMGetSectionSize(sec_itr);
+                }
+                else if (!strcmp(name, ".text.hot.")) {
+                    obj_data->text_hot =
+                        (char *)LLVMGetSectionContents(sec_itr);
+                    obj_data->text_hot_size =
+                        (uint32)LLVMGetSectionSize(sec_itr);
+                }
             }
             LLVMMoveToNextSection(sec_itr);
         }
@@ -2253,7 +2291,8 @@ static bool
 get_relocations_count(LLVMSectionIteratorRef sec_itr, uint32 *p_count);
 
 static bool
-is_data_section(LLVMSectionIteratorRef sec_itr, char *section_name)
+is_data_section(AOTObjectData *obj_data, LLVMSectionIteratorRef sec_itr,
+                char *section_name)
 {
     uint32 relocation_count = 0;
 
@@ -2265,7 +2304,11 @@ is_data_section(LLVMSectionIteratorRef sec_itr, char *section_name)
             || !strncmp(section_name, ".rodata.str", strlen(".rodata.str"))
             || (!strcmp(section_name, ".rdata")
                 && get_relocations_count(sec_itr, &relocation_count)
-                && relocation_count > 0));
+                && relocation_count > 0)
+            || (obj_data->comp_ctx->enable_llvm_pgo
+                && (!strncmp(section_name, "__llvm_prf_cnts", 15)
+                    || !strncmp(section_name, "__llvm_prf_data", 15)
+                    || !strncmp(section_name, "__llvm_prf_names", 16))));
 }
 
 static bool
@@ -2281,7 +2324,7 @@ get_object_data_sections_count(AOTObjectData *obj_data, uint32 *p_count)
     }
     while (!LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) {
         if ((name = (char *)LLVMGetSectionName(sec_itr))
-            && (is_data_section(sec_itr, name))) {
+            && (is_data_section(obj_data, sec_itr, name))) {
             count++;
         }
         LLVMMoveToNextSection(sec_itr);
@@ -2306,6 +2349,9 @@ aot_resolve_object_data_sections(AOTObjectData *obj_data)
     }
 
     if (sections_count > 0) {
+        uint32 llvm_prf_cnts_idx = 0, llvm_prf_data_idx = 0;
+        char buf[32];
+
         size = (uint32)sizeof(AOTObjectDataSection) * sections_count;
         if (!(data_section = obj_data->data_sections =
                   wasm_runtime_malloc(size))) {
@@ -2322,10 +2368,46 @@ aot_resolve_object_data_sections(AOTObjectData *obj_data)
         while (
             !LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) {
             if ((name = (char *)LLVMGetSectionName(sec_itr))
-                && (is_data_section(sec_itr, name))) {
+                && (is_data_section(obj_data, sec_itr, name))) {
                 data_section->name = name;
-                data_section->data = (uint8 *)LLVMGetSectionContents(sec_itr);
-                data_section->size = (uint32)LLVMGetSectionSize(sec_itr);
+                if (obj_data->comp_ctx->enable_llvm_pgo
+                    && !strcmp(name, "__llvm_prf_cnts")) {
+                    snprintf(buf, sizeof(buf), "%s%u", name,
+                             llvm_prf_cnts_idx++);
+                    size = strlen(buf) + 1;
+                    if (!(data_section->name = wasm_runtime_malloc(size))) {
+                        aot_set_last_error(
+                            "allocate memory for data section name failed.");
+                        return false;
+                    }
+                    bh_memcpy_s(data_section->name, size, buf, size);
+                    data_section->is_name_allocated = true;
+                }
+                else if (obj_data->comp_ctx->enable_llvm_pgo
+                         && !strcmp(name, "__llvm_prf_data")) {
+                    snprintf(buf, sizeof(buf), "%s%u", name,
+                             llvm_prf_data_idx++);
+                    size = strlen(buf) + 1;
+                    if (!(data_section->name = wasm_runtime_malloc(size))) {
+                        aot_set_last_error(
+                            "allocate memory for data section name failed.");
+                        return false;
+                    }
+                    bh_memcpy_s(data_section->name, size, buf, size);
+                    data_section->is_name_allocated = true;
+                }
+
+                if (obj_data->comp_ctx->enable_llvm_pgo
+                    && !strcmp(name, "__llvm_prf_names")) {
+                    data_section->data = (uint8 *)aot_compress_aot_func_names(
+                        obj_data->comp_ctx, &data_section->size);
+                    data_section->is_data_allocated = true;
+                }
+                else {
+                    data_section->data =
+                        (uint8 *)LLVMGetSectionContents(sec_itr);
+                    data_section->size = (uint32)LLVMGetSectionSize(sec_itr);
+                }
                 data_section++;
             }
             LLVMMoveToNextSection(sec_itr);
@@ -2365,9 +2447,36 @@ aot_resolve_functions(AOTCompContext *comp_ctx, AOTObjectData *obj_data)
             && str_starts_with(name, prefix)) {
             func_index = (uint32)atoi(name + strlen(prefix));
             if (func_index < obj_data->func_count) {
+                LLVMSectionIteratorRef contain_section;
+                char *contain_section_name;
+
                 func = obj_data->funcs + func_index;
                 func->func_name = name;
-                func->text_offset = LLVMGetSymbolAddress(sym_itr);
+
+                if (!(contain_section = LLVMObjectFileCopySectionIterator(
+                          obj_data->binary))) {
+                    aot_set_last_error("llvm get section iterator failed.");
+                    LLVMDisposeSymbolIterator(sym_itr);
+                    return false;
+                }
+                LLVMMoveToContainingSection(contain_section, sym_itr);
+                contain_section_name =
+                    (char *)LLVMGetSectionName(contain_section);
+                LLVMDisposeSectionIterator(contain_section);
+
+                if (!strcmp(contain_section_name, ".text.unlikely.")) {
+                    func->text_offset = align_uint(obj_data->text_size, 4)
+                                        + LLVMGetSymbolAddress(sym_itr);
+                }
+                else if (!strcmp(contain_section_name, ".text.hot.")) {
+                    func->text_offset =
+                        align_uint(obj_data->text_size, 4)
+                        + align_uint(obj_data->text_unlikely_size, 4)
+                        + LLVMGetSymbolAddress(sym_itr);
+                }
+                else {
+                    func->text_offset = LLVMGetSymbolAddress(sym_itr);
+                }
             }
         }
         LLVMMoveToNextSymbol(sym_itr);
@@ -2478,9 +2587,86 @@ aot_resolve_object_relocation_group(AOTObjectData *obj_data,
         }
 
         /* set relocation fields */
-        relocation->relocation_offset = offset;
         relocation->relocation_type = (uint32)type;
         relocation->symbol_name = (char *)LLVMGetSymbolName(rel_sym);
+        relocation->relocation_offset = offset;
+        if (!strcmp(group->section_name, ".rela.text.unlikely.")
+            || !strcmp(group->section_name, ".rel.text.unlikely.")) {
+            relocation->relocation_offset += align_uint(obj_data->text_size, 4);
+        }
+        else if (!strcmp(group->section_name, ".rela.text.hot.")
+                 || !strcmp(group->section_name, ".rel.text.hot.")) {
+            relocation->relocation_offset +=
+                align_uint(obj_data->text_size, 4)
+                + align_uint(obj_data->text_unlikely_size, 4);
+        }
+        if (!strcmp(relocation->symbol_name, ".text.unlikely.")) {
+            relocation->symbol_name = ".text";
+            relocation->relocation_addend += align_uint(obj_data->text_size, 4);
+        }
+        if (!strcmp(relocation->symbol_name, ".text.hot.")) {
+            relocation->symbol_name = ".text";
+            relocation->relocation_addend +=
+                align_uint(obj_data->text_size, 4)
+                + align_uint(obj_data->text_unlikely_size, 4);
+        }
+
+        if (obj_data->comp_ctx->enable_llvm_pgo
+            && (!strcmp(relocation->symbol_name, "__llvm_prf_cnts")
+                || !strcmp(relocation->symbol_name, "__llvm_prf_data"))) {
+            LLVMSectionIteratorRef sec_itr;
+            char buf[32], *section_name;
+            uint32 prof_section_idx = 0;
+
+            if (!(sec_itr =
+                      LLVMObjectFileCopySectionIterator(obj_data->binary))) {
+                aot_set_last_error("llvm get section iterator failed.");
+                LLVMDisposeSymbolIterator(rel_sym);
+                goto fail;
+            }
+            while (!LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary,
+                                                         sec_itr)) {
+                section_name = (char *)LLVMGetSectionName(sec_itr);
+                if (section_name
+                    && !strcmp(section_name, relocation->symbol_name)) {
+                    if (LLVMGetSectionContainsSymbol(sec_itr, rel_sym))
+                        break;
+                    prof_section_idx++;
+                }
+                LLVMMoveToNextSection(sec_itr);
+            }
+            LLVMDisposeSectionIterator(sec_itr);
+
+            if (!strcmp(group->section_name, ".rela.text")
+                || !strcmp(group->section_name, ".rel.text")) {
+                snprintf(buf, sizeof(buf), "%s%u", relocation->symbol_name,
+                         prof_section_idx);
+                size = strlen(buf) + 1;
+                if (!(relocation->symbol_name = wasm_runtime_malloc(size))) {
+                    aot_set_last_error(
+                        "allocate memory for relocation symbol name failed.");
+                    LLVMDisposeSymbolIterator(rel_sym);
+                    goto fail;
+                }
+                bh_memcpy_s(relocation->symbol_name, size, buf, size);
+                relocation->is_symbol_name_allocated = true;
+            }
+            else if (!strncmp(group->section_name, ".rela__llvm_prf_data", 20)
+                     || !strncmp(group->section_name, ".rel__llvm_prf_data",
+                                 19)) {
+                snprintf(buf, sizeof(buf), "%s%u", relocation->symbol_name,
+                         prof_section_idx);
+                size = strlen(buf) + 1;
+                if (!(relocation->symbol_name = wasm_runtime_malloc(size))) {
+                    aot_set_last_error(
+                        "allocate memory for relocation symbol name failed.");
+                    LLVMDisposeSymbolIterator(rel_sym);
+                    goto fail;
+                }
+                bh_memcpy_s(relocation->symbol_name, size, buf, size);
+                relocation->is_symbol_name_allocated = true;
+            }
+        }
 
         /* for ".LCPIxxx", ".LJTIxxx", ".LBBxxx" and switch lookup table
          * relocation, transform the symbol name to real section name and set
@@ -2525,10 +2711,14 @@ fail:
 }
 
 static bool
-is_relocation_section_name(char *section_name)
+is_relocation_section_name(AOTObjectData *obj_data, char *section_name)
 {
     return (!strcmp(section_name, ".rela.text")
             || !strcmp(section_name, ".rel.text")
+            || !strcmp(section_name, ".rela.text.unlikely.")
+            || !strcmp(section_name, ".rel.text.unlikely.")
+            || !strcmp(section_name, ".rela.text.hot.")
+            || !strcmp(section_name, ".rel.text.hot.")
             || !strcmp(section_name, ".rela.literal")
             || !strcmp(section_name, ".rela.data")
             || !strcmp(section_name, ".rel.data")
@@ -2536,6 +2726,9 @@ is_relocation_section_name(char *section_name)
             || !strcmp(section_name, ".rel.sdata")
             || !strcmp(section_name, ".rela.rodata")
             || !strcmp(section_name, ".rel.rodata")
+            || (obj_data->comp_ctx->enable_llvm_pgo
+                && (!strcmp(section_name, ".rela__llvm_prf_data")
+                    || !strcmp(section_name, ".rel__llvm_prf_data")))
             /* ".rela.rodata.cst4/8/16/.." */
             || !strncmp(section_name, ".rela.rodata.cst",
                         strlen(".rela.rodata.cst"))
@@ -2545,14 +2738,15 @@ is_relocation_section_name(char *section_name)
 }
 
 static bool
-is_relocation_section(LLVMSectionIteratorRef sec_itr)
+is_relocation_section(AOTObjectData *obj_data, LLVMSectionIteratorRef sec_itr)
 {
     uint32 count = 0;
     char *name = (char *)LLVMGetSectionName(sec_itr);
     if (name) {
-        if (is_relocation_section_name(name))
+        if (is_relocation_section_name(obj_data, name))
             return true;
-        else if ((!strcmp(name, ".text") || !strcmp(name, ".rdata"))
+        else if ((!strcmp(name, ".text") || !strcmp(name, ".text.unlikely.")
+                  || !strcmp(name, ".text.hot.") || !strcmp(name, ".rdata"))
                  && get_relocations_count(sec_itr, &count) && count > 0)
             return true;
     }
@@ -2570,7 +2764,7 @@ get_relocation_groups_count(AOTObjectData *obj_data, uint32 *p_count)
         return false;
     }
     while (!LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) {
-        if (is_relocation_section(sec_itr)) {
+        if (is_relocation_section(obj_data, sec_itr)) {
             count++;
         }
         LLVMMoveToNextSection(sec_itr);
@@ -2586,7 +2780,7 @@ aot_resolve_object_relocation_groups(AOTObjectData *obj_data)
 {
     LLVMSectionIteratorRef sec_itr;
     AOTRelocationGroup *relocation_group;
-    uint32 group_count;
+    uint32 group_count, llvm_prf_data_idx = 0;
     char *name;
     uint32 size;
 
@@ -2612,14 +2806,50 @@ aot_resolve_object_relocation_groups(AOTObjectData *obj_data)
         return false;
     }
     while (!LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) {
-        if (is_relocation_section(sec_itr)) {
+        if (is_relocation_section(obj_data, sec_itr)) {
             name = (char *)LLVMGetSectionName(sec_itr);
             relocation_group->section_name = name;
+
+            if (obj_data->comp_ctx->enable_llvm_pgo
+                && (!strcmp(name, ".rela__llvm_prf_data")
+                    || !strcmp(name, ".rel__llvm_prf_data"))) {
+                char buf[32];
+                snprintf(buf, sizeof(buf), "%s%u", name, llvm_prf_data_idx);
+                size = strlen(buf) + 1;
+                if (!(relocation_group->section_name =
+                          wasm_runtime_malloc(size))) {
+                    aot_set_last_error(
+                        "allocate memory for section name failed.");
+                    LLVMDisposeSectionIterator(sec_itr);
+                    return false;
+                }
+                bh_memcpy_s(relocation_group->section_name, size, buf, size);
+                relocation_group->is_section_name_allocated = true;
+            }
+
             if (!aot_resolve_object_relocation_group(obj_data, relocation_group,
                                                      sec_itr)) {
                 LLVMDisposeSectionIterator(sec_itr);
                 return false;
             }
+
+            if (obj_data->comp_ctx->enable_llvm_pgo
+                && (!strcmp(name, ".rela__llvm_prf_data")
+                    || !strcmp(name, ".rel__llvm_prf_data"))) {
+                llvm_prf_data_idx++;
+            }
+
+            if (!strcmp(relocation_group->section_name, ".rela.text.unlikely.")
+                || !strcmp(relocation_group->section_name, ".rela.text.hot.")) {
+                relocation_group->section_name = ".rela.text";
+            }
+            else if (!strcmp(relocation_group->section_name,
+                             ".rel.text.unlikely.")
+                     || !strcmp(relocation_group->section_name,
+                                ".rel.text.hot.")) {
+                relocation_group->section_name = ".rel.text";
+            }
+
             relocation_group++;
         }
         LLVMMoveToNextSection(sec_itr);
@@ -2633,12 +2863,21 @@ static void
 destroy_relocation_groups(AOTRelocationGroup *relocation_groups,
                           uint32 relocation_group_count)
 {
-    uint32 i;
+    uint32 i, j;
     AOTRelocationGroup *relocation_group = relocation_groups;
 
-    for (i = 0; i < relocation_group_count; i++, relocation_group++)
-        if (relocation_group->relocations)
+    for (i = 0; i < relocation_group_count; i++, relocation_group++) {
+        if (relocation_group->relocations) {
+            for (j = 0; j < relocation_group->relocation_count; j++) {
+                if (relocation_group->relocations[j].is_symbol_name_allocated)
+                    wasm_runtime_free(
+                        relocation_group->relocations[j].symbol_name);
+            }
             wasm_runtime_free(relocation_group->relocations);
+        }
+        if (relocation_group->is_section_name_allocated)
+            wasm_runtime_free(relocation_group->section_name);
+    }
     wasm_runtime_free(relocation_groups);
 }
 
@@ -2664,8 +2903,20 @@ aot_obj_data_destroy(AOTObjectData *obj_data)
         LLVMDisposeMemoryBuffer(obj_data->mem_buf);
     if (obj_data->funcs)
         wasm_runtime_free(obj_data->funcs);
-    if (obj_data->data_sections)
+    if (obj_data->data_sections) {
+        uint32 i;
+        for (i = 0; i < obj_data->data_sections_count; i++) {
+            if (obj_data->data_sections[i].name
+                && obj_data->data_sections[i].is_name_allocated) {
+                wasm_runtime_free(obj_data->data_sections[i].name);
+            }
+            if (obj_data->data_sections[i].data
+                && obj_data->data_sections[i].is_data_allocated) {
+                wasm_runtime_free(obj_data->data_sections[i].data);
+            }
+        }
         wasm_runtime_free(obj_data->data_sections);
+    }
     if (obj_data->relocation_groups)
         destroy_relocation_groups(obj_data->relocation_groups,
                                   obj_data->relocation_group_count);
@@ -2688,6 +2939,7 @@ aot_obj_data_create(AOTCompContext *comp_ctx)
         return false;
     }
     memset(obj_data, 0, sizeof(AOTObjectData));
+    obj_data->comp_ctx = comp_ctx;
 
     bh_print_time("Begin to emit object file");
     if (comp_ctx->external_llc_compiler || comp_ctx->external_asm_compiler) {
@@ -2821,8 +3073,8 @@ aot_obj_data_create(AOTCompContext *comp_ctx)
     if (!aot_resolve_target_info(comp_ctx, obj_data)
         || !aot_resolve_text(obj_data) || !aot_resolve_literal(obj_data)
         || !aot_resolve_object_data_sections(obj_data)
-        || !aot_resolve_object_relocation_groups(obj_data)
-        || !aot_resolve_functions(comp_ctx, obj_data))
+        || !aot_resolve_functions(comp_ctx, obj_data)
+        || !aot_resolve_object_relocation_groups(obj_data))
         goto fail;
 
     return obj_data;

+ 26 - 0
core/iwasm/compilation/aot_llvm.c

@@ -1670,6 +1670,12 @@ aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option)
     if (option->disable_llvm_lto)
         comp_ctx->disable_llvm_lto = true;
 
+    if (option->enable_llvm_pgo)
+        comp_ctx->enable_llvm_pgo = true;
+
+    if (option->use_prof_file)
+        comp_ctx->use_prof_file = option->use_prof_file;
+
     if (option->enable_stack_estimation)
         comp_ctx->enable_stack_estimation = true;
 
@@ -2829,3 +2835,23 @@ aot_load_const_from_table(AOTCompContext *comp_ctx, LLVMValueRef base,
     (void)const_type;
     return const_value;
 }
+
+/**
+ * Attach `branch_weights` profile metadata to a conditional branch
+ *
+ * @param comp_ctx the compilation context, its LLVM context is used to
+ *        create the metadata nodes
+ * @param cond_br the conditional branch instruction to annotate
+ * @param weights_true the weight stored as the first branch weight
+ * @param weights_false the weight stored as the second branch weight
+ *
+ * @return always true
+ */
+bool
+aot_set_cond_br_weights(AOTCompContext *comp_ctx, LLVMValueRef cond_br,
+                        int32 weights_true, int32 weights_false)
+{
+    LLVMMetadataRef md_nodes[3], meta_data;
+    LLVMValueRef meta_data_as_value;
+    unsigned prof_kind_id;
+
+    /* Query the ID of the "prof" metadata kind by name instead of
+       hard-coding its numeric value */
+    prof_kind_id = LLVMGetMDKindIDInContext(comp_ctx->context, "prof",
+                                            (unsigned)strlen("prof"));
+
+    /* Build !{!"branch_weights", i32 weights_true, i32 weights_false} */
+    md_nodes[0] = LLVMMDStringInContext2(comp_ctx->context, "branch_weights",
+                                         strlen("branch_weights"));
+    md_nodes[1] = LLVMValueAsMetadata(I32_CONST(weights_true));
+    md_nodes[2] = LLVMValueAsMetadata(I32_CONST(weights_false));
+
+    meta_data = LLVMMDNodeInContext2(comp_ctx->context, md_nodes, 3);
+    meta_data_as_value = LLVMMetadataAsValue(comp_ctx->context, meta_data);
+
+    LLVMSetMetadata(cond_br, prof_kind_id, meta_data_as_value);
+
+    return true;
+}

+ 15 - 0
core/iwasm/compilation/aot_llvm.h

@@ -349,6 +349,12 @@ typedef struct AOTCompContext {
     /* Disable LLVM link time optimization */
     bool disable_llvm_lto;
 
+    /* Enable LLVM PGO (Profile-Guided Optimization) */
+    bool enable_llvm_pgo;
+
+    /* Use profile file collected by LLVM PGO */
+    char *use_prof_file;
+
     /* Enable to use segument register as the base addr
        of linear memory for load/store operations */
     bool enable_segue_i32_load;
@@ -428,7 +434,9 @@ typedef struct AOTCompOption {
     bool enable_aux_stack_frame;
     bool disable_llvm_intrinsics;
     bool disable_llvm_lto;
+    bool enable_llvm_pgo;
     bool enable_stack_estimation;
+    char *use_prof_file;
     uint32 opt_level;
     uint32 size_level;
     uint32 output_format;
@@ -541,6 +549,13 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module);
 void
 aot_handle_llvm_errmsg(const char *string, LLVMErrorRef err);
 
+char *
+aot_compress_aot_func_names(AOTCompContext *comp_ctx, uint32 *p_size);
+
+bool
+aot_set_cond_br_weights(AOTCompContext *comp_ctx, LLVMValueRef cond_br,
+                        int32 weights_true, int32 weights_false);
+
 #ifdef __cplusplus
 } /* end of extern "C" */
 #endif

+ 60 - 7
core/iwasm/compilation/aot_llvm_extra.cpp

@@ -44,6 +44,7 @@
 #if LLVM_VERSION_MAJOR >= 12
 #include <llvm/Analysis/AliasAnalysis.h>
 #endif
+#include <llvm/ProfileData/InstrProf.h>
 
 #include <cstring>
 #include "../aot/aot_runtime.h"
@@ -232,14 +233,26 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module)
     PTO.SLPVectorization = true;
     PTO.LoopUnrolling = true;
 
+    Optional<PGOOptions> PGO = None;
+    if (comp_ctx->enable_llvm_pgo) {
+        /* Disable static counter allocation for value profiler,
+           it will be allocated by runtime */
+        const char *argv[] = { "", "-vp-static-alloc=false" };
+        cl::ParseCommandLineOptions(2, argv);
+        PGO = PGOOptions("", "", "", PGOOptions::IRInstr);
+    }
+    else if (comp_ctx->use_prof_file) {
+        PGO = PGOOptions(comp_ctx->use_prof_file, "", "", PGOOptions::IRUse);
+    }
+
 #ifdef DEBUG_PASS
     PassInstrumentationCallbacks PIC;
-    PassBuilder PB(TM, PTO, None, &PIC);
+    PassBuilder PB(TM, PTO, PGO, &PIC);
 #else
 #if LLVM_VERSION_MAJOR == 12
-    PassBuilder PB(false, TM, PTO);
+    PassBuilder PB(false, TM, PTO, PGO);
 #else
-    PassBuilder PB(TM, PTO);
+    PassBuilder PB(TM, PTO, PGO);
 #endif
 #endif
 
@@ -334,8 +347,16 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module)
         FPM.addPass(SLPVectorizerPass());
         FPM.addPass(LoadStoreVectorizerPass());
 
+        if (comp_ctx->enable_llvm_pgo || comp_ctx->use_prof_file) {
+            LICMOptions licm_opt;
+            /* LICM pass: loop invariant code motion, attempting to remove
+               as much code from the body of a loop as possible. Experiments
+               show it is good to enable it when pgo is enabled. */
+            FPM.addPass(
+                createFunctionToLoopPassAdaptor(LICMPass(licm_opt), true));
+        }
+
         /*
-        FPM.addPass(createFunctionToLoopPassAdaptor(LICMPass()));
         FPM.addPass(createFunctionToLoopPassAdaptor(LoopRotatePass()));
         FPM.addPass(createFunctionToLoopPassAdaptor(SimpleLoopUnswitchPass()));
         */
@@ -344,9 +365,10 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module)
 
         if (!disable_llvm_lto) {
             /* Apply LTO for AOT mode */
-            if (comp_ctx->comp_data->func_count >= 10)
-                /* Adds the pre-link optimizations if the func count
-                   is large enough */
+            if (comp_ctx->comp_data->func_count >= 10
+                || comp_ctx->enable_llvm_pgo || comp_ctx->use_prof_file)
+                /* Add the pre-link optimizations if the func count
+                   is large enough or PGO is enabled */
                 MPM.addPass(PB.buildLTOPreLinkDefaultPipeline(OL));
             else
                 MPM.addPass(PB.buildLTODefaultPipeline(OL, NULL));
@@ -358,3 +380,34 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module)
 
     MPM.run(*M, MAM);
 }
+
+/**
+ * Collect the names of all AOT functions ("<AOT_FUNC_PREFIX><index>")
+ * into the PGO function name string produced by LLVM's
+ * collectPGOFuncNameStrings(), with compression requested.
+ *
+ * @param comp_ctx the compilation context, func_ctx_count gives the
+ *        number of function names to generate
+ * @param p_size returns the size in bytes of the returned buffer,
+ *        it is only written on success
+ *
+ * @return a buffer allocated with wasm_runtime_malloc holding the name
+ *         string (not NUL-terminated), or NULL on failure with the last
+ *         error set; presumably the caller releases it with
+ *         wasm_runtime_free
+ */
+char *
+aot_compress_aot_func_names(AOTCompContext *comp_ctx, uint32 *p_size)
+{
+    std::vector<std::string> NameStrs;
+    std::string Result;
+    char buf[32], *compressed_str;
+    uint32 compressed_str_len, i;
+
+    NameStrs.reserve(comp_ctx->func_ctx_count);
+    for (i = 0; i < comp_ctx->func_ctx_count; i++) {
+        /* i is unsigned, format it with %u */
+        snprintf(buf, sizeof(buf), "%s%u", AOT_FUNC_PREFIX, i);
+        NameStrs.push_back(buf);
+    }
+
+    /* An llvm::Error holding a failure must be consumed before it is
+       destroyed, otherwise LLVM aborts when error checking is enabled */
+    Error err = collectPGOFuncNameStrings(NameStrs, true, Result);
+    if (err) {
+        consumeError(std::move(err));
+        aot_set_last_error("collect pgo func name strings failed");
+        return NULL;
+    }
+
+    compressed_str_len = (uint32)Result.size();
+    if (!(compressed_str = (char *)wasm_runtime_malloc(compressed_str_len))) {
+        aot_set_last_error("allocate memory failed");
+        return NULL;
+    }
+
+    /* The result is binary data, copy it by length */
+    bh_memcpy_s(compressed_str, compressed_str_len, Result.data(),
+                compressed_str_len);
+    *p_size = compressed_str_len;
+    return compressed_str;
+}

+ 2 - 0
core/iwasm/include/aot_export.h

@@ -55,7 +55,9 @@ typedef struct AOTCompOption {
     bool enable_aux_stack_frame;
     bool disable_llvm_intrinsics;
     bool disable_llvm_lto;
+    bool enable_llvm_pgo;
     bool enable_stack_estimation;
+    char *use_prof_file;
     uint32_t opt_level;
     uint32_t size_level;
     uint32_t output_format;

+ 24 - 0
core/iwasm/include/wasm_export.h

@@ -1331,6 +1331,30 @@ WASM_RUNTIME_API_EXTERN uint32_t
 wasm_runtime_dump_call_stack_to_buf(wasm_exec_env_t exec_env, char *buf,
                                     uint32_t len);
 
+/**
+ * Get the size required to store the LLVM PGO profile data
+ *
+ * @param module_inst the WASM module instance
+ *
+ * @return size required to store the contents, 0 means error
+ */
+WASM_RUNTIME_API_EXTERN uint32_t
+wasm_runtime_get_pgo_prof_data_size(wasm_module_inst_t module_inst);
+
+/**
+ * Dump the LLVM PGO profile data to buffer
+ *
+ * @param module_inst the WASM module instance
+ * @param buf buffer to store the dumped content
+ * @param len length of the buffer
+ *
+ * @return bytes dumped to the buffer, 0 means error and data in buf
+ *         may be invalid
+ */
+WASM_RUNTIME_API_EXTERN uint32_t
+wasm_runtime_dump_pgo_prof_data_to_buf(wasm_module_inst_t module_inst,
+                                       char *buf, uint32_t len);
+
 /**
  * Get a custom section by name
  *

+ 1 - 0
core/shared/platform/include/platform_api_extension.h

@@ -130,6 +130,7 @@ os_thread_exit(void *retval);
 #define os_memory_order_release memory_order_release
 #define os_memory_order_seq_cst memory_order_seq_cst
 #define os_atomic_thread_fence atomic_thread_fence
+#define os_atomic_cmpxchg atomic_compare_exchange_strong
 #endif
 
 #endif /* end of os_atomic_thread_fence */

+ 57 - 0
product-mini/platforms/posix/main.c

@@ -97,6 +97,9 @@ print_help()
 #if WASM_ENABLE_DEBUG_INTERP != 0
     printf("  -g=ip:port               Set the debug sever address, default is debug disabled\n");
     printf("                             if port is 0, then a random port will be used\n");
+#endif
+#if WASM_ENABLE_STATIC_PGO != 0
+    printf("  --gen-prof-file=<path>   Generate LLVM PGO (Profile-Guided Optimization) profile file\n");
 #endif
     printf("  --version                Show version information\n");
     return 1;
@@ -413,6 +416,44 @@ moudle_destroyer(uint8 *buffer, uint32 size)
 static char global_heap_buf[WASM_GLOBAL_HEAP_SIZE] = { 0 };
 #endif
 
+#if WASM_ENABLE_STATIC_PGO != 0
+/**
+ * Dump the LLVM PGO profile data of a module instance into a raw
+ * profile file. Failures are reported with printf and the function
+ * simply returns.
+ *
+ * @param module_inst the module instance to get the profile data from
+ * @param path the path of the file to create, an existing file is
+ *        overwritten
+ */
+static void
+dump_pgo_prof_data(wasm_module_inst_t module_inst, const char *path)
+{
+    char *buf;
+    uint32 len;
+    FILE *file;
+    bool is_written;
+
+    if (!(len = wasm_runtime_get_pgo_prof_data_size(module_inst))) {
+        printf("failed to get LLVM PGO profile data size\n");
+        return;
+    }
+
+    if (!(buf = wasm_runtime_malloc(len))) {
+        printf("allocate memory failed\n");
+        return;
+    }
+
+    if (len != wasm_runtime_dump_pgo_prof_data_to_buf(module_inst, buf, len)) {
+        printf("failed to dump LLVM PGO profile data\n");
+        wasm_runtime_free(buf);
+        return;
+    }
+
+    if (!(file = fopen(path, "wb"))) {
+        printf("failed to create file %s\n", path);
+        wasm_runtime_free(buf);
+        return;
+    }
+
+    /* A short write or a failed flush on close would leave a truncated
+       profile file, so both results are checked before reporting success */
+    is_written = fwrite(buf, len, 1, file) == 1;
+    if (fclose(file) != 0)
+        is_written = false;
+
+    wasm_runtime_free(buf);
+
+    if (!is_written) {
+        printf("failed to write LLVM raw profile file %s\n", path);
+        return;
+    }
+
+    printf("LLVM raw profile file %s was generated.\n", path);
+}
+#endif
+
 int
 main(int argc, char *argv[])
 {
@@ -460,6 +501,9 @@ main(int argc, char *argv[])
     char *ip_addr = NULL;
     int instance_port = 0;
 #endif
+#if WASM_ENABLE_STATIC_PGO != 0
+    const char *gen_prof_file = NULL;
+#endif
 
     /* Process options. */
     for (argc--, argv++; argc > 0 && argv[0][0] == '-'; argc--, argv++) {
@@ -663,6 +707,13 @@ main(int argc, char *argv[])
                 return print_help();
             ip_addr = argv[0] + 3;
         }
+#endif
+#if WASM_ENABLE_STATIC_PGO != 0
+        else if (!strncmp(argv[0], "--gen-prof-file=", 16)) {
+            if (argv[0][16] == '\0')
+                return print_help();
+            gen_prof_file = argv[0] + 16;
+        }
 #endif
         else if (!strncmp(argv[0], "--version", 9)) {
             uint32 major, minor, patch;
@@ -826,6 +877,12 @@ main(int argc, char *argv[])
     }
 #endif
 
+#if WASM_ENABLE_STATIC_PGO != 0 && WASM_ENABLE_AOT != 0
+    if (get_package_type(wasm_file_buf, wasm_file_size) == Wasm_Module_AoT
+        && gen_prof_file)
+        dump_pgo_prof_data(wasm_module_inst, gen_prof_file);
+#endif
+
 #if WASM_ENABLE_DEBUG_INTERP != 0
 fail4:
 #endif

+ 62 - 0
tests/benchmarks/README.md

@@ -0,0 +1,62 @@
+# WAMR test benchmarks
+
+This folder contains test benchmarks for wamr.
+
+## Build and Run
+
+Refer to the `README.md` under each folder for how to build and run the benchmark.
+
+## Install `llvm-profdata`
+
+The tool `llvm-profdata` is used when running the `test_pgo.sh` script under the benchmark folder. There are two ways to install it:
+
+1. Refer to https://apt.llvm.org/, e.g. in Ubuntu 20.04, add the lines below to /etc/apt/sources.list
+
+```bash
+deb http://apt.llvm.org/focal/ llvm-toolchain-focal main
+deb-src http://apt.llvm.org/focal/ llvm-toolchain-focal main
+# 15
+deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main
+deb-src http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main
+```
+
+Then run `sudo apt update`, `sudo apt install llvm`. And after installing:
+
+```bash
+cd /usr/bin
+sudo ln -s llvm-profdata-15 llvm-profdata
+```
+
+2. Build manually
+
+```bash
+git clone --depth 1 --branch release/15.x https://github.com/llvm/llvm-project.git
+cd llvm-project
+mkdir build && cd build
+cmake ../llvm \
+    -DCMAKE_BUILD_TYPE:STRING="Release" \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+    -DLLVM_APPEND_VC_REV:BOOL=ON \
+    -DLLVM_BUILD_EXAMPLES:BOOL=OFF \
+    -DLLVM_BUILD_LLVM_DYLIB:BOOL=OFF \
+    -DLLVM_BUILD_TESTS:BOOL=OFF \
+    -DLLVM_CCACHE_BUILD:BOOL=ON \
+    -DLLVM_ENABLE_BINDINGS:BOOL=OFF \
+    -DLLVM_ENABLE_IDE:BOOL=OFF \
+    -DLLVM_ENABLE_LIBEDIT=OFF \
+    -DLLVM_ENABLE_TERMINFO:BOOL=OFF \
+    -DLLVM_ENABLE_ZLIB:BOOL=ON \
+    -DLLVM_INCLUDE_BENCHMARKS:BOOL=OFF \
+    -DLLVM_INCLUDE_DOCS:BOOL=OFF \
+    -DLLVM_INCLUDE_EXAMPLES:BOOL=OFF \
+    -DLLVM_INCLUDE_UTILS:BOOL=OFF \
+    -DLLVM_INCLUDE_TESTS:BOOL=OFF \
+    -DLLVM_BUILD_TESTS:BOOL=OFF \
+    -DLLVM_OPTIMIZED_TABLEGEN:BOOL=ON \
+    -DLLVM_ENABLE_LIBXML2:BOOL=OFF \
+    -DLLVM_TARGETS_TO_BUILD:STRING="X86" \
+    -DLLVM_INCLUDE_TOOLS:BOOL=ON \
+    -G'Ninja'
+ninja -j 8
+# tool `llvm-profdata` is generated under this folder.
+```

+ 2 - 0
tests/benchmarks/coremark/README.md

@@ -17,3 +17,5 @@ And then run `./build.sh` to build the source code, file `coremark.exe`, `corema
 # Running
 
 Run `./run.sh` to test the benchmark, the native mode, iwasm aot mode and iwasm interpreter mode will be tested respectively.
+
+Run `./test_pgo.sh` to test the benchmark with AOT static PGO (Profile-Guided Optimization) enabled. Please refer to [this section](../README.md#install-llvm-profdata) to install the `llvm-profdata` tool, and build `iwasm` with `cmake -DWAMR_BUILD_STATIC_PGO=1`.

+ 50 - 0
tests/benchmarks/coremark/test_pgo.sh

@@ -0,0 +1,50 @@
+#!/bin/sh
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Test coremark with AOT static PGO: build the plain and the instrumented
+# aot files, run the instrumented one to collect the raw profile, merge it
+# with llvm-profdata, rebuild with the profile, then run the native, the
+# plain aot and the PGO optimized aot versions for comparison.
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+
+IWASM="../../../product-mini/platforms/${PLATFORM}/build/iwasm"
+WAMRC="../../../wamr-compiler/build/wamrc"
+
+if [ ! -e "coremark.wasm" ]; then
+    echo "coremark.wasm doesn't exist, please run build.sh first"
+    # Report the failure to the caller instead of exiting with status 0
+    exit 1
+fi
+
+# Each compile/merge step below feeds the next one, so stop as soon as
+# one of them fails rather than benchmarking missing or stale files.
+
+echo ""
+echo "Compile coremark.wasm to coremark.aot .."
+${WAMRC} -o coremark.aot coremark.wasm || exit 1
+
+echo ""
+echo "Compile coremark.wasm to coremark_pgo.aot .."
+${WAMRC} --enable-llvm-pgo -o coremark_pgo.aot coremark.wasm || exit 1
+
+echo ""
+echo "Run coremark_pgo.aot to generate the raw profile data .."
+${IWASM} --gen-prof-file=coremark.profraw coremark_pgo.aot
+
+echo ""
+echo "Merge the raw profile data to coremark.profdata .."
+rm -f coremark.profdata && llvm-profdata merge -output=coremark.profdata coremark.profraw || exit 1
+
+echo ""
+echo "Compile coremark.wasm to coremark_opt.aot with the profile data .."
+${WAMRC} --use-prof-file=coremark.profdata -o coremark_opt.aot coremark.wasm || exit 1
+
+echo ""
+echo "Run the coremark native"
+./coremark.exe
+
+echo ""
+echo "Run the original aot file coremark.aot"
+${IWASM} coremark.aot
+
+echo ""
+echo "Run the PGO optimized aot file coremark_opt.aot"
+${IWASM} coremark_opt.aot
+
+# Show the profile data:
+# llvm-profdata show --all-functions --detailed-summary --binary-ids --counts \
+# --hot-func-list --memop-sizes --show-prof-sym-list coremark.profraw

+ 50 - 0
tests/benchmarks/dhrystone/test_pgo.sh

@@ -0,0 +1,50 @@
+#!/bin/sh
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Test dhrystone with AOT static PGO: build the plain and the instrumented
+# aot files, run the instrumented one to collect the raw profile, merge it
+# with llvm-profdata, rebuild with the profile, then run the native, the
+# plain aot and the PGO optimized aot versions for comparison.
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+
+IWASM="../../../product-mini/platforms/${PLATFORM}/build/iwasm"
+WAMRC="../../../wamr-compiler/build/wamrc"
+
+if [ ! -e "dhrystone.wasm" ]; then
+    echo "dhrystone.wasm doesn't exist, please run build.sh first"
+    # Report the failure to the caller instead of exiting with status 0
+    exit 1
+fi
+
+# Each compile/merge step below feeds the next one, so stop as soon as
+# one of them fails rather than benchmarking missing or stale files.
+
+echo ""
+echo "Compile dhrystone.wasm to dhrystone.aot .."
+${WAMRC} -o dhrystone.aot dhrystone.wasm || exit 1
+
+echo ""
+echo "Compile dhrystone.wasm to dhrystone_pgo.aot .."
+${WAMRC} --enable-llvm-pgo -o dhrystone_pgo.aot dhrystone.wasm || exit 1
+
+echo ""
+echo "Run dhrystone_pgo.aot to generate the raw profile data .."
+${IWASM} --gen-prof-file=dhrystone.profraw dhrystone_pgo.aot
+
+echo ""
+echo "Merge the raw profile data to dhrystone.profdata .."
+rm -f dhrystone.profdata && llvm-profdata merge -output=dhrystone.profdata dhrystone.profraw || exit 1
+
+echo ""
+echo "Compile dhrystone.wasm to dhrystone_opt.aot with the profile data .."
+${WAMRC} --use-prof-file=dhrystone.profdata -o dhrystone_opt.aot dhrystone.wasm || exit 1
+
+echo ""
+echo "Run the dhrystone native"
+./dhrystone_native
+
+echo ""
+echo "Run the original aot file dhrystone.aot"
+${IWASM} dhrystone.aot
+
+echo ""
+echo "Run the PGO optimized aot file dhrystone_opt.aot"
+${IWASM} dhrystone_opt.aot
+
+# Show the profile data:
+# llvm-profdata show --all-functions --detailed-summary --binary-ids --counts \
+# --hot-func-list --memop-sizes --show-prof-sym-list dhrystone.profraw

+ 2 - 0
tests/benchmarks/jetstream/README.md

@@ -27,3 +27,5 @@ And then run `./build.sh` to build the source code, the folder `out` will be cre
 # Running
 
 Run `./run_aot.sh` to test the benchmark, the native mode and iwasm aot mode will be tested for each workload, and the file `report.txt` will be generated.
+
+Run `./test_pgo.sh` to test the benchmark with AOT static PGO (Profile-Guided Optimization) enabled. Please refer to [this section](../README.md#install-llvm-profdata) to install the `llvm-profdata` tool, and build `iwasm` with `cmake -DWAMR_BUILD_STATIC_PGO=1`.

+ 87 - 0
tests/benchmarks/jetstream/test_pgo.sh

@@ -0,0 +1,87 @@
+#!/bin/bash
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+CUR_DIR=$PWD
+OUT_DIR=$CUR_DIR/out
+REPORT=$CUR_DIR/report.txt
+TIME=/usr/bin/time
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+IWASM_CMD=$CUR_DIR/../../../product-mini/platforms/${PLATFORM}/build/iwasm
+WAMRC_CMD=$CUR_DIR/../../../wamr-compiler/build/wamrc
+
+BENCH_NAME_MAX_LEN=20
+
+JETSTREAM_CASES="gcc-loops HashSet tsf float-mm quicksort"
+
+rm -f $REPORT
+touch $REPORT
+
+# Append the benchmark name $1 to $REPORT, left-aligned and padded with
+# spaces to BENCH_NAME_MAX_LEN columns (longer names are written as-is).
+function print_bench_name()
+{
+    local name=$1
+    # printf does the padding itself: no eval, no per-space echo
+    printf "%-${BENCH_NAME_MAX_LEN}s" "$name" >> "$REPORT"
+}
+
+# Build the plain, the instrumented and the PGO optimized aot file of
+# each case inside $OUT_DIR
+if ! pushd "$OUT_DIR" > /dev/null 2>&1; then
+    echo "failed to enter $OUT_DIR, please run build.sh first"
+    exit 1
+fi
+for t in $JETSTREAM_CASES
+do
+    if [ ! -e "${t}.wasm" ]; then
+        echo "${t}.wasm doesn't exist, please run build.sh first"
+        # Report the failure to the caller instead of exiting with status 0
+        exit 1
+    fi
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}.aot .."
+    ${WAMRC_CMD} -o ${t}.aot ${t}.wasm
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_pgo.aot .."
+    ${WAMRC_CMD} --enable-llvm-pgo -o ${t}_pgo.aot ${t}.wasm
+
+    echo ""
+    echo "Run ${t}_pgo.aot to generate the raw profile data .."
+    ${IWASM_CMD} --gen-prof-file=${t}.profraw --dir=. ${t}_pgo.aot
+
+    echo ""
+    echo "Merge the raw profile data to ${t}.profdata .."
+    rm -f ${t}.profdata && llvm-profdata merge -output=${t}.profdata ${t}.profraw
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_opt.aot with the profile data .."
+    ${WAMRC_CMD} --use-prof-file=${t}.profdata -o ${t}_opt.aot ${t}.wasm
+done
+popd > /dev/null 2>&1
+
+echo "Start to run cases, the result is written to report.txt"
+
+#run benchmarks
+cd $OUT_DIR
+echo -en "\t\t\t\t\t  native\tiwasm-aot\tiwasm-aot-pgo\n" >> $REPORT
+
+for t in $JETSTREAM_CASES
+do
+    print_bench_name $t
+
+    echo "run $t with native .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" ./${t}_native 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo "run $t with iwasm aot .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD --dir=. ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo "run $t with iwasm aot opt .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD --dir=. ${t}_opt.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo -en "\n" >> $REPORT
+done

+ 116 - 0
tests/benchmarks/libsodium/test_pgo.sh

@@ -0,0 +1,116 @@
+#!/bin/bash
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+libsodium_CASES="aead_aes256gcm2 aead_aes256gcm aead_chacha20poly13052 aead_chacha20poly1305 \
+                 aead_xchacha20poly1305 auth2 auth3 auth5 auth6 auth7 auth box2 box7 box8 \
+                 box_easy2 box_easy box_seal box_seed box chacha20 codecs core1 core2 core3 \
+                 core4 core5 core6 core_ed25519 core_ristretto255 ed25519_convert generichash2 \
+                 generichash3 generichash hash3 hash kdf keygen kx metamorphic misuse \
+                 onetimeauth2 onetimeauth7 onetimeauth pwhash_argon2id pwhash_argon2i \
+                 pwhash_scrypt_ll pwhash_scrypt randombytes scalarmult2 scalarmult5 \
+                 scalarmult6 scalarmult7 scalarmult8 scalarmult_ed25519 scalarmult_ristretto255 \
+                 scalarmult secretbox2 secretbox7 secretbox8 secretbox_easy2 secretbox_easy \
+                 secretbox secretstream shorthash sign siphashx24 sodium_core sodium_utils2 \
+                 sodium_utils stream2 stream3 stream4 stream verify1 xchacha20"
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+
+readonly OUT_DIR=$PWD/libsodium/zig-out/bin
+readonly REPORT=$PWD/report.txt
+readonly IWASM_CMD=$PWD/../../../product-mini/platforms/${PLATFORM}/build/iwasm
+readonly WAMRC_CMD=$PWD/../../../wamr-compiler/build/wamrc
+readonly TIME=/usr/bin/time
+
+BENCH_NAME_MAX_LEN=20
+
+rm -f $REPORT
+touch $REPORT
+
+# Append the benchmark name $1 to $REPORT, left-aligned and padded with
+# spaces to BENCH_NAME_MAX_LEN columns (longer names are written as-is).
+function print_bench_name()
+{
+    local name=$1
+    # printf does the padding itself: no eval, no per-space echo
+    printf "%-${BENCH_NAME_MAX_LEN}s" "$name" >> "$REPORT"
+}
+
+# Build the plain, the instrumented and the PGO optimized aot file of
+# each case inside $OUT_DIR
+if ! pushd "$OUT_DIR" > /dev/null 2>&1; then
+    echo "failed to enter $OUT_DIR, please run build.sh first"
+    exit 1
+fi
+for t in $libsodium_CASES
+do
+    if [ ! -e "${t}.wasm" ]; then
+        echo "${t}.wasm doesn't exist, please run build.sh first"
+        # Report the failure to the caller instead of exiting with status 0
+        exit 1
+    fi
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}.aot .."
+    ${WAMRC_CMD} -o ${t}.aot ${t}.wasm
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_pgo.aot .."
+    ${WAMRC_CMD} --enable-llvm-pgo -o ${t}_pgo.aot ${t}.wasm
+
+    echo ""
+    echo "Run ${t}_pgo.aot to generate the raw profile data .."
+    ${IWASM_CMD} --gen-prof-file=${t}.profraw --dir=. ${t}_pgo.aot
+
+    echo ""
+    echo "Merge the raw profile data to ${t}.profdata .."
+    rm -f ${t}.profdata && llvm-profdata merge -output=${t}.profdata ${t}.profraw
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_opt.aot with the profile data .."
+    ${WAMRC_CMD} --use-prof-file=${t}.profdata -o ${t}_opt.aot ${t}.wasm
+done
+
+# run benchmarks
+cd $OUT_DIR
+
+echo -en "\t\t\t\t\t\tnative\tiwasm-aot\tiwasm-aot-pgo\n" >> $REPORT
+
+for t in $libsodium_CASES
+do
+    print_bench_name $t
+
+    echo "run $t with native..."
+    echo -en "\t" >> $REPORT
+    if [[ $t != "sodium_utils2" ]]; then
+        ./${t} | awk '{printf "%-10.2f", $0/1000000.0}' >> $REPORT
+    else
+        # sodium_utils2 doesn't print the result,
+        # use time command to get result instead
+        $TIME -f "real-%e-time" ./${t} 2>&1 | grep "real-.*-time" |
+            awk -F '-' '{printf "%-10.2f", $2}' >> $REPORT
+    fi
+
+    echo "run $t with iwasm aot..."
+    echo -en "\t  \t" >> $REPORT
+    if [[ $t != "sodium_utils2" ]]; then
+        $IWASM_CMD ${t}.aot | awk '{printf "%-10.2f", $0/1000000.0}' >> $REPORT
+    else
+        # sodium_utils2 doesn't print the result,
+        # use time command to get result instead
+        $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" |
+            awk -F '-' '{printf "%-10.2f", $2}' >> $REPORT
+    fi
+
+    echo "run $t with iwasm aot opt..."
+    echo -en "\t  \t" >> $REPORT
+    if [[ $t != "sodium_utils2" ]]; then
+        $IWASM_CMD ${t}_opt.aot | awk '{printf "%-10.2f", $0/1000000.0}' >> $REPORT
+    else
+        # sodium_utils2 doesn't print the result,
+        # use time command to get result instead
+        $TIME -f "real-%e-time" $IWASM_CMD ${t}_opt.aot 2>&1 | grep "real-.*-time" |
+            awk -F '-' '{printf "%-10.2f", $2}' >> $REPORT
+    fi
+
+    echo -en "\n" >> $REPORT
+done
+

+ 90 - 0
tests/benchmarks/polybench/test_pgo.sh

@@ -0,0 +1,90 @@
+#!/bin/bash
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+CUR_DIR=$PWD
+OUT_DIR=$CUR_DIR/out
+REPORT=$CUR_DIR/report.txt
+TIME=/usr/bin/time
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+IWASM_CMD=$CUR_DIR/../../../product-mini/platforms/${PLATFORM}/build/iwasm
+WAMRC_CMD=$CUR_DIR/../../../wamr-compiler/build/wamrc
+
+BENCH_NAME_MAX_LEN=20
+
+POLYBENCH_CASES="2mm 3mm adi atax bicg cholesky correlation covariance \
+                 deriche doitgen durbin fdtd-2d floyd-warshall gemm gemver \
+                 gesummv gramschmidt heat-3d jacobi-1d jacobi-2d ludcmp lu \
+                 mvt nussinov seidel-2d symm syr2k syrk trisolv trmm"
+
+rm -f $REPORT
+touch $REPORT
+
+# Append the benchmark name $1 to $REPORT, left-aligned and padded with
+# spaces to BENCH_NAME_MAX_LEN columns (longer names are written as-is).
+function print_bench_name()
+{
+    local name=$1
+    # printf does the padding itself: no eval, no per-space echo
+    printf "%-${BENCH_NAME_MAX_LEN}s" "$name" >> "$REPORT"
+}
+
+# Build the plain, the instrumented and the PGO optimized aot file of
+# each case inside $OUT_DIR
+if ! pushd "$OUT_DIR" > /dev/null 2>&1; then
+    echo "failed to enter $OUT_DIR, please run build.sh first"
+    exit 1
+fi
+for t in $POLYBENCH_CASES
+do
+    if [ ! -e "${t}.wasm" ]; then
+        echo "${t}.wasm doesn't exist, please run build.sh first"
+        # Report the failure to the caller instead of exiting with status 0
+        exit 1
+    fi
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}.aot .."
+    ${WAMRC_CMD} -o ${t}.aot ${t}.wasm
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_pgo.aot .."
+    ${WAMRC_CMD} --enable-llvm-pgo -o ${t}_pgo.aot ${t}.wasm
+
+    echo ""
+    echo "Run ${t}_pgo.aot to generate the raw profile data .."
+    ${IWASM_CMD} --gen-prof-file=${t}.profraw --dir=. ${t}_pgo.aot
+
+    echo ""
+    echo "Merge the raw profile data to ${t}.profdata .."
+    rm -f ${t}.profdata && llvm-profdata merge -output=${t}.profdata ${t}.profraw
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_opt.aot with the profile data .."
+    ${WAMRC_CMD} --use-prof-file=${t}.profdata -o ${t}_opt.aot ${t}.wasm
+done
+popd > /dev/null 2>&1
+
+echo "Start to run cases, the result is written to report.txt"
+
+#run benchmarks
+cd $OUT_DIR
+echo -en "\t\t\t\t\t  native\tiwasm-aot\tiwasm-aot-pgo\n" >> $REPORT
+
+for t in $POLYBENCH_CASES
+do
+    print_bench_name $t
+
+    echo "run $t with native .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" ./${t}_native 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo "run $t with iwasm aot .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo "run $t with iwasm aot opt .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD ${t}_opt.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo -en "\n" >> $REPORT
+done

+ 2 - 0
tests/benchmarks/sightglass/README.md

@@ -19,3 +19,5 @@ And then run `./build.sh` to build the source code, the folder `out` will be cre
 Run `./run_aot.sh` to test the benchmark, the native mode and iwasm aot mode will be tested for each workload, and the file `report.txt` will be generated.
 
 Run `./run_interp.sh` to test the benchmark, the native mode and iwasm interpreter mode will be tested for each workload, and the file `report.txt` will be generated.
+
+Run `./test_pgo.sh` to test the benchmark with AOT static PGO (Profile-Guided Optimization) enabled. Please refer to [this section](../README.md#install-llvm-profdata) to install the `llvm-profdata` tool, and build `iwasm` with `cmake -DWAMR_BUILD_STATIC_PGO=1`.

+ 89 - 0
tests/benchmarks/sightglass/test_pgo.sh

@@ -0,0 +1,89 @@
+#!/bin/bash
+
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+CUR_DIR=$PWD
+OUT_DIR=$CUR_DIR/out
+REPORT=$CUR_DIR/report.txt
+TIME=/usr/bin/time
+
+PLATFORM=$(uname -s | tr A-Z a-z)
+IWASM_CMD=$CUR_DIR/../../../product-mini/platforms/${PLATFORM}/build/iwasm
+WAMRC_CMD=$CUR_DIR/../../../wamr-compiler/build/wamrc
+
+BENCH_NAME_MAX_LEN=20
+
+SHOOTOUT_CASES="base64 fib2 gimli heapsort matrix memmove nestedloop \
+                nestedloop2 nestedloop3 random seqhash sieve strchr \
+                switch2"
+
+rm -f $REPORT
+touch $REPORT
+
+# Append the benchmark name to $REPORT, space-padded to BENCH_NAME_MAX_LEN.
+# $1: benchmark name; names of that length or longer are written unpadded.
+function print_bench_name()
+{
+    local name=$1
+    local pad=$(( BENCH_NAME_MAX_LEN - ${#name} ))
+
+    echo -en "$name" >> "$REPORT"
+    # printf's %*s emits exactly $pad spaces, so no eval/brace expansion.
+    if [ "$pad" -gt 0 ]; then printf '%*s' "$pad" '' >> "$REPORT"; fi
+}
+
+# For every workload build three AOT files in $OUT_DIR: a plain one, a
+# PGO-instrumented one, and one optimized with the collected profile.
+if ! pushd "$OUT_DIR" > /dev/null 2>&1; then
+    echo "$OUT_DIR doesn't exist, please run build.sh first"
+    exit 1
+fi
+for t in $SHOOTOUT_CASES
+do
+    if [ ! -e "${t}.wasm" ]; then
+        echo "${t}.wasm doesn't exist, please run build.sh first"
+        # Report the missing input as a failure instead of exiting with 0
+        exit 1
+    fi
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}.aot .."
+    "${WAMRC_CMD}" -o "${t}.aot" "${t}.wasm"
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_pgo.aot .."
+    "${WAMRC_CMD}" --enable-llvm-pgo -o "${t}_pgo.aot" "${t}.wasm"
+
+    echo ""
+    echo "Run ${t}_pgo.aot to generate the raw profile data .."
+    "${IWASM_CMD}" --gen-prof-file="${t}.profraw" --dir=. "${t}_pgo.aot"
+
+    echo ""
+    echo "Merge the raw profile data to ${t}.profdata .."
+    rm -f "${t}.profdata" && llvm-profdata merge -output="${t}.profdata" "${t}.profraw"
+
+    echo ""
+    echo "Compile ${t}.wasm to ${t}_opt.aot with the profile data .."
+    "${WAMRC_CMD}" --use-prof-file="${t}.profdata" -o "${t}_opt.aot" "${t}.wasm"
+done
+popd > /dev/null 2>&1
+
+echo "Start to run cases, the result is written to report.txt"
+
+#run benchmarks
+cd $OUT_DIR
+echo -en "\t\t\t\t\t  native\tiwasm-aot\tiwasm-aot-pgo\n" >> $REPORT
+
+for t in $SHOOTOUT_CASES
+do
+    print_bench_name $t
+
+    echo "run $t with native .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" ./${t}_native 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo "run $t with iwasm aot .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo "run $t with iwasm aot opt .."
+    echo -en "\t" >> $REPORT
+    $TIME -f "real-%e-time" $IWASM_CMD ${t}_opt.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT
+
+    echo -en "\n" >> $REPORT
+done

+ 10 - 0
wamr-compiler/main.c

@@ -65,6 +65,8 @@ print_help()
     printf("  --enable-indirect-mode    Enalbe call function through symbol table but not direct call\n");
     printf("  --disable-llvm-intrinsics Disable the LLVM built-in intrinsics\n");
     printf("  --disable-llvm-lto        Disable the LLVM link time optimization\n");
+    printf("  --enable-llvm-pgo         Enable LLVM PGO (Profile-Guided Optimization)\n");
+    printf("  --use-prof-file=<file>    Use profile file collected by LLVM PGO (Profile-Guided Optimization)\n");
     printf("  --enable-segue[=<flags>]  Enable using segment register GS as the base address of linear memory,\n");
     printf("                            only available on linux/linux-sgx x86-64, which may improve performance,\n");
     printf("                            flags can be: i32.load, i64.load, f32.load, f64.load, v128.load,\n");
@@ -329,6 +331,14 @@ main(int argc, char *argv[])
         else if (!strcmp(argv[0], "--disable-llvm-lto")) {
             option.disable_llvm_lto = true;
         }
+        else if (!strcmp(argv[0], "--enable-llvm-pgo")) {
+            option.enable_llvm_pgo = true;
+        }
+        else if (!strncmp(argv[0], "--use-prof-file=", 16)) {
+            if (argv[0][16] == '\0')
+                PRINT_HELP_AND_EXIT();
+            option.use_prof_file = argv[0] + 16;
+        }
         else if (!strcmp(argv[0], "--enable-segue")) {
             /* all flags are enabled */
             option.segue_flags = 0x1F1F;