/*
 * Copyright (c) 2019 Nuclei Limited. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef __CORE_FEATURE_DSP__
#define __CORE_FEATURE_DSP__

/*!
 * @file     core_feature_dsp.h
 * @brief    DSP feature API header file for Nuclei N/NX Core
 */
/*
 * DSP Feature Configuration Macro:
 * 1. __DSP_PRESENT:  Define whether Digital Signal Processing Unit(DSP) is present or not
 *   * 0: Not present
 *   * 1: Present
 */
#ifdef __cplusplus
 extern "C" {
#endif

#include "core_feature_base.h"

#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1)

#if defined(__INC_INTRINSIC_API) && (__INC_INTRINSIC_API == 1)
#if defined(__zcc__)
#include <rvp_intrinsic.h>
#else
#if !defined(__ICCRISCV__) && !defined(__llvm__)
#include <rvp_intrinsic.h>
#endif
#endif
#endif

#ifndef __ICCRISCV__
/* ###########################  CPU SIMD DSP Intrinsic Functions ########################### */
/**
 * \defgroup NMSIS_Core_DSP_Intrinsic   Intrinsic Functions for SIMD Instructions
 * \ingroup  NMSIS_Core
 * \brief    Functions that generate RISC-V DSP SIMD instructions.
 * \details
 *
 * The following functions generate specified RISC-V SIMD instructions that cannot be directly accessed by compiler.
 * * **DSP ISA Extension Instruction Summary**
 *   + **Shorthand Definitions**
 *     - r.H == r.H1: r[31:16], r.L == r.H0: r[15:0]
 *     - r.B3: r[31:24], r.B2: r[23:16], r.B1: r[15:8], r.B0: r[7:0]
 *     - r.B[x]: r[(x*8+7):(x*8+0)]
 *     - r.H[x]: r[(x*16+15):(x*16+0)]
 *     - r.W[x]: r[(x*32+31):(x*32+0)]
 *     - r[xU]: the upper 32-bit of a 64-bit number; xU represents the GPR number that contains this upper part 32-bit value.
 *     - r[xL]: the lower 32-bit of a 64-bit number; xL represents the GPR number that contains this lower part 32-bit value.
 *     - r[xU].r[xL]: a 64-bit number that is formed from a pair of GPRs.
 *     - s>>: signed arithmetic right shift
 *     - u>>: unsigned logical right shift
 *     - SAT.Qn(): Saturate to the range of [-2^n, 2^n-1], if saturation happens, set PSW.OV.
 *     - SAT.Um(): Saturate to the range of [0, 2^m-1], if saturation happens, set PSW.OV.
 *     - RUND(): Indicate `rounding`, i.e., add 1 to the most significant discarded bit for right shift or MSW-type multiplication instructions.
 *     - Sign or Zero Extending functions:
 *       - SEm(data): Sign-Extend data to m-bit.
 *       - ZEm(data): Zero-Extend data to m-bit.
 *     - ABS(x): Calculate the absolute value of `x`.
 *     - CONCAT(x,y): Concatenate `x` and `y` to form a value.
 *     - u<: Unsigned less than comparison.
 *     - u<=: Unsigned less than or equal comparison.
 *     - u>: Unsigned greater than comparison.
 *     - s*: Signed multiplication.
 *     - u*: Unsigned multiplication.
 *
 *   @{
 */
/** @} */ /* End of Doxygen Group NMSIS_Core_DSP_Intrinsic */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS      SIMD Data Processing Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic
 * \brief    SIMD Data Processing Instructions
 * \details
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB      SIMD 16-bit Add/Subtract Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
 * \brief    SIMD 16-bit Add/Subtract Instructions
 * \details
 * Based on the combination of the types of the two 16-bit arithmetic operations, the SIMD 16-bit
 * add/subtract instructions can be classified into 6 main categories: Addition (two 16-bit addition),
 * Subtraction (two 16-bit subtraction), Crossed Add & Sub (one addition and one subtraction), and
 * Crossed Sub & Add (one subtraction and one addition), Straight Add & Sub (one addition and one
 * subtraction), and Straight Sub & Add (one subtraction and one addition).
 * Based on the way of how an overflow condition is handled, the SIMD 16-bit add/subtract
 * instructions can be classified into 5 groups: Wrap-around (dropping overflow), Signed Halving
 * (keeping overflow by dropping 1 LSB bit), Unsigned Halving, Signed Saturation (clipping overflow),
 * and Unsigned Saturation.
 * Together, there are 30 SIMD 16-bit add/subtract instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB      SIMD 8-bit Addition & Subtraction Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
 * \brief    SIMD 8-bit Addition & Subtraction Instructions
 * \details
 * Based on the types of the four 8-bit arithmetic operations, the SIMD 8-bit add/subtract instructions
 * can be classified into 2 main categories: Addition (four 8-bit addition), and Subtraction (four 8-bit
 * subtraction).
 * Based on the way of how an overflow condition is handled for signed or unsigned operation, the
 * SIMD 8-bit add/subtract instructions can be classified into 5 groups: Wrap-around (dropping
 * overflow), Signed Halving (keeping overflow by dropping 1 LSB bit), Unsigned Halving, Signed
 * Saturation (clipping overflow), and Unsigned Saturation.
 * Together, there are 10 SIMD 8-bit add/subtract instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT      SIMD 16-bit Shift Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
 * \brief    SIMD 16-bit Shift Instructions
 * \details
 * there are 14 SIMD 16-bit shift instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT      SIMD 8-bit Shift Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
 * \brief    SIMD 8-bit Shift Instructions
 * \details
 *  there are 14 SIMD 8-bit shift instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP      SIMD 16-bit Compare Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
 * \brief    SIMD 16-bit Compare Instructions
 * \details
 *  there are 5 SIMD 16-bit Compare instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP      SIMD 8-bit Compare Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
 * \brief    SIMD 8-bit Compare Instructions
 * \details
 *  there are 5  SIMD 8-bit Compare instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY      SIMD 16-bit Multiply Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
 * \brief    SIMD 16-bit Multiply Instructions
 * \details
 * there are 6 SIMD 16-bit Multiply instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY      SIMD 8-bit Multiply Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
 * \brief    SIMD 8-bit Multiply Instructions
 * \details
 *  there are 6 SIMD 8-bit Multiply instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC      SIMD 16-bit Miscellaneous Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
 * \brief    SIMD 16-bit Miscellaneous Instructions
 * \details
 *  there are 10 SIMD 16-bit Misc instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC      SIMD 8-bit Miscellaneous Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
 * \brief    SIMD 8-bit Miscellaneous Instructions
 * \details
 *  there are 10 SIMD 8-bit Miscellaneous instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK      SIMD 8-bit Unpacking Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
 * \brief    SIMD 8-bit Unpacking Instructions
 * \details
 *  there are 8 SIMD 8-bit Unpacking instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD      Non-SIMD Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic
 * \brief    Non-SIMD Instructions
 * \details
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU      Non-SIMD Q15 saturation ALU Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
 * \brief    Non-SIMD Q15 saturation ALU Instructions
 * \details
 * there are 7 Non-SIMD Q15 saturation ALU Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU      Non-SIMD Q31 saturation ALU Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
 * \brief    Non-SIMD Q31 saturation ALU Instructions
 * \details
 *  there are Non-SIMD Q31 saturation ALU Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION      32-bit Computation Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
 * \brief    32-bit Computation Instructions
 * \details
 * there are 8 32-bit Computation Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_OV_FLAG_SC      OV (Overflow) flag Set/Clear Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
 * \brief    OV (Overflow) flag Set/Clear Instructions
 * \details
 * The following table lists the user instructions related to Overflow (OV) flag manipulation. there are 2 OV (Overflow) flag Set/Clear Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC      Non-SIMD Miscellaneous Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
 * \brief    Non-SIMD Miscellaneous Instructions
 * \details
 * There are 13 Miscellaneous Instructions here.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS      Partial-SIMD Data Processing Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic
 * \brief    Partial-SIMD Data Processing Instructions
 * \details
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK      SIMD 16-bit Packing Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
 * \brief    SIMD 16-bit Packing Instructions
 * \details
 * there are 4 SIMD 16-bit Packing Instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC      Signed MSW 32x32 Multiply and Add Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
 * \brief    Signed MSW 32x32 Multiply and Add Instructions
 * \details
 *  there are 8 Signed MSW 32x32 Multiply and Add Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC      Signed MSW 32x16 Multiply and Add Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
 * \brief    Signed MSW 32x16 Multiply and Add Instructions
 * \details
 * there are 15 Signed MSW 32x16 Multiply and Add Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB      Signed 16-bit Multiply 32-bit Add/Subtract Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
 * \brief    Signed 16-bit Multiply 32-bit Add/Subtract Instructions
 * \details
 *  there are 18 Signed 16-bit Multiply 32-bit Add/Subtract Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB      Signed 16-bit Multiply 64-bit Add/Subtract Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
 * \brief    Signed 16-bit Multiply 64-bit Add/Subtract Instructions
 * \details
 *  there is Signed 16-bit Multiply 64-bit Add/Subtract Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC      Partial-SIMD Miscellaneous Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
 * \brief    Partial-SIMD Miscellaneous Instructions
 * \details
 *  there are  7 Partial-SIMD Miscellaneous Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD      8-bit Multiply with 32-bit Add Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
 * \brief    8-bit Multiply with 32-bit Add Instructions
 * \details
 * there are  3 8-bit Multiply with 32-bit Add Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_64B_PROFILE      64-bit Profile Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic
 * \brief    64-bit Profile Instructions
 * \details
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB      64-bit Addition & Subtraction Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_64B_PROFILE
 * \brief    64-bit Addition & Subtraction Instructions
 * \details
 * there are 10 64-bit Addition & Subtraction Instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB      32-bit Multiply with 64-bit Add/Subtract Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_64B_PROFILE
 * \brief    32-bit Multiply with 64-bit Add/Subtract Instructions
 * \details
 *  there are 32-bit Multiply 64-bit Add/Subtract Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB      Signed 16-bit Multiply with 64-bit Add/Subtract Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_64B_PROFILE
 * \brief    Signed 16-bit Multiply with 64-bit Add/Subtract Instructions
 * \details
 * there are 10 Signed 16-bit Multiply with 64-bit Add/Subtract Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_ONLY      RV64 Only Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic
 * \brief    RV64 Only Instructions
 * \details
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB      (RV64 Only) SIMD 32-bit Add/Subtract Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
 * \brief    (RV64 Only) SIMD 32-bit Add/Subtract Instructions
 * \details
 * The following tables list instructions that are only present in RV64.
 * There are 30 SIMD 32-bit addition or subtraction instructions.there are 4 SIMD16-bit Packing Instructions.
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT      (RV64 Only) SIMD 32-bit Shift Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
 * \brief    (RV64 Only) SIMD 32-bit Shift Instructions
 * \details
 *  there are 14 (RV64 Only) SIMD 32-bit Shift Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC      (RV64 Only) SIMD 32-bit Miscellaneous Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
 * \brief    (RV64 Only) SIMD 32-bit Miscellaneous Instructions
 * \details
 * there are 5  (RV64 Only) SIMD 32-bit Miscellaneous Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT      (RV64 Only) SIMD Q15 Saturating Multiply Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
 * \brief    (RV64 Only) SIMD Q15 Saturating Multiply Instructions
 * \details
 *  there are 9 (RV64 Only) SIMD Q15 saturating Multiply Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT      (RV64 Only) 32-bit Multiply Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
 * \brief    (RV64 Only) 32-bit Multiply Instructions
 * \details
 *  there are 3 (RV64 Only) 32-bit Multiply Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD      (RV64 Only) 32-bit Multiply & Add Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
 * \brief    (RV64 Only) 32-bit Multiply & Add Instructions
 * \details
 *  there are  3 (RV64 Only) 32-bit Multiply & Add Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC      (RV64 Only) 32-bit Parallel Multiply & Add Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
 * \brief    (RV64 Only) 32-bit Parallel Multiply & Add Instructions
 * \details
 * there are 12 (RV64 Only) 32-bit Parallel Multiply & Add Instructions
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_NON_SIMD_32B_SHIFT      (RV64 Only) Non-SIMD 32-bit Shift Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
 * \brief    (RV64 Only) Non-SIMD 32-bit Shift Instructions
 * \details
 *  there is 1 (RV64 Only) Non-SIMD 32-bit Shift Instruction
 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK      32-bit Packing Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
 * \brief    32-bit Packing Instructions
 * \details
 *  There are four 32-bit packing instructions here
 */

/* ===== Inline Function Start for 3.1. ADD8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
 * \brief ADD8 (SIMD 8-bit Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * ADD8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit integer element additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 8-bit integer elements in Rs1 with the 8-bit integer elements
 * in Rs2, and then writes the 8-bit element results to Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned addition.
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = Rs1.B[x] + Rs2.B[x];
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_ADD8(unsigned long a, unsigned long b)
{
    unsigned long rd;    /* destination register value (Rd) */

    /* Operand mapping: %0 = rd (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("add8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.1. ADD8 ===== */

/* ===== Inline Function Start for 3.2. ADD16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief ADD16 (SIMD 16-bit Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * ADD16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit integer element additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 16-bit integer elements in Rs1 with the 16-bit integer
 * elements in Rs2, and then writes the 16-bit element results to Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned addition.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = Rs1.H[x] + Rs2.H[x];
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_ADD16(unsigned long a, unsigned long b)
{
    unsigned long rd;    /* destination register value (Rd) */

    /* Operand mapping: %0 = rd (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("add16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.2. ADD16 ===== */

/* ===== Inline Function Start for 3.3. ADD64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
 * \brief ADD64 (64-bit Addition)
 * \details
 * **Type**: 64-bit Profile
 *
 * **Syntax**:\n
 * ~~~
 * ADD64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Add two 64-bit signed or unsigned integers.
 *
 * **RV32 Description**:\n
 * This instruction adds the 64-bit integer of an even/odd pair of registers specified
 * by Rs1(4,1) with the 64-bit integer of an even/odd pair of registers specified by Rs2(4,1), and then
 * writes the 64-bit result to an even/odd pair of registers specified by Rd(4,1).
 * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
 * pair includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction has the same behavior as the ADD instruction in RV64I.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned addition.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 *  t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 *  a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
 *  b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
 *  R[t_H].R[t_L] = R[a_H].R[a_L] + R[b_H].R[b_L];
 * RV64:
 *  Rd = Rs1 + Rs2;
 * ~~~
 *
 * \param [in]  a    unsigned long long type of value stored in a
 * \param [in]  b    unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_ADD64(unsigned long long a, unsigned long long b)
{
    unsigned long long sum;    /* 64-bit destination value (Rd) */

    /* Operand mapping: %0 = sum (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("add64 %0, %1, %2"
                   : "=r"(sum)
                   : "r"(a), "r"(b));
    return sum;
}
/* ===== Inline Function End for 3.3. ADD64 ===== */

/* ===== Inline Function Start for 3.4. AVE ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
 * \brief AVE (Average with Rounding)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * AVE Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Calculate the average of the contents of two general registers.
 *
 * **Description**:\n
 * This instruction calculates the average value of two signed integers stored in Rs1 and
 * Rs2, rounds up a half-integer result to the nearest integer, and writes the result to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Sum = CONCAT(Rs1[MSB],Rs1[MSB:0]) + CONCAT(Rs2[MSB],Rs2[MSB:0]) + 1;
 * Rd = Sum[(MSB+1):1];
 * for RV32: MSB=31,
 * for RV64: MSB=63
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_AVE(long a, long b)
{
    long avg;    /* signed destination value (Rd) */

    /* Operand mapping: %0 = avg (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("ave %0, %1, %2"
                   : "=r"(avg)
                   : "r"(a), "r"(b));
    return avg;
}
/* ===== Inline Function End for 3.4. AVE ===== */

/* ===== Inline Function Start for 3.5. BITREV ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
 * \brief BITREV (Bit Reverse)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * BITREV Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Reverse the bit positions of the source operand within a specified width starting from bit
 * 0. The reversed width is a variable from a GPR.
 *
 * **Description**:\n
 * This instruction reverses the bit positions of the content of Rs1. The reversed bit width
 * is calculated as Rs2[4:0]+1 (RV32) or Rs2[5:0]+1 (RV64). The upper bits beyond the reversed width
 * are filled with zeros. After the bit reverse operation, the result is written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * msb = Rs2[4:0]; (for RV32)
 * msb = Rs2[5:0]; (for RV64)
 * rev[0:msb] = Rs1[msb:0];
 * Rd = ZE(rev[msb:0]);
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_BITREV(unsigned long a, unsigned long b)
{
    unsigned long rd;    /* destination register value (Rd) */

    /* Operand mapping: %0 = rd (output), %1 = a (Rs1, data), %2 = b (Rs2, width selector). */
    __ASM volatile("bitrev %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.5. BITREV ===== */

/* ===== Inline Function Start for 3.6. BITREVI ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
 * \brief BITREVI (Bit Reverse Immediate)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * (RV32) BITREVI Rd, Rs1, imm[4:0]
 * (RV64) BITREVI Rd, Rs1, imm[5:0]
 * ~~~
 *
 * **Purpose**:\n
 * Reverse the bit positions of the source operand within a specified width starting from bit
 * 0. The reversed width is an immediate value.
 *
 * **Description**:\n
 * This instruction reverses the bit positions of the content of Rs1. The reversed bit width
 * is calculated as imm[4:0]+1 (RV32) or imm[5:0]+1 (RV64). The upper bits beyond the reversed width
 * are filled with zeros. After the bit reverse operation, the result is written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * msb = imm[4:0]; (RV32)
 * msb = imm[5:0]; (RV64)
 * rev[0:msb] = Rs1[msb:0];
 * Rd = ZE(rev[msb:0]);
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned immediate (compile-time constant); the reversed bit width is b+1
 * \return value stored in unsigned long type
 */
#define __RV_BITREVI(a, b)    \
    ({    \
        /* Evaluate `a` exactly once and before any other local is declared, */    \
        /* so an argument expression that mentions a caller variable named   */    \
        /* `result` is not captured by a temporary of this macro.            */    \
        unsigned long __a = (unsigned long)(a);    \
        unsigned long __result;    \
        /* %0 = __result (Rd), %1 = __a (Rs1), %2 = b (immediate operand).   */    \
        /* NOTE(review): GCC documents the RISC-V K constraint as a 5-bit    */    \
        /* unsigned immediate, so b presumably must be a constant in 0..31   */    \
        /* even on RV64 - confirm against the toolchain in use.              */    \
        __ASM volatile("bitrevi %0, %1, %2" : "=r"(__result) : "r"(__a), "K"(b));    \
        __result;    \
    })
/* ===== Inline Function End for 3.6. BITREVI ===== */

/* ===== Inline Function Start for 3.7. BPICK ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
 * \brief BPICK (Bit-wise Pick)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * BPICK Rd, Rs1, Rs2, Rc
 * ~~~
 *
 * **Purpose**:\n
 * Select from two source operands based on a bit mask in the third operand.
 *
 * **Description**:\n
 * This instruction selects individual bits from Rs1 or Rs2, based on the bit mask value in
 * Rc. If a bit in Rc is 1, the corresponding bit is from Rs1; otherwise, the corresponding bit is from Rs2.
 * The selection results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd[x] = Rc[x]? Rs1[x] : Rs2[x];
 * for RV32, x=31...0
 * for RV64, x=63...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \param [in]  c    unsigned long type of value stored in c
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_BPICK(unsigned long a, unsigned long b, unsigned long c)
{
    unsigned long picked;    /* destination register value (Rd) */

    /* Operand mapping: %0 = picked (output), %1 = a (Rs1), %2 = b (Rs2), %3 = c (Rc, bit mask). */
    __ASM volatile("bpick %0, %1, %2, %3"
                   : "=r"(picked)
                   : "r"(a), "r"(b), "r"(c));
    return picked;
}
/* ===== Inline Function End for 3.7. BPICK ===== */

/* ===== Inline Function Start for 3.8. CLROV ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_OV_FLAG_SC
 * \brief CLROV (Clear OV flag)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * CLROV # pseudo mnemonic
 * ~~~
 *
 * **Purpose**:\n
 * This pseudo instruction is an alias to `CSRRCI x0, ucode, 1` instruction.
 *
 *
 */
__STATIC_FORCEINLINE void __RV_CLROV(void)
{
    /* Emit the operand-less `clrov` pseudo instruction; nothing is read or returned. */
    __ASM volatile("clrov ");
}
/* ===== Inline Function End for 3.8. CLROV ===== */

/* ===== Inline Function Start for 3.9. CLRS8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
 * \brief CLRS8 (SIMD 8-bit Count Leading Redundant Sign)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * CLRS8 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Count the number of redundant sign bits of the 8-bit elements of a general register.
 *
 * **Description**:\n
 * Starting from the bits next to the sign bits of the 8-bit elements of Rs1, this instruction
 * counts the number of redundant sign bits and writes the result to the corresponding 8-bit elements
 * of Rd.
 *
 * **Operations**:\n
 * ~~~
 * snum[x] = Rs1.B[x];
 * cnt[x] = 0;
 * for (i = 6 to 0) {
 *   if (snum[x](i) == snum[x](7)) {
 *     cnt[x] = cnt[x] + 1;
 *   } else {
 *     break;
 *   }
 * }
 * Rd.B[x] = cnt[x];
 * for RV32: x=3...0
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CLRS8(unsigned long a)
{
    unsigned long counts;    /* destination register value (Rd) */

    /* Operand mapping: %0 = counts (output), %1 = a (Rs1). */
    __ASM volatile("clrs8 %0, %1"
                   : "=r"(counts)
                   : "r"(a));
    return counts;
}
/* ===== Inline Function End for 3.9. CLRS8 ===== */

/* ===== Inline Function Start for 3.10. CLRS16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
 * \brief CLRS16 (SIMD 16-bit Count Leading Redundant Sign)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * CLRS16 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Count the number of redundant sign bits of the 16-bit elements of a general register.
 *
 * **Description**:\n
 * Starting from the bits next to the sign bits of the 16-bit elements of Rs1, this
 * instruction counts the number of redundant sign bits and writes the result to the corresponding 16-
 * bit elements of Rd.
 *
 * **Operations**:\n
 * ~~~
 * snum[x] = Rs1.H[x];
 * cnt[x] = 0;
 * for (i = 14 to 0) {
 *   if (snum[x](i) == snum[x](15)) {
 *     cnt[x] = cnt[x] + 1;
 *   } else {
 *     break;
 *   }
 * }
 * Rd.H[x] = cnt[x];
 * for RV32: x=1...0
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CLRS16(unsigned long a)
{
    unsigned long counts;    /* destination register value (Rd) */

    /* Operand mapping: %0 = counts (output), %1 = a (Rs1). */
    __ASM volatile("clrs16 %0, %1"
                   : "=r"(counts)
                   : "r"(a));
    return counts;
}
/* ===== Inline Function End for 3.10. CLRS16 ===== */

/* ===== Inline Function Start for 3.11. CLRS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
 * \brief CLRS32 (SIMD 32-bit Count Leading Redundant Sign)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * CLRS32 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Count the number of redundant sign bits of the 32-bit elements of a general register.
 *
 * **Description**:\n
 * Starting from the bits next to the sign bits of the 32-bit elements of Rs1, this
 * instruction counts the number of redundant sign bits and writes the result to the corresponding 32-
 * bit elements of Rd.
 *
 * **Operations**:\n
 * ~~~
 * snum[x] = Rs1.W[x];
 * cnt[x] = 0;
 * for (i = 30 to 0) {
 *   if (snum[x](i) == snum[x](31)) {
 *     cnt[x] = cnt[x] + 1;
 *   } else {
 *     break;
 *   }
 * }
 * Rd.W[x] = cnt[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CLRS32(unsigned long a)
{
    unsigned long counts;    /* destination register value (Rd) */

    /* Operand mapping: %0 = counts (output), %1 = a (Rs1). */
    __ASM volatile("clrs32 %0, %1"
                   : "=r"(counts)
                   : "r"(a));
    return counts;
}
/* ===== Inline Function End for 3.11. CLRS32 ===== */

/* ===== Inline Function Start for 3.12. CLO8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
 * \brief CLO8 (SIMD 8-bit Count Leading One)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * CLO8 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Count the number of leading one bits of the 8-bit elements of a general register.
 *
 * **Description**:\n
 * Starting from the most significant bits of the 8-bit elements of Rs1, this instruction
 * counts the number of leading one bits and writes the results to the corresponding 8-bit elements of
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * snum[x] = Rs1.B[x];
 * cnt[x] = 0;
 * for (i = 7 to 0) {
 *   if (snum[x](i) == 1) {
 *     cnt[x] = cnt[x] + 1;
 *   } else {
 *     break;
 *   }
 * }
 * Rd.B[x] = cnt[x];
 * for RV32: x=3...0
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CLO8(unsigned long a)
{
    unsigned long counts;    /* destination register value (Rd) */

    /* Operand mapping: %0 = counts (output), %1 = a (Rs1). */
    __ASM volatile("clo8 %0, %1"
                   : "=r"(counts)
                   : "r"(a));
    return counts;
}
/* ===== Inline Function End for 3.12. CLO8 ===== */

/* ===== Inline Function Start for 3.13. CLO16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
 * \brief CLO16 (SIMD 16-bit Count Leading One)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * CLO16 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Count the number of leading one bits of the 16-bit elements of a general register.
 *
 * **Description**:\n
 * Starting from the most significant bits of the 16-bit elements of Rs1, this instruction
 * counts the number of leading one bits and writes the results to the corresponding 16-bit elements
 * of Rd.
 *
 * **Operations**:\n
 * ~~~
 * snum[x] = Rs1.H[x];
 * cnt[x] = 0;
 * for (i = 15 to 0) {
 *   if (snum[x](i) == 1) {
 *     cnt[x] = cnt[x] + 1;
 *   } else {
 *     break;
 *   }
 * }
 * Rd.H[x] = cnt[x];
 * for RV32: x=1...0
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CLO16(unsigned long a)
{
    unsigned long counts;    /* destination register value (Rd) */

    /* Operand mapping: %0 = counts (output), %1 = a (Rs1). */
    __ASM volatile("clo16 %0, %1"
                   : "=r"(counts)
                   : "r"(a));
    return counts;
}
/* ===== Inline Function End for 3.13. CLO16 ===== */

/* ===== Inline Function Start for 3.14. CLO32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
 * \brief CLO32 (SIMD 32-bit Count Leading One)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * CLO32 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Count the number of leading one bits of the 32-bit elements of a general register.
 *
 * **Description**:\n
 * Starting from the most significant bits of the 32-bit elements of Rs1, this instruction
 * counts the number of leading one bits and writes the results to the corresponding 32-bit elements
 * of Rd.
 *
 * **Operations**:\n
 * ~~~
 * snum[x] = Rs1.W[x];
 * cnt[x] = 0;
 * for (i = 31 to 0) {
 *   if (snum[x](i) == 1) {
 *     cnt[x] = cnt[x] + 1;
 *   } else {
 *     break;
 *   }
 * }
 * Rd.W[x] = cnt[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CLO32(unsigned long a)
{
    unsigned long rd; /* value of the destination register written by clo32 */

    /* Single clo32 instruction: %0 is the output register, %1 holds a. */
    __ASM volatile("clo32 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for 3.14. CLO32 ===== */

/* ===== Inline Function Start for 3.15. CLZ8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
 * \brief CLZ8 (SIMD 8-bit Count Leading Zero)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * CLZ8 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Count the number of leading zero bits of the 8-bit elements of a general register.
 *
 * **Description**:\n
 * Starting from the most significant bits of the 8-bit elements of Rs1, this instruction
 * counts the number of leading zero bits and writes the results to the corresponding 8-bit elements of
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * snum[x] = Rs1.B[x];
 * cnt[x] = 0;
 * for (i = 7 to 0) {
 *   if (snum[x](i) == 0) {
 *     cnt[x] = cnt[x] + 1;
 *   } else {
 *     break;
 *   }
 * }
 * Rd.B[x] = cnt[x];
 * for RV32: x=3...0
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CLZ8(unsigned long a)
{
    unsigned long rd; /* value of the destination register written by clz8 */

    /* Single clz8 instruction: %0 is the output register, %1 holds a. */
    __ASM volatile("clz8 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for 3.15. CLZ8 ===== */

/* ===== Inline Function Start for 3.16. CLZ16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
 * \brief CLZ16 (SIMD 16-bit Count Leading Zero)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * CLZ16 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Count the number of leading zero bits of the 16-bit elements of a general register.
 *
 * **Description**:\n
 * Starting from the most significant bits of the 16-bit elements of Rs1, this instruction
 * counts the number of leading zero bits and writes the results to the corresponding 16-bit elements
 * of Rd.
 *
 * **Operations**:\n
 * ~~~
 * snum[x] = Rs1.H[x];
 * cnt[x] = 0;
 * for (i = 15 to 0) {
 *   if (snum[x](i) == 0) {
 *     cnt[x] = cnt[x] + 1;
 *   } else {
 *     break;
 *   }
 * }
 * Rd.H[x] = cnt[x];
 * for RV32: x=1...0
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CLZ16(unsigned long a)
{
    unsigned long rd; /* value of the destination register written by clz16 */

    /* Single clz16 instruction: %0 is the output register, %1 holds a. */
    __ASM volatile("clz16 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for 3.16. CLZ16 ===== */

/* ===== Inline Function Start for 3.17. CLZ32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
 * \brief CLZ32 (SIMD 32-bit Count Leading Zero)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * CLZ32 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Count the number of leading zero bits of the 32-bit elements of a general register.
 *
 * **Description**:\n
 * Starting from the most significant bits of the 32-bit elements of Rs1, this instruction
 * counts the number of leading zero bits and writes the results to the corresponding 32-bit elements
 * of Rd.
 *
 * **Operations**:\n
 * ~~~
 * snum[x] = Rs1.W[x];
 * cnt[x] = 0;
 * for (i = 31 to 0) {
 *   if (snum[x](i) == 0) {
 *     cnt[x] = cnt[x] + 1;
 *   } else {
 *     break;
 *   }
 * }
 * Rd.W[x] = cnt[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CLZ32(unsigned long a)
{
    unsigned long rd; /* value of the destination register written by clz32 */

    /* Single clz32 instruction: %0 is the output register, %1 holds a. */
    __ASM volatile("clz32 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for 3.17. CLZ32 ===== */

/* ===== Inline Function Start for 3.18. CMPEQ8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
 * \brief CMPEQ8 (SIMD 8-bit Integer Compare Equal)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * CMPEQ8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit integer elements equal comparisons simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 8-bit integer elements in Rs1 with the 8-bit integer
 * elements in Rs2 to see if they are equal. If they are equal, the result is 0xFF; otherwise, the result is
 * 0x0. The 8-bit element comparison results are written to Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned numbers.
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = (Rs1.B[x] == Rs2.B[x])? 0xff : 0x0;
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CMPEQ8(unsigned long a, unsigned long b)
{
    unsigned long rd; /* value of the destination register written by cmpeq8 */

    /* Single cmpeq8 instruction: %0 is the output, %1 holds a, %2 holds b. */
    __ASM volatile("cmpeq8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.18. CMPEQ8 ===== */

/* ===== Inline Function Start for 3.19. CMPEQ16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
 * \brief CMPEQ16 (SIMD 16-bit Integer Compare Equal)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * CMPEQ16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit integer elements equal comparisons simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 16-bit integer elements in Rs1 with the 16-bit integer
 * elements in Rs2 to see if they are equal. If they are equal, the result is 0xFFFF; otherwise, the result
 * is 0x0. The 16-bit element comparison results are written to Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned numbers.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] == Rs2.H[x])? 0xffff : 0x0;
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CMPEQ16(unsigned long a, unsigned long b)
{
    unsigned long rd; /* value of the destination register written by cmpeq16 */

    /* Single cmpeq16 instruction: %0 is the output, %1 holds a, %2 holds b. */
    __ASM volatile("cmpeq16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.19. CMPEQ16 ===== */

/* ===== Inline Function Start for 3.20. CRAS16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief CRAS16 (SIMD 16-bit Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * CRAS16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit integer element addition and 16-bit integer element subtraction in a 32-bit
 * chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
 *
 * **Description**:\n
 * This instruction adds the 16-bit integer element in [31:16] of 32-bit chunks in Rs1 with
 * the 16-bit integer element in [15:0] of 32-bit chunks in Rs2, and writes the result to [31:16] of 32-bit
 * chunks in Rd; at the same time, it subtracts the 16-bit integer element in [31:16] of 32-bit chunks in
 * Rs2 from the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to [15:0] of 32-
 * bit chunks in Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned operations.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = Rs1.W[x][31:16] + Rs2.W[x][15:0];
 * Rd.W[x][15:0] = Rs1.W[x][15:0] - Rs2.W[x][31:16];
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CRAS16(unsigned long a, unsigned long b)
{
    unsigned long rd; /* value of the destination register written by cras16 */

    /* Single cras16 instruction: %0 is the output, %1 holds a, %2 holds b. */
    __ASM volatile("cras16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.20. CRAS16 ===== */

/* ===== Inline Function Start for 3.21. CRSA16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief CRSA16 (SIMD 16-bit Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * CRSA16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit integer element subtraction and 16-bit integer element addition in a 32-bit
 * chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit integer element in [15:0] of 32-bit chunks in Rs2
 * from the 16-bit integer element in [31:16] of 32-bit chunks in Rs1, and writes the result to [31:16] of
 * 32-bit chunks in Rd; at the same time, it adds the 16-bit integer element in [31:16] of 32-bit chunks
 * in Rs2 with the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to
 * [15:0] of 32-bit chunks in Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned operations.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = Rs1.W[x][31:16] - Rs2.W[x][15:0];
 * Rd.W[x][15:0] = Rs1.W[x][15:0] + Rs2.W[x][31:16];
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CRSA16(unsigned long a, unsigned long b)
{
    unsigned long rd; /* value of the destination register written by crsa16 */

    /* Single crsa16 instruction: %0 is the output, %1 holds a, %2 holds b. */
    __ASM volatile("crsa16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.21. CRSA16 ===== */

/* ===== Inline Function Start for 3.22. INSB ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
 * \brief INSB (Insert Byte)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * (RV32) INSB Rd, Rs1, imm[1:0]
 * (RV64) INSB Rd, Rs1, imm[2:0]
 * ~~~
 *
 * **Purpose**:\n
 * Insert byte 0 of a 32-bit or 64-bit register into one of the byte elements of another register.
 *
 * **Description**:\n
 * This instruction inserts byte 0 of Rs1 into byte `imm[1:0]` (RV32) or `imm[2:0]` (RV64)
 * of Rd.
 *
 * **Operations**:\n
 * ~~~
 * bpos = imm[1:0]; (RV32)
 * bpos = imm[2:0]; (RV64)
 * Rd.B[bpos] = Rs1.B[0]
 * ~~~
 *
 * \param [in]  t    unsigned long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    immediate byte position: imm[1:0] on RV32, imm[2:0] on RV64 (must be a compile-time constant)
 * \return value stored in unsigned long type
 */
#define __RV_INSB(t, a, b)    \
    ({    \
        unsigned long __t = (unsigned long)(t);    \
        unsigned long __a = (unsigned long)(a);    \
        __ASM volatile("insb %0, %1, %2" : "+r"(__t) : "r"(__a), "K"(b));    \
        __t;    \
    })
/* ===== Inline Function End for 3.22. INSB ===== */

/* ===== Inline Function Start for 3.23. KABS8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
 * \brief KABS8 (SIMD 8-bit Saturating Absolute)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KABS8 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Get the absolute value of 8-bit signed integer elements simultaneously.
 *
 * **Description**:\n
 * This instruction calculates the absolute value of 8-bit signed integer elements stored
 * in Rs1 and writes the element results to Rd. If the input number is 0x80, this instruction generates
 * 0x7f as the output and sets the OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.B[x];
 * if (src == 0x80) {
 *   src = 0x7f;
 *   OV = 1;
 * } else if (src[7] == 1) {
 *   src = -src;
 * }
 * Rd.B[x] = src;
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KABS8(unsigned long a)
{
    unsigned long rd; /* value of the destination register written by kabs8 */

    /* Single kabs8 instruction: %0 is the output register, %1 holds a. */
    __ASM volatile("kabs8 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for 3.23. KABS8 ===== */

/* ===== Inline Function Start for 3.24. KABS16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
 * \brief KABS16 (SIMD 16-bit Saturating Absolute)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KABS16 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Get the absolute value of 16-bit signed integer elements simultaneously.
 *
 * **Description**:\n
 * This instruction calculates the absolute value of 16-bit signed integer elements stored
 * in Rs1 and writes the element results to Rd. If the input number is 0x8000, this instruction
 * generates 0x7fff as the output and sets the OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.H[x];
 * if (src == 0x8000) {
 *   src = 0x7fff;
 *   OV = 1;
 * } else if (src[15] == 1) {
 *   src = -src;
 * }
 * Rd.H[x] = src;
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KABS16(unsigned long a)
{
    unsigned long rd; /* value of the destination register written by kabs16 */

    /* Single kabs16 instruction: %0 is the output register, %1 holds a. */
    __ASM volatile("kabs16 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for 3.24. KABS16 ===== */

/* ===== Inline Function Start for 3.25. KABSW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief KABSW (Scalar 32-bit Absolute Value with Saturation)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KABSW Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Get the absolute value of a signed 32-bit integer in a general register.
 *
 * **Description**:\n
 * This instruction calculates the absolute value of a signed 32-bit integer stored in Rs1.
 * The result is sign-extended (for RV64) and written to Rd. This instruction with the minimum
 * negative integer input of 0x80000000 will produce a saturated output of maximum positive integer
 * of 0x7fffffff and the OV flag will be set to 1.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs1.W[0] >= 0) {
 *   res = Rs1.W[0];
 * } else {
 *   If (Rs1.W[0] == 0x80000000) {
 *     res = 0x7fffffff;
 *     OV = 1;
 *   } else {
 *     res = -Rs1.W[0];
 *   }
 * }
 * Rd = SE32(res);
 * ~~~
 *
 * \param [in]  a    signed long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KABSW(signed long a)
{
    unsigned long rd; /* value of the destination register written by kabsw */

    /* Single kabsw instruction: %0 is the output register, %1 holds a. */
    __ASM volatile("kabsw %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for 3.25. KABSW ===== */

/* ===== Inline Function Start for 3.26. KADD8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
 * \brief KADD8 (SIMD 8-bit Signed Saturating Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KADD8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit signed integer element saturating additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 8-bit signed integer elements in Rs1 with the 8-bit signed
 * integer elements in Rs2. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <= 2^7-1), they
 * are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.B[x] + Rs2.B[x];
 * if (res[x] > 127) {
 *   res[x] = 127;
 *   OV = 1;
 * } else if (res[x] < -128) {
 *   res[x] = -128;
 *   OV = 1;
 * }
 * Rd.B[x] = res[x];
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KADD8(unsigned long a, unsigned long b)
{
    unsigned long rd; /* value of the destination register written by kadd8 */

    /* Single kadd8 instruction: %0 is the output, %1 holds a, %2 holds b. */
    __ASM volatile("kadd8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.26. KADD8 ===== */

/* ===== Inline Function Start for 3.27. KADD16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief KADD16 (SIMD 16-bit Signed Saturating Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KADD16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element saturating additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 16-bit signed integer elements in Rs1 with the 16-bit signed
 * integer elements in Rs2. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= 2^15-1),
 * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.H[x] + Rs2.H[x];
 * if (res[x] > 32767) {
 *   res[x] = 32767;
 *   OV = 1;
 * } else if (res[x] < -32768) {
 *   res[x] = -32768;
 *   OV = 1;
 * }
 * Rd.H[x] = res[x];
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KADD16(unsigned long a, unsigned long b)
{
    unsigned long rd; /* value of the destination register written by kadd16 */

    /* Single kadd16 instruction: %0 is the output, %1 holds a, %2 holds b. */
    __ASM volatile("kadd16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.27. KADD16 ===== */

/* ===== Inline Function Start for 3.28. KADD64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
 * \brief KADD64 (64-bit Signed Saturating Addition)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * KADD64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Add two 64-bit signed integers. The result is saturated to the Q63 range.
 *
 * **RV32 Description**:\n
 * This instruction adds the 64-bit signed integer of an even/odd pair of registers
 * specified by Rs1(4,1) with the 64-bit signed integer of an even/odd pair of registers specified by
 * Rs2(4,1). If the 64-bit result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the
 * range and the OV bit is set to 1. The saturated result is written to an even/odd pair of registers
 * specified by Rd(4,1).
 * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
 * pair includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction adds the 64-bit signed integer in Rs1 with the 64-bit signed
 * integer in Rs2. If the result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the
 * range and the OV bit is set to 1. The saturated result is written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 *  t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 *  a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
 *  b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
 *  result = R[a_H].R[a_L] + R[b_H].R[b_L];
 *  if (result > (2^63)-1) {
 *    result = (2^63)-1; OV = 1;
 *  } else if (result < -2^63) {
 *    result = -2^63; OV = 1;
 *  }
 *  R[t_H].R[t_L] = result;
 * RV64:
 *  result = Rs1 + Rs2;
 *  if (result > (2^63)-1) {
 *    result = (2^63)-1; OV = 1;
 *  } else if (result < -2^63) {
 *    result = -2^63; OV = 1;
 *  }
 *  Rd = result;
 * ~~~
 *
 * \param [in]  a    long long type of value stored in a
 * \param [in]  b    long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_KADD64(long long a, long long b)
{
    long long rd; /* 64-bit destination operand written by kadd64 */

    /* Single kadd64 instruction: %0 is the output, %1 holds a, %2 holds b. */
    __ASM volatile("kadd64 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.28. KADD64 ===== */

/* ===== Inline Function Start for 3.29. KADDH ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
 * \brief KADDH (Signed Addition with Q15 Saturation)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KADDH Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Add the signed lower 32-bit content of two registers with Q15 saturation.
 *
 * **Description**:\n
 * The signed lower 32-bit content of Rs1 is added with the signed lower 32-bit content of
 * Rs2. And the result is saturated to the 16-bit signed integer range of [-2^15, 2^15-1] and then sign-
 * extended and written to Rd. If saturation happens, this instruction sets the OV flag.
 *
 * **Operations**:\n
 * ~~~
 * tmp = Rs1.W[0] + Rs2.W[0];
 * if (tmp > 32767) {
 *   res = 32767;
 *   OV = 1;
 * } else if (tmp < -32768) {
 *   res = -32768;
 *   OV = 1;
 * } else {
 *   res = tmp;
 * }
 * Rd = SE(res[15:0]);
 * ~~~
 *
 * \param [in]  a    int type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KADDH(int a, int b)
{
    long rd; /* value of the destination register written by kaddh */

    /* Single kaddh instruction: %0 is the output, %1 holds a, %2 holds b. */
    __ASM volatile("kaddh %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.29. KADDH ===== */

/* ===== Inline Function Start for 3.30. KADDW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief KADDW (Signed Addition with Q31 Saturation)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KADDW Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Add the lower 32-bit signed content of two registers with Q31 saturation.
 *
 * **Description**:\n
 * The lower 32-bit signed content of Rs1 is added with the lower 32-bit signed content of
 * Rs2. And the result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1] and then sign-
 * extended and written to Rd. If saturation happens, this instruction sets the OV flag.
 *
 * **Operations**:\n
 * ~~~
 * tmp = Rs1.W[0] + Rs2.W[0];
 * if (tmp > (2^31)-1) {
 *   res = (2^31)-1;
 *   OV = 1;
 * } else if (tmp < -2^31) {
 *   res = -2^31;
 *   OV = 1;
 * } else {
 *   res = tmp;
 * }
 * Rd = res[31:0]; // RV32
 * Rd = SE(res[31:0]) // RV64
 * ~~~
 *
 * \param [in]  a    int type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KADDW(int a, int b)
{
    long rd; /* value of the destination register written by kaddw */

    /* Single kaddw instruction: %0 is the output, %1 holds a, %2 holds b. */
    __ASM volatile("kaddw %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.30. KADDW ===== */

/* ===== Inline Function Start for 3.31. KCRAS16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief KCRAS16 (SIMD 16-bit Signed Saturating Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KCRAS16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element saturating addition and 16-bit signed integer element
 * saturating subtraction in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-
 * bit chunks.
 *
 * **Description**:\n
 * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in
 * Rs1 with the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2; at the same time, it
 * subtracts the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2 from the 16-bit signed
 * integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number
 * range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated
 * results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit chunks in Rd for
 * subtraction.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[x][31:16] + Rs2.W[x][15:0];
 * res2 = Rs1.W[x][15:0] - Rs2.W[x][31:16];
 * for (res in [res1, res2]) {
 *   if (res > (2^15)-1) {
 *     res = (2^15)-1;
 *     OV = 1;
 *   } else if (res < -2^15) {
 *     res = -2^15;
 *     OV = 1;
 *   }
 * }
 * Rd.W[x][31:16] = res1;
 * Rd.W[x][15:0] = res2;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KCRAS16(unsigned long a, unsigned long b)
{
    unsigned long rd; /* value of the destination register written by kcras16 */

    /* Single kcras16 instruction: %0 is the output, %1 holds a, %2 holds b. */
    __ASM volatile("kcras16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.31. KCRAS16 ===== */

/* ===== Inline Function Start for 3.32. KCRSA16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief KCRSA16 (SIMD 16-bit Signed Saturating Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KCRSA16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element saturating subtraction and 16-bit signed integer element
 * saturating addition in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit
 * chunks.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit signed integer element in [15:0] of 32-bit chunks
 * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1; at the same time, it
 * adds the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2 with the 16-bit signed
 * integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number
 * range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated
 * results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of 32-bit chunks in Rd
 * for addition.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[x][31:16] - Rs2.W[x][15:0];
 * res2 = Rs1.W[x][15:0] + Rs2.W[x][31:16];
 * for (res in [res1, res2]) {
 *   if (res > (2^15)-1) {
 *     res = (2^15)-1;
 *     OV = 1;
 *   } else if (res < -2^15) {
 *     res = -2^15;
 *     OV = 1;
 *   }
 * }
 * Rd.W[x][31:16] = res1;
 * Rd.W[x][15:0] = res2;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KCRSA16(unsigned long a, unsigned long b)
{
    unsigned long rd; /* value of the destination register written by kcrsa16 */

    /* Single kcrsa16 instruction: %0 is the output, %1 holds a, %2 holds b. */
    __ASM volatile("kcrsa16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.32. KCRSA16 ===== */

/* ===== Inline Function Start for 3.33.1. KDMBB ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief KDMBB (Signed Saturating Double Multiply B16 x B16)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KDMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the lower 32-bit chunk in registers and then double and saturate the Q31 result. The result is
 * written into the destination register for RV32 or sign-extended to 64-bits and written into the
 * destination register for RV64. If saturation happens, an overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
 * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
 * doubled and saturated into a Q31 value. The Q31 value is then written into Rd (sign-extended in
 * RV64). When both the two Q15 inputs are 0x8000, saturation will happen. The result will be
 * saturated to 0x7FFFFFFF and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMBB
 * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMBT
 * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMTT
 * If (0x8000 != aop | 0x8000 != bop) {
 *   Mresult = aop * bop;
 *   resQ31 = Mresult << 1;
 *   Rd = resQ31; // RV32
 *   Rd = SE(resQ31); // RV64
 * } else {
 *   resQ31 = 0x7FFFFFFF;
 *   Rd = resQ31; // RV32
 *   Rd = SE(resQ31); // RV64
 *   OV = 1;
 * }
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KDMBB(unsigned int a, unsigned int b)
{
    long rd; /* value of the destination register written by kdmbb */

    /* Single kdmbb instruction: %0 is the output, %1 holds a, %2 holds b. */
    __ASM volatile("kdmbb %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.33.1. KDMBB ===== */

/* ===== Inline Function Start for 3.33.2. KDMBT ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief KDMBT (Signed Saturating Double Multiply B16 x T16)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KDMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the lower 32-bit chunk in registers and then double and saturate the Q31 result. The result is
 * written into the destination register for RV32 or sign-extended to 64-bits and written into the
 * destination register for RV64. If saturation happens, an overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
 * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
 * doubled and saturated into a Q31 value. The Q31 value is then written into Rd (sign-extended in
 * RV64). When both the two Q15 inputs are 0x8000, saturation will happen. The result will be
 * saturated to 0x7FFFFFFF and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMBB
 * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMBT
 * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMTT
 * If (0x8000 != aop | 0x8000 != bop) {
 *   Mresult = aop * bop;
 *   resQ31 = Mresult << 1;
 *   Rd = resQ31; // RV32
 *   Rd = SE(resQ31); // RV64
 * } else {
 *   resQ31 = 0x7FFFFFFF;
 *   Rd = resQ31; // RV32
 *   Rd = SE(resQ31); // RV64
 *   OV = 1;
 * }
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KDMBT(unsigned int a, unsigned int b)
{
    long rd; /* value of the destination register written by kdmbt */

    /* Single kdmbt instruction: %0 is the output, %1 holds a, %2 holds b. */
    __ASM volatile("kdmbt %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.33.2. KDMBT ===== */

/* ===== Inline Function Start for 3.33.3. KDMTT ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief KDMTT (Signed Saturating Double Multiply T16 x T16)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KDMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the lower 32-bit chunk in registers and then double and saturate the Q31 result. The result is
 * written into the destination register for RV32 or sign-extended to 64-bits and written into the
 * destination register for RV64. If saturation happens, an overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
 * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
 * doubled and saturated into a Q31 value. The Q31 value is then written into Rd (sign-extended in
 * RV64). When both the two Q15 inputs are 0x8000, saturation will happen. The result will be
 * saturated to 0x7FFFFFFF and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMBB
 * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMBT
 * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMTT
 * If (0x8000 != aop | 0x8000 != bop) {
 *   Mresult = aop * bop;
 *   resQ31 = Mresult << 1;
 *   Rd = resQ31; // RV32
 *   Rd = SE(resQ31); // RV64
 * } else {
 *   resQ31 = 0x7FFFFFFF;
 *   Rd = resQ31; // RV32
 *   Rd = SE(resQ31); // RV64
 *   OV = 1;
 * }
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KDMTT(unsigned int a, unsigned int b)
{
    long rd;    /* filled in by the instruction through operand %0 */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("kdmtt %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.33.3. KDMTT ===== */

/* ===== Inline Function Start for 3.34.1. KDMABB ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief KDMABB (Signed Saturating Double Multiply Addition B16 x B16)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KDMAxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the lower 32-bit chunk in registers and then double and saturate the Q31 result, add the result
 * with the sign-extended lower 32-bit chunk destination register and write the saturated addition
 * result into the destination register. If saturation happens, an overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
 * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
 * doubled and saturated into a Q31 value. The Q31 value is then added with the content of Rd. If the
 * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
 * the OV flag is set to 1. The result after saturation is written to Rd.
 * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
 * set.
 *
 * **Operations**:\n
 * ~~~
 * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMABB
 * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMABT
 * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMATT
 * If (0x8000 != aop | 0x8000 != bop) {
 *   Mresult = aop * bop;
 *   resQ31 = Mresult << 1;
 * } else {
 *   resQ31 = 0x7FFFFFFF;
 *   OV = 1;
 * }
 * resadd = Rd + resQ31; // RV32
 * resadd = Rd.W[0] + resQ31; // RV64
 * if (resadd > (2^31)-1) {
 *   resadd = (2^31)-1;
 *   OV = 1;
 * } else if (resadd < -2^31) {
 *   resadd = -2^31;
 *   OV = 1;
 * }
 * Rd = resadd; // RV32
 * Rd = SE(resadd); // RV64
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KDMABB(long t, unsigned int a, unsigned int b)
{
    long acc = t;   /* "+r": operand %0 is both read and written by the instruction */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("kdmabb %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.34.1. KDMABB ===== */

/* ===== Inline Function Start for 3.34.2. KDMABT ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief KDMABT (Signed Saturating Double Multiply Addition B16 x T16)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KDMAxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the lower 32-bit chunk in registers and then double and saturate the Q31 result, add the result
 * with the sign-extended lower 32-bit chunk destination register and write the saturated addition
 * result into the destination register. If saturation happens, an overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
 * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
 * doubled and saturated into a Q31 value. The Q31 value is then added with the content of Rd. If the
 * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
 * the OV flag is set to 1. The result after saturation is written to Rd.
 * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
 * set.
 *
 * **Operations**:\n
 * ~~~
 * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMABB
 * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMABT
 * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMATT
 * If (0x8000 != aop | 0x8000 != bop) {
 *   Mresult = aop * bop;
 *   resQ31 = Mresult << 1;
 * } else {
 *   resQ31 = 0x7FFFFFFF;
 *   OV = 1;
 * }
 * resadd = Rd + resQ31; // RV32
 * resadd = Rd.W[0] + resQ31; // RV64
 * if (resadd > (2^31)-1) {
 *   resadd = (2^31)-1;
 *   OV = 1;
 * } else if (resadd < -2^31) {
 *   resadd = -2^31;
 *   OV = 1;
 * }
 * Rd = resadd; // RV32
 * Rd = SE(resadd); // RV64
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KDMABT(long t, unsigned int a, unsigned int b)
{
    long acc = t;   /* "+r": operand %0 is both read and written by the instruction */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("kdmabt %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.34.2. KDMABT ===== */

/* ===== Inline Function Start for 3.34.3. KDMATT ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief KDMATT (Signed Saturating Double Multiply Addition T16 x T16)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KDMAxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the lower 32-bit chunk in registers and then double and saturate the Q31 result, add the result
 * with the sign-extended lower 32-bit chunk destination register and write the saturated addition
 * result into the destination register. If saturation happens, an overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
 * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
 * doubled and saturated into a Q31 value. The Q31 value is then added with the content of Rd. If the
 * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
 * the OV flag is set to 1. The result after saturation is written to Rd.
 * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
 * set.
 *
 * **Operations**:\n
 * ~~~
 * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMABB
 * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMABT
 * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMATT
 * If (0x8000 != aop | 0x8000 != bop) {
 *   Mresult = aop * bop;
 *   resQ31 = Mresult << 1;
 * } else {
 *   resQ31 = 0x7FFFFFFF;
 *   OV = 1;
 * }
 * resadd = Rd + resQ31; // RV32
 * resadd = Rd.W[0] + resQ31; // RV64
 * if (resadd > (2^31)-1) {
 *   resadd = (2^31)-1;
 *   OV = 1;
 * } else if (resadd < -2^31) {
 *   resadd = -2^31;
 *   OV = 1;
 * }
 * Rd = resadd; // RV32
 * Rd = SE(resadd); // RV64
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KDMATT(long t, unsigned int a, unsigned int b)
{
    long acc = t;   /* "+r": operand %0 is both read and written by the instruction */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("kdmatt %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.34.3. KDMATT ===== */

/* ===== Inline Function Start for 3.35.1. KHM8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
 * \brief KHM8 (SIMD Signed Saturating Q7 Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KHM8 Rd, Rs1, Rs2
 * KHMX8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do Q7xQ7 element multiplications simultaneously. The Q14 results are then reduced to Q7
 * numbers again.
 *
 * **Description**:\n
 * For the `KHM8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1
 * with the top 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
 * content of 16-bit chunks in Rs1 with the bottom 8-bit Q7 content of 16-bit chunks in Rs2.
 * For the `KHMX8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1 with the
 * bottom 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
 * content of 16-bit chunks in Rs1 with the top 8-bit Q7 content of 16-bit chunks in Rs2.
 * The Q14 results are then right-shifted 7-bits and saturated into Q7 values. The Q7 results are then
 * written into Rd. When both the two Q7 inputs of a multiplication are 0x80, saturation will happen.
 * The result will be saturated to 0x7F and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * if (is `KHM8`) {
 *   op1t = Rs1.B[x+1]; op2t = Rs2.B[x+1]; // top
 *   op1b = Rs1.B[x]; op2b = Rs2.B[x]; // bottom
 * } else if (is `KHMX8`) {
 *   op1t = Rs1.B[x+1]; op2t = Rs2.B[x]; // Rs1 top
 *   op1b = Rs1.B[x]; op2b = Rs2.B[x+1]; // Rs1 bottom
 * }
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   if (0x80 != aop | 0x80 != bop) {
 *     res = (aop s* bop) >> 7;
 *   } else {
 *     res= 0x7F;
 *     OV = 1;
 *   }
 * }
 * Rd.H[x/2] = concat(rest, resb);
 * for RV32, x=0,2
 * for RV64, x=0,2,4,6
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KHM8(unsigned long a, unsigned long b)
{
    unsigned long rd;   /* filled in by the instruction through operand %0 */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("khm8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.35.1. KHM8 ===== */

/* ===== Inline Function Start for 3.35.2. KHMX8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
 * \brief KHMX8 (SIMD Signed Saturating Crossed Q7 Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KHM8 Rd, Rs1, Rs2
 * KHMX8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do Q7xQ7 element multiplications simultaneously. The Q14 results are then reduced to Q7
 * numbers again.
 *
 * **Description**:\n
 * For the `KHM8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1
 * with the top 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
 * content of 16-bit chunks in Rs1 with the bottom 8-bit Q7 content of 16-bit chunks in Rs2.
 * For the `KHMX8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1 with the
 * bottom 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
 * content of 16-bit chunks in Rs1 with the top 8-bit Q7 content of 16-bit chunks in Rs2.
 * The Q14 results are then right-shifted 7-bits and saturated into Q7 values. The Q7 results are then
 * written into Rd. When both the two Q7 inputs of a multiplication are 0x80, saturation will happen.
 * The result will be saturated to 0x7F and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * if (is `KHM8`) {
 *   op1t = Rs1.B[x+1]; op2t = Rs2.B[x+1]; // top
 *   op1b = Rs1.B[x]; op2b = Rs2.B[x]; // bottom
 * } else if (is `KHMX8`) {
 *   op1t = Rs1.B[x+1]; op2t = Rs2.B[x]; // Rs1 top
 *   op1b = Rs1.B[x]; op2b = Rs2.B[x+1]; // Rs1 bottom
 * }
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   if (0x80 != aop | 0x80 != bop) {
 *     res = (aop s* bop) >> 7;
 *   } else {
 *     res= 0x7F;
 *     OV = 1;
 *   }
 * }
 * Rd.H[x/2] = concat(rest, resb);
 * for RV32, x=0,2
 * for RV64, x=0,2,4,6
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KHMX8(unsigned long a, unsigned long b)
{
    unsigned long rd;   /* filled in by the instruction through operand %0 */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("khmx8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.35.2. KHMX8 ===== */

/* ===== Inline Function Start for 3.36.1. KHM16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
 * \brief KHM16 (SIMD Signed Saturating Q15 Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KHM16 Rd, Rs1, Rs2
 * KHMX16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do Q15xQ15 element multiplications simultaneously. The Q30 results are then reduced to
 * Q15 numbers again.
 *
 * **Description**:\n
 * For the `KHM16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in
 * Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom
 * 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit chunks in
 * Rs2.
 * For the `KHMX16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in Rs1 with the
 * bottom 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom 16-bit Q15
 * content of 32-bit chunks in Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2.
 * The Q30 results are then right-shifted 15-bits and saturated into Q15 values. The Q15 results are
 * then written into Rd. When both the two Q15 inputs of a multiplication are 0x8000, saturation will
 * happen. The result will be saturated to 0x7FFF and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * if (is `KHM16`) {
 *   op1t = Rs1.H[x+1]; op2t = Rs2.H[x+1]; // top
 *   op1b = Rs1.H[x]; op2b = Rs2.H[x]; // bottom
 * } else if (is `KHMX16`) {
 *   op1t = Rs1.H[x+1]; op2t = Rs2.H[x]; // Rs1 top
 *   op1b = Rs1.H[x]; op2b = Rs2.H[x+1]; // Rs1 bottom
 * }
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   if (0x8000 != aop | 0x8000 != bop) {
 *     res = (aop s* bop) >> 15;
 *   } else {
 *     res= 0x7FFF;
 *     OV = 1;
 *   }
 * }
 * Rd.W[x/2] = concat(rest, resb);
 * for RV32: x=0
 * for RV64: x=0,2
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KHM16(unsigned long a, unsigned long b)
{
    unsigned long rd;   /* filled in by the instruction through operand %0 */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("khm16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.36.1. KHM16 ===== */

/* ===== Inline Function Start for 3.36.2. KHMX16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
 * \brief KHMX16 (SIMD Signed Saturating Crossed Q15 Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KHM16 Rd, Rs1, Rs2
 * KHMX16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do Q15xQ15 element multiplications simultaneously. The Q30 results are then reduced to
 * Q15 numbers again.
 *
 * **Description**:\n
 * For the `KHM16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in
 * Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom
 * 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit chunks in
 * Rs2.
 * For the `KHMX16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in Rs1 with the
 * bottom 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom 16-bit Q15
 * content of 32-bit chunks in Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2.
 * The Q30 results are then right-shifted 15-bits and saturated into Q15 values. The Q15 results are
 * then written into Rd. When both the two Q15 inputs of a multiplication are 0x8000, saturation will
 * happen. The result will be saturated to 0x7FFF and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * if (is `KHM16`) {
 *   op1t = Rs1.H[x+1]; op2t = Rs2.H[x+1]; // top
 *   op1b = Rs1.H[x]; op2b = Rs2.H[x]; // bottom
 * } else if (is `KHMX16`) {
 *   op1t = Rs1.H[x+1]; op2t = Rs2.H[x]; // Rs1 top
 *   op1b = Rs1.H[x]; op2b = Rs2.H[x+1]; // Rs1 bottom
 * }
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   if (0x8000 != aop | 0x8000 != bop) {
 *     res = (aop s* bop) >> 15;
 *   } else {
 *     res= 0x7FFF;
 *     OV = 1;
 *   }
 * }
 * Rd.W[x/2] = concat(rest, resb);
 * for RV32: x=0
 * for RV64: x=0,2
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KHMX16(unsigned long a, unsigned long b)
{
    unsigned long rd;   /* filled in by the instruction through operand %0 */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("khmx16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.36.2. KHMX16 ===== */

/* ===== Inline Function Start for 3.37.1. KHMBB ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
 * \brief KHMBB (Signed Saturating Half Multiply B16 x B16)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KHMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 number contents of two 16-bit data in the corresponding portion
 * of the lower 32-bit chunk in registers and then right-shift 15 bits to turn the Q30 result into a Q15
 * number again and saturate the Q15 result into the destination register. If saturation happens, an
 * overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
 * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then right-
 * shifted 15-bits and saturated into a Q15 value. The Q15 value is then sign-extended and written into
 * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
 * to 0x7FFF and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * aop = Rs1.H[0]; bop = Rs2.H[0]; // KHMBB
 * aop = Rs1.H[0]; bop = Rs2.H[1]; // KHMBT
 * aop = Rs1.H[1]; bop = Rs2.H[1]; // KHMTT
 * If (0x8000 != aop | 0x8000 != bop) {
 *   Mresult[31:0] = aop * bop;
 *   res[15:0] = Mresult[30:15];
 * } else {
 *   res[15:0] = 0x7FFF;
 *   OV = 1;
 * }
 * Rd = SE32(res[15:0]); // RV32
 * Rd = SE64(res[15:0]); // RV64
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KHMBB(unsigned int a, unsigned int b)
{
    long rd;    /* filled in by the instruction through operand %0 */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("khmbb %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.37.1. KHMBB ===== */

/* ===== Inline Function Start for 3.37.2. KHMBT ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
 * \brief KHMBT (Signed Saturating Half Multiply B16 x T16)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KHMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 number contents of two 16-bit data in the corresponding portion
 * of the lower 32-bit chunk in registers and then right-shift 15 bits to turn the Q30 result into a Q15
 * number again and saturate the Q15 result into the destination register. If saturation happens, an
 * overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
 * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then right-
 * shifted 15-bits and saturated into a Q15 value. The Q15 value is then sign-extended and written into
 * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
 * to 0x7FFF and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * aop = Rs1.H[0]; bop = Rs2.H[0]; // KHMBB
 * aop = Rs1.H[0]; bop = Rs2.H[1]; // KHMBT
 * aop = Rs1.H[1]; bop = Rs2.H[1]; // KHMTT
 * If (0x8000 != aop | 0x8000 != bop) {
 *   Mresult[31:0] = aop * bop;
 *   res[15:0] = Mresult[30:15];
 * } else {
 *   res[15:0] = 0x7FFF;
 *   OV = 1;
 * }
 * Rd = SE32(res[15:0]); // RV32
 * Rd = SE64(res[15:0]); // RV64
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KHMBT(unsigned int a, unsigned int b)
{
    long rd;    /* filled in by the instruction through operand %0 */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("khmbt %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.37.2. KHMBT ===== */

/* ===== Inline Function Start for 3.37.3. KHMTT ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
 * \brief KHMTT (Signed Saturating Half Multiply T16 x T16)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KHMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 number contents of two 16-bit data in the corresponding portion
 * of the lower 32-bit chunk in registers and then right-shift 15 bits to turn the Q30 result into a Q15
 * number again and saturate the Q15 result into the destination register. If saturation happens, an
 * overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
 * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then right-
 * shifted 15-bits and saturated into a Q15 value. The Q15 value is then sign-extended and written into
 * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
 * to 0x7FFF and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * aop = Rs1.H[0]; bop = Rs2.H[0]; // KHMBB
 * aop = Rs1.H[0]; bop = Rs2.H[1]; // KHMBT
 * aop = Rs1.H[1]; bop = Rs2.H[1]; // KHMTT
 * If (0x8000 != aop | 0x8000 != bop) {
 *   Mresult[31:0] = aop * bop;
 *   res[15:0] = Mresult[30:15];
 * } else {
 *   res[15:0] = 0x7FFF;
 *   OV = 1;
 * }
 * Rd = SE32(res[15:0]); // RV32
 * Rd = SE64(res[15:0]); // RV64
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KHMTT(unsigned int a, unsigned int b)
{
    long rd;    /* filled in by the instruction through operand %0 */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("khmtt %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.37.3. KHMTT ===== */

/* ===== Inline Function Start for 3.38.1. KMABB ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief KMABB (SIMD Saturating Signed Multiply Bottom Halfs & Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMABB Rd, Rs1, Rs2
 * KMABT Rd, Rs1, Rs2
 * KMATT Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of 32-bit elements in a register with the 16-bit content
 * of 32-bit elements in another register and add the result to the content of 32-bit elements in the
 * third register. The addition result may be saturated and is written to the third register.
 * * KMABB: rd.W[x] + bottom*bottom (per 32-bit element)
 * * KMABT rd.W[x] + bottom*top (per 32-bit element)
 * * KMATT rd.W[x] + top*top (per 32-bit element)
 *
 * **Description**:\n
 * For the `KMABB` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the bottom 16-bit content of 32-bit elements in Rs2.
 * For the `KMABT` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the top 16-bit content of 32-bit elements in Rs2.
 * For the `KMATT` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
 * top 16-bit content of 32-bit elements in Rs2.
 * The multiplication result is added to the content of 32-bit elements in Rd. If the addition result is
 * beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to
 * 1. The results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as
 * signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMABB
 * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMABT
 * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]); // KMATT
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMABB(long t, unsigned long a, unsigned long b)
{
    long acc = t;   /* "+r": operand %0 is both read and written by the instruction */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("kmabb %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.38.1. KMABB ===== */

/* ===== Inline Function Start for 3.38.2. KMABT ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief KMABT (SIMD Saturating Signed Multiply Bottom & Top Halfs & Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMABB Rd, Rs1, Rs2
 * KMABT Rd, Rs1, Rs2
 * KMATT Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of 32-bit elements in a register with the 16-bit content
 * of 32-bit elements in another register and add the result to the content of 32-bit elements in the
 * third register. The addition result may be saturated and is written to the third register.
 * * KMABB: rd.W[x] + bottom*bottom (per 32-bit element)
 * * KMABT rd.W[x] + bottom*top (per 32-bit element)
 * * KMATT rd.W[x] + top*top (per 32-bit element)
 *
 * **Description**:\n
 * For the `KMABB` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the bottom 16-bit content of 32-bit elements in Rs2.
 * For the `KMABT` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the top 16-bit content of 32-bit elements in Rs2.
 * For the `KMATT` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
 * top 16-bit content of 32-bit elements in Rs2.
 * The multiplication result is added to the content of 32-bit elements in Rd. If the addition result is
 * beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to
 * 1. The results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as
 * signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMABB
 * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMABT
 * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]); // KMATT
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMABT(long t, unsigned long a, unsigned long b)
{
    long acc = t;   /* "+r": operand %0 is both read and written by the instruction */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("kmabt %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.38.2. KMABT ===== */

/* ===== Inline Function Start for 3.38.3. KMATT ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief KMATT (SIMD Saturating Signed Multiply Top Halfs & Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMABB Rd, Rs1, Rs2
 * KMABT Rd, Rs1, Rs2
 * KMATT Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of 32-bit elements in a register with the 16-bit content
 * of 32-bit elements in another register and add the result to the content of 32-bit elements in the
 * third register. The addition result may be saturated and is written to the third register.
 * * KMABB: rd.W[x] + bottom*bottom (per 32-bit element)
 * * KMABT rd.W[x] + bottom*top (per 32-bit element)
 * * KMATT rd.W[x] + top*top (per 32-bit element)
 *
 * **Description**:\n
 * For the `KMABB` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the bottom 16-bit content of 32-bit elements in Rs2.
 * For the `KMABT` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the top 16-bit content of 32-bit elements in Rs2.
 * For the `KMATT` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
 * top 16-bit content of 32-bit elements in Rs2.
 * The multiplication result is added to the content of 32-bit elements in Rd. If the addition result is
 * beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to
 * 1. The results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as
 * signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMABB
 * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMABT
 * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]); // KMATT
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMATT(long t, unsigned long a, unsigned long b)
{
    long acc = t;   /* "+r": operand %0 is both read and written by the instruction */

    /* %1 and %2 carry a and b as the two register source operands */
    __ASM volatile("kmatt %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.38.3. KMATT ===== */

/* ===== Inline Function Start for 3.39.1. KMADA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief KMADA (SIMD Saturating Signed Multiply Two Halfs and Two Adds)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMADA Rd, Rs1, Rs2
 * KMAXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then adds
 * the two 32-bit results and 32-bit elements in a third register together. The addition result may be
 * saturated.
 * * KMADA: rd.W[x] + top*top + bottom*bottom (per 32-bit element)
 * * KMAXDA: rd.W[x] + top*bottom + bottom*top (per 32-bit element)
 *
 * **Description**:\n
 * For the `KMADA` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of
 * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
 * elements in Rs2.
 * For the `KMAXDA` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
 * bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of multiplying
 * the bottom 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit elements in
 * Rs2.
 * The result is added to the content of 32-bit elements in Rd. If the addition result is beyond the Q31
 * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The 32-bit
 * results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * // KMADA
 * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
 * // KMAXDA
 * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMADA(long t, unsigned long a, unsigned long b)
{
    /* The accumulator is operand %0 (Rd): read and written by the instruction ("+r"). */
    long acc = t;

    __ASM volatile(
        "kmada %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );
    return acc;
}
/* ===== Inline Function End for 3.39.1. KMADA ===== */

/* ===== Inline Function Start for 3.39.2. KMAXDA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief KMAXDA (SIMD Saturating Signed Crossed Multiply Two Halfs and Two Adds)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMADA Rd, Rs1, Rs2
 * KMAXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then adds
 * the two 32-bit results and 32-bit elements in a third register together. The addition result may be
 * saturated.
 * * KMADA: rd.W[x] + top*top + bottom*bottom (per 32-bit element)
 * * KMAXDA: rd.W[x] + top*bottom + bottom*top (per 32-bit element)
 *
 * **Description**:\n
 * For the `KMADA` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of
 * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
 * elements in Rs2.
 * For the `KMAXDA` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
 * bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of multiplying
 * the bottom 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit elements in
 * Rs2.
 * The result is added to the content of 32-bit elements in Rd. If the addition result is beyond the Q31
 * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The 32-bit
 * results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * // KMADA
 * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
 * // KMAXDA
 * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMAXDA(long t, unsigned long a, unsigned long b)
{
    /* The accumulator is operand %0 (Rd): read and written by the instruction ("+r"). */
    long acc = t;

    __ASM volatile(
        "kmaxda %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );
    return acc;
}
/* ===== Inline Function End for 3.39.2. KMAXDA ===== */

/* ===== Inline Function Start for 3.40.1. KMADS ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief KMADS (SIMD Saturating Signed Multiply Two Halfs & Subtract & Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMADS Rd, Rs1, Rs2
 * KMADRS Rd, Rs1, Rs2
 * KMAXDS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then
 * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
 * the corresponding 32-bit elements in a third register. The addition result may be saturated.
 * * KMADS: rd.W[x] + (top*top - bottom*bottom) (per 32-bit element)
 * * KMADRS: rd.W[x] + (bottom*bottom - top*top) (per 32-bit element)
 * * KMAXDS: rd.W[x] + (top*bottom - bottom*top) (per 32-bit element)
 *
 * **Description**:\n
 * For the `KMADS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the bottom 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
 * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
 * elements in Rs2.
 * For the `KMADRS` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
 * top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
 * multiplying the bottom 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-
 * bit elements in Rs2.
 * For the `KMAXDS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
 * multiplying the top 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-bit
 * elements in Rs2.
 * The subtraction result is then added to the content of the corresponding 32-bit elements in Rd. If the
 * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
 * the OV bit is set to 1. The 32-bit results after saturation are written to Rd. The 16-bit contents of Rs1
 * and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * // KMADS
 * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
 * // KMADRS
 * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
 * // KMAXDS
 * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMADS(long t, unsigned long a, unsigned long b)
{
    /* The accumulator is operand %0 (Rd): read and written by the instruction ("+r"). */
    long acc = t;

    __ASM volatile(
        "kmads %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );
    return acc;
}
/* ===== Inline Function End for 3.40.1. KMADS ===== */

/* ===== Inline Function Start for 3.40.2. KMADRS ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief KMADRS (SIMD Saturating Signed Multiply Two Halfs & Reverse Subtract & Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMADS Rd, Rs1, Rs2
 * KMADRS Rd, Rs1, Rs2
 * KMAXDS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then
 * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
 * the corresponding 32-bit elements in a third register. The addition result may be saturated.
 * * KMADS: rd.W[x] + (top*top - bottom*bottom) (per 32-bit element)
 * * KMADRS: rd.W[x] + (bottom*bottom - top*top) (per 32-bit element)
 * * KMAXDS: rd.W[x] + (top*bottom - bottom*top) (per 32-bit element)
 *
 * **Description**:\n
 * For the `KMADS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the bottom 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
 * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
 * elements in Rs2.
 * For the `KMADRS` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
 * top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
 * multiplying the bottom 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-
 * bit elements in Rs2.
 * For the `KMAXDS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
 * multiplying the top 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-bit
 * elements in Rs2.
 * The subtraction result is then added to the content of the corresponding 32-bit elements in Rd. If the
 * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
 * the OV bit is set to 1. The 32-bit results after saturation are written to Rd. The 16-bit contents of Rs1
 * and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * // KMADS
 * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
 * // KMADRS
 * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
 * // KMAXDS
 * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMADRS(long t, unsigned long a, unsigned long b)
{
    /* The accumulator is operand %0 (Rd): read and written by the instruction ("+r"). */
    long acc = t;

    __ASM volatile(
        "kmadrs %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );
    return acc;
}
/* ===== Inline Function End for 3.40.2. KMADRS ===== */

/* ===== Inline Function Start for 3.40.3. KMAXDS ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief KMAXDS (SIMD Saturating Signed Crossed Multiply Two Halfs & Subtract & Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMADS Rd, Rs1, Rs2
 * KMADRS Rd, Rs1, Rs2
 * KMAXDS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then
 * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
 * the corresponding 32-bit elements in a third register. The addition result may be saturated.
 * * KMADS: rd.W[x] + (top*top - bottom*bottom) (per 32-bit element)
 * * KMADRS: rd.W[x] + (bottom*bottom - top*top) (per 32-bit element)
 * * KMAXDS: rd.W[x] + (top*bottom - bottom*top) (per 32-bit element)
 *
 * **Description**:\n
 * For the `KMADS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the bottom 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
 * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
 * elements in Rs2.
 * For the `KMADRS` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
 * top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
 * multiplying the bottom 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-
 * bit elements in Rs2.
 * For the `KMAXDS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
 * multiplying the top 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-bit
 * elements in Rs2.
 * The subtraction result is then added to the content of the corresponding 32-bit elements in Rd. If the
 * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
 * the OV bit is set to 1. The 32-bit results after saturation are written to Rd. The 16-bit contents of Rs1
 * and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * // KMADS
 * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
 * // KMADRS
 * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
 * // KMAXDS
 * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMAXDS(long t, unsigned long a, unsigned long b)
{
    /* The accumulator is operand %0 (Rd): read and written by the instruction ("+r"). */
    long acc = t;

    __ASM volatile(
        "kmaxds %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );
    return acc;
}
/* ===== Inline Function End for 3.40.3. KMAXDS ===== */

/* ===== Inline Function Start for 3.41. KMAR64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
 * \brief KMAR64 (Signed Multiply and Saturating Add to 64-Bit Data)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * KMAR64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit signed elements in two registers and add the 64-bit multiplication
 * results to the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is
 * saturated to the Q63 range and written back to the pair of registers (RV32) or the register (RV64).
 *
 * **RV32 Description**:\n
 * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It adds
 * the 64-bit multiplication result to the 64-bit signed data of an even/odd pair of registers specified by
 * Rd(4,1) with unlimited precision. If the 64-bit addition result is beyond the Q63 number range (-2^63 <=
 * Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The saturated result is written back
 * to the even/odd pair of registers specified by Rd(4,1).
 * Rd(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
 * pair includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It
 * adds the 64-bit multiplication results to the 64-bit signed data of Rd with unlimited precision. If the
 * 64-bit addition result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range
 * and the OV bit is set to 1. The saturated result is written back to Rd.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * result = R[t_H].R[t_L] + (Rs1 * Rs2);
 * if (result > (2^63)-1) {
 *   result = (2^63)-1; OV = 1;
 * } else if (result < -2^63) {
 *   result = -2^63; OV = 1;
 * }
 * R[t_H].R[t_L] = result;
 * RV64:
 * // `result` has unlimited precision
 * result = Rd + (Rs1.W[0] * Rs2.W[0]) + (Rs1.W[1] * Rs2.W[1]);
 * if (result > (2^63)-1) {
 *   result = (2^63)-1; OV = 1;
 * } else if (result < -2^63) {
 *   result = -2^63; OV = 1;
 * }
 * Rd = result;
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_KMAR64(long long t, long a, long b)
{
    /* 64-bit accumulator bound as operand %0 (Rd); the instruction reads and updates it ("+r"). */
    long long acc64 = t;

    __ASM volatile(
        "kmar64 %0, %1, %2"
        : "+r"(acc64)
        : "r"(a), "r"(b)
    );
    return acc64;
}
/* ===== Inline Function End for 3.41. KMAR64 ===== */

/* ===== Inline Function Start for 3.42.1. KMDA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief KMDA (SIMD Signed Multiply Two Halfs and Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMDA Rd, Rs1, Rs2
 * KMXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
 * adds the two 32-bit results together. The addition result may be saturated.
 * * KMDA: top*top + bottom*bottom (per 32-bit element)
 * * KMXDA: top*bottom + bottom*top (per 32-bit element)
 *
 * **Description**:\n
 * For the `KMDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
 * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32-
 * bit elements of Rs2.
 * For the `KMXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
 * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of the
 * 32-bit elements of Rs2.
 * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^31-1.
 * The final results are written to Rd. The 16-bit contents are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * if ((Rs1.W[x] != 0x80008000) or (Rs2.W[x] != 0x80008000)) {
 *   // KMDA
 *   Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
 *   // KMXDA
 *   Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
 * } else {
 *   Rd.W[x] = 0x7fffffff;
 *   OV = 1;
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMDA(unsigned long a, unsigned long b)
{
    /* Operand %0 (Rd) is write-only here ("=r"); there is no accumulator input. */
    long sum;

    __ASM volatile(
        "kmda %0, %1, %2"
        : "=r"(sum)
        : "r"(a), "r"(b)
    );
    return sum;
}
/* ===== Inline Function End for 3.42.1. KMDA ===== */

/* ===== Inline Function Start for 3.42.2. KMXDA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief KMXDA (SIMD Signed Crossed Multiply Two Halfs and Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMDA Rd, Rs1, Rs2
 * KMXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
 * adds the two 32-bit results together. The addition result may be saturated.
 * * KMDA: top*top + bottom*bottom (per 32-bit element)
 * * KMXDA: top*bottom + bottom*top (per 32-bit element)
 *
 * **Description**:\n
 * For the `KMDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
 * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32-
 * bit elements of Rs2.
 * For the `KMXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
 * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of the
 * 32-bit elements of Rs2.
 * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^31-1.
 * The final results are written to Rd. The 16-bit contents are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * if ((Rs1.W[x] != 0x80008000) or (Rs2.W[x] != 0x80008000)) {
 *   // KMDA
 *   Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
 *   // KMXDA
 *   Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
 * } else {
 *   Rd.W[x] = 0x7fffffff;
 *   OV = 1;
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMXDA(unsigned long a, unsigned long b)
{
    /* Operand %0 (Rd) is write-only here ("=r"); there is no accumulator input. */
    long sum;

    __ASM volatile(
        "kmxda %0, %1, %2"
        : "=r"(sum)
        : "r"(a), "r"(b)
    );
    return sum;
}
/* ===== Inline Function End for 3.42.2. KMXDA ===== */

/* ===== Inline Function Start for 3.43.1. KMMAC ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
 * \brief KMMAC (SIMD Saturating MSW Signed Multiply Word and Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMAC Rd, Rs1, Rs2
 * KMMAC.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of two registers and add the most significant
 * 32-bit results with the signed 32-bit integer elements of a third register. The addition results are
 * saturated first and then written back to the third register. The `.u` form performs an additional
 * rounding up operation on the multiplication results before adding the most significant 32-bit part
 * of the results.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
 * and adds the most significant 32-bit multiplication results with the signed 32-bit elements of Rd. If
 * the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range
 * and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the
 * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
 * adding a 1 to bit 31 of the results.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][63:31] + 1;
 *   res[x] = Rd.W[x] + Round[x][32:1];
 * } else {
 *   res[x] = Rd.W[x] + Mres[x][63:32];
 * }
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMAC(long t, long a, long b)
{
    /* The accumulator is operand %0 (Rd): read and written by the instruction ("+r"). */
    long acc = t;

    __ASM volatile(
        "kmmac %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );
    return acc;
}
/* ===== Inline Function End for 3.43.1. KMMAC ===== */

/* ===== Inline Function Start for 3.43.2. KMMAC.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
 * \brief KMMAC.u (SIMD Saturating MSW Signed Multiply Word and Add with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMAC Rd, Rs1, Rs2
 * KMMAC.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of two registers and add the most significant
 * 32-bit results with the signed 32-bit integer elements of a third register. The addition results are
 * saturated first and then written back to the third register. The `.u` form performs an additional
 * rounding up operation on the multiplication results before adding the most significant 32-bit part
 * of the results.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
 * and adds the most significant 32-bit multiplication results with the signed 32-bit elements of Rd. If
 * the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range
 * and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the
 * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
 * adding a 1 to bit 31 of the results.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][63:31] + 1;
 *   res[x] = Rd.W[x] + Round[x][32:1];
 * } else {
 *   res[x] = Rd.W[x] + Mres[x][63:32];
 * }
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMAC_U(long t, long a, long b)
{
    /* Emits the `.u` form of KMMAC; the accumulator is the read-write operand %0 (Rd). */
    long acc = t;

    __ASM volatile(
        "kmmac.u %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );
    return acc;
}
/* ===== Inline Function End for 3.43.2. KMMAC.u ===== */

/* ===== Inline Function Start for 3.44.1. KMMAWB ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief KMMAWB (SIMD Saturating MSW Signed Multiply Word and Bottom Half and Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMAWB Rd, Rs1, Rs2
 * KMMAWB.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
 * corresponding 32-bit elements of another register and add the most significant 32-bit results with
 * the corresponding signed 32-bit elements of a third register. The addition result is written to the
 * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication
 * results from the most significant discarded bit before the addition operations.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content
 * of the corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication
 * results with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31
 * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results
 * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the
 * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to
 * bit 15 of the result before the addition operations.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][47:15] + 1;
 *   res[x] = Rd.W[x] + Round[x][32:1];
 * } else {
 *   res[x] = Rd.W[x] + Mres[x][47:16];
 * }
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMAWB(long t, unsigned long a, unsigned long b)
{
    /* The accumulator is operand %0 (Rd): read and written by the instruction ("+r"). */
    long acc = t;

    __ASM volatile(
        "kmmawb %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );
    return acc;
}
/* ===== Inline Function End for 3.44.1. KMMAWB ===== */

/* ===== Inline Function Start for 3.44.2. KMMAWB.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief KMMAWB.u (SIMD Saturating MSW Signed Multiply Word and Bottom Half and Add with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMAWB Rd, Rs1, Rs2
 * KMMAWB.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
 * corresponding 32-bit elements of another register and add the most significant 32-bit results with
 * the corresponding signed 32-bit elements of a third register. The addition result is written to the
 * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication
 * results from the most significant discarded bit before the addition operations.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content
 * of the corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication
 * results with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31
 * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results
 * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the
 * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to
 * bit 15 of the result before the addition operations.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][47:15] + 1;
 *   res[x] = Rd.W[x] + Round[x][32:1];
 * } else {
 *   res[x] = Rd.W[x] + Mres[x][47:16];
 * }
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMAWB_U(long t, unsigned long a, unsigned long b)
{
    /* Emits the `.u` form of KMMAWB; the accumulator is the read-write operand %0 (Rd). */
    long acc = t;

    __ASM volatile(
        "kmmawb.u %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );
    return acc;
}
/* ===== Inline Function End for 3.44.2. KMMAWB.u ===== */

/* ===== Inline Function Start for 3.45.1. KMMAWB2 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief KMMAWB2 (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2 and Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMAWB2 Rd, Rs1, Rs2
 * KMMAWB2.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit elements of one register and the bottom 16-bit of the
 * corresponding 32-bit elements of another register, double the multiplication results and add the
 * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third
 * register. The saturated addition result is written to the corresponding 32-bit elements of the third
 * register. The `.u` form rounds up the multiplication results from the most significant discarded bit
 * before the addition operations.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15
 * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
 * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed
 * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
 * saturated to the range and the OV bit is set to 1. The results after saturation are written to the
 * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant
 * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of
 * the result before the addition operations.
 *
 * **Operations**:\n
 * ~~~
 * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) {
 *   addop.W[x] = 0x7fffffff;
 *   OV = 1;
 * } else {
 *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0];
 *   if (`.u` form) {
 *     Mres[x][47:14] = Mres[x][47:14] + 1;
 *   }
 *   addop.W[x] = Mres[x][46:15]; // doubling
 * }
 * res[x] = Rd.W[x] + addop.W[x];
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMAWB2(long t, unsigned long a, unsigned long b)
{
    /* The accumulator is operand %0 (Rd): read and written by the instruction ("+r"). */
    long acc = t;

    __ASM volatile(
        "kmmawb2 %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );
    return acc;
}
/* ===== Inline Function End for 3.45.1. KMMAWB2 ===== */

/* ===== Inline Function Start for 3.45.2. KMMAWB2.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief KMMAWB2.u (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2 and Add with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMAWB2 Rd, Rs1, Rs2
 * KMMAWB2.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit elements of one register and the bottom 16-bit of the
 * corresponding 32-bit elements of another register, double the multiplication results and add the
 * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third
 * register. The saturated addition result is written to the corresponding 32-bit elements of the third
 * register. The `.u` form rounds up the multiplication results from the most significant discarded bit
 * before the addition operations.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15
 * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
 * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed
 * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
 * saturated to the range and the OV bit is set to 1. The results after saturation are written to the
 * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant
 * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of
 * the result before the addition operations.
 *
 * **Operations**:\n
 * ~~~
 * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) {
 *   addop.W[x] = 0x7fffffff;
 *   OV = 1;
 * } else {
 *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0];
 *   if (`.u` form) {
 *     Mres[x][47:14] = Mres[x][47:14] + 1;
 *   }
 *   addop.W[x] = Mres[x][46:15]; // doubling
 * }
 * res[x] = Rd.W[x] + addop.W[x];
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMAWB2_U(long t, unsigned long a, unsigned long b)
{
    /* Rounding (.u) variant. Rd is a read-modify-write operand ("+r"):
     * it enters holding t and leaves holding the saturated sum. */
    long acc = t;

    __ASM volatile("kmmawb2.u %0, %1, %2" : "+r"(acc) : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.45.2. KMMAWB2.u ===== */

/* ===== Inline Function Start for 3.46.1. KMMAWT ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief KMMAWT (SIMD Saturating MSW Signed Multiply Word and Top Half and Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMAWT Rd, Rs1, Rs2
 * KMMAWT.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of one register and the signed top 16-bit of the
 * corresponding 32-bit elements of another register and add the most significant 32-bit results with
 * the corresponding signed 32-bit elements of a third register. The addition results are written to the
 * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication
 * results from the most significant discarded bit before the addition operations.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed top 16-bit of the
 * corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication results
 * with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31
 * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results
 * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the
 * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to
 * bit 15 of the result before the addition operations.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][47:15] + 1;
 *   res[x] = Rd.W[x] + Round[x][32:1];
 * } else {
 *   res[x] = Rd.W[x] + Mres[x][47:16];
 * }
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMAWT(long t, unsigned long a, unsigned long b)
{
    /* t seeds Rd, which the instruction both reads and writes ("+r");
     * a is Rs1 (word operand) and b is Rs2 (its top halfword is used,
     * per the Operations pseudo-code in the function documentation). */
    long acc = t;

    __ASM volatile("kmmawt %0, %1, %2" : "+r"(acc) : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.46.1. KMMAWT ===== */

/* ===== Inline Function Start for 3.46.2. KMMAWT.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief KMMAWT.u (SIMD Saturating MSW Signed Multiply Word and Top Half and Add with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMAWT Rd, Rs1, Rs2
 * KMMAWT.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of one register and the signed top 16-bit of the
 * corresponding 32-bit elements of another register and add the most significant 32-bit results with
 * the corresponding signed 32-bit elements of a third register. The addition results are written to the
 * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication
 * results from the most significant discarded bit before the addition operations.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed top 16-bit of the
 * corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication results
 * with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31
 * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results
 * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the
 * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to
 * bit 15 of the result before the addition operations.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][47:15] + 1;
 *   res[x] = Rd.W[x] + Round[x][32:1];
 * } else {
 *   res[x] = Rd.W[x] + Mres[x][47:16];
 * }
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMAWT_U(long t, unsigned long a, unsigned long b)
{
    /* Rounding (.u) variant. The accumulator is an in/out register operand
     * ("+r"), so it is staged in a local initialised from t. */
    long acc = t;

    __ASM volatile("kmmawt.u %0, %1, %2" : "+r"(acc) : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.46.2. KMMAWT.u ===== */

/* ===== Inline Function Start for 3.47.1. KMMAWT2 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief KMMAWT2 (SIMD Saturating MSW Signed Multiply Word and Top Half & 2 and Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMAWT2 Rd, Rs1, Rs2
 * KMMAWT2.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit elements of one register and the top 16-bit of the
 * corresponding 32-bit elements of another register, double the multiplication results and add the
 * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third
 * register. The saturated addition result is written to the corresponding 32-bit elements of the third
 * register. The `.u` form rounds up the multiplication results from the most significant discarded bit
 * before the addition operations.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15
 * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
 * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed
 * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
 * saturated to the range and the OV bit is set to 1. The results after saturation are written to the
 * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant
 * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of
 * the result before the addition operations.
 *
 * **Operations**:\n
 * ~~~
 * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) {
 *   addop.W[x] = 0x7fffffff;
 *   OV = 1;
 * } else {
 *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1];
 *   if (`.u` form) {
 *     Mres[x][47:14] = Mres[x][47:14] + 1;
 *   }
 *   addop.W[x] = Mres[x][46:15]; // doubling
 * }
 * res[x] = Rd.W[x] + addop.W[x];
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMAWT2(long t, unsigned long a, unsigned long b)
{
    /* Rd (operand 0) is read and then overwritten by the instruction,
     * hence the "+r" constraint on the local accumulator copy of t. */
    long acc = t;

    __ASM volatile("kmmawt2 %0, %1, %2" : "+r"(acc) : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.47.1. KMMAWT2 ===== */

/* ===== Inline Function Start for 3.47.2. KMMAWT2.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief KMMAWT2.u (SIMD Saturating MSW Signed Multiply Word and Top Half & 2 and Add with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMAWT2 Rd, Rs1, Rs2
 * KMMAWT2.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit elements of one register and the top 16-bit of the
 * corresponding 32-bit elements of another register, double the multiplication results and add the
 * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third
 * register. The saturated addition result is written to the corresponding 32-bit elements of the third
 * register. The `.u` form rounds up the multiplication results from the most significant discarded bit
 * before the addition operations.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15
 * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
 * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed
 * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
 * saturated to the range and the OV bit is set to 1. The results after saturation are written to the
 * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant
 * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of
 * the result before the addition operations.
 *
 * **Operations**:\n
 * ~~~
 * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) {
 *   addop.W[x] = 0x7fffffff;
 *   OV = 1;
 * } else {
 *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1];
 *   if (`.u` form) {
 *     Mres[x][47:14] = Mres[x][47:14] + 1;
 *   }
 *   addop.W[x] = Mres[x][46:15]; // doubling
 * }
 * res[x] = Rd.W[x] + addop.W[x];
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMAWT2_U(long t, unsigned long a, unsigned long b)
{
    /* Rounding (.u) variant. t is carried through Rd as an in/out ("+r")
     * operand; a and b are the Rs1 and Rs2 inputs. */
    long acc = t;

    __ASM volatile("kmmawt2.u %0, %1, %2" : "+r"(acc) : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.47.2. KMMAWT2.u ===== */

/* ===== Inline Function Start for 3.48.1. KMMSB ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
 * \brief KMMSB (SIMD Saturating MSW Signed Multiply Word and Subtract)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMSB Rd, Rs1, Rs2
 * KMMSB.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of two registers and subtract the most
 * significant 32-bit results from the signed 32-bit elements of a third register. The subtraction results
 * are written to the third register. The `.u` form performs an additional rounding up operation on
 * the multiplication results before subtracting the most significant 32-bit part of the results.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
 * and subtracts the most significant 32-bit multiplication results from the signed 32-bit elements of
 * Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the
 * range and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the
 * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
 * adding a 1 to bit 31 of the results.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][63:31] + 1;
 *   res[x] = Rd.W[x] - Round[x][32:1];
 * } else {
 *   res[x] = Rd.W[x] - Mres[x][63:32];
 * }
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMSB(long t, long a, long b)
{
    /* Rd starts as the minuend t and receives the saturated difference,
     * so it is bound as a read-write ("+r") register operand. */
    long acc = t;

    __ASM volatile("kmmsb %0, %1, %2" : "+r"(acc) : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.48.1. KMMSB ===== */

/* ===== Inline Function Start for 3.48.2. KMMSB.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
 * \brief KMMSB.u (SIMD Saturating MSW Signed Multiply Word and Subtraction with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMSB Rd, Rs1, Rs2
 * KMMSB.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of two registers and subtract the most
 * significant 32-bit results from the signed 32-bit elements of a third register. The subtraction results
 * are written to the third register. The `.u` form performs an additional rounding up operation on
 * the multiplication results before subtracting the most significant 32-bit part of the results.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
 * and subtracts the most significant 32-bit multiplication results from the signed 32-bit elements of
 * Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the
 * range and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the
 * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
 * adding a 1 to bit 31 of the results.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][63:31] + 1;
 *   res[x] = Rd.W[x] - Round[x][32:1];
 * } else {
 *   res[x] = Rd.W[x] - Mres[x][63:32];
 * }
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMSB_U(long t, long a, long b)
{
    /* Rounding (.u) variant. The accumulator register is both source and
     * destination ("+r"); a and b supply Rs1 and Rs2. */
    long acc = t;

    __ASM volatile("kmmsb.u %0, %1, %2" : "+r"(acc) : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.48.2. KMMSB.u ===== */

/* ===== Inline Function Start for 3.49.1. KMMWB2 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief KMMWB2 (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMWB2 Rd, Rs1, Rs2
 * KMMWB2.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
 * corresponding 32-bit elements of another register, double the multiplication results and write the
 * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u`
 * form rounds up the results from the most significant discarded bit.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15
 * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
 * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit
 * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit
 * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results.
 *
 * **Operations**:\n
 * ~~~
 * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) {
 *   Rd.W[x] = 0x7fffffff;
 *   OV = 1;
 * } else {
 *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0];
 *   if (`.u` form) {
 *     Round[x][32:0] = Mres[x][46:14] + 1;
 *     Rd.W[x] = Round[x][32:1];
 *   } else {
 *     Rd.W[x] = Mres[x][46:15];
 *   }
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMWB2(long a, unsigned long b)
{
    /* Rd is write-only here ("=r"): there is no accumulator input,
     * only the Rs1 (a) and Rs2 (b) sources. */
    long rd;

    __ASM volatile("kmmwb2 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.49.1. KMMWB2 ===== */

/* ===== Inline Function Start for 3.49.2. KMMWB2.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief KMMWB2.u (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2 with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMWB2 Rd, Rs1, Rs2
 * KMMWB2.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
 * corresponding 32-bit elements of another register, double the multiplication results and write the
 * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u`
 * form rounds up the results from the most significant discarded bit.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15
 * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
 * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit
 * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit
 * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results.
 *
 * **Operations**:\n
 * ~~~
 * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) {
 *   Rd.W[x] = 0x7fffffff;
 *   OV = 1;
 * } else {
 *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0];
 *   if (`.u` form) {
 *     Round[x][32:0] = Mres[x][46:14] + 1;
 *     Rd.W[x] = Round[x][32:1];
 *   } else {
 *     Rd.W[x] = Mres[x][46:15];
 *   }
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMWB2_U(long a, unsigned long b)
{
    /* Rounding (.u) variant. Destination register is output-only ("=r"). */
    long rd;

    __ASM volatile("kmmwb2.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.49.2. KMMWB2.u ===== */

/* ===== Inline Function Start for 3.50.1. KMMWT2 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief KMMWT2 (SIMD Saturating MSW Signed Multiply Word and Top Half & 2)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMWT2 Rd, Rs1, Rs2
 * KMMWT2.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the
 * corresponding 32-bit elements of another register, double the multiplication results and write the
 * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u`
 * form rounds up the results from the most significant discarded bit.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15
 * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
 * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit
 * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit
 * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results.
 *
 * **Operations**:\n
 * ~~~
 * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) {
 *   Rd.W[x] = 0x7fffffff;
 *   OV = 1;
 * } else {
 *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1];
 *   if (`.u` form) {
 *     Round[x][32:0] = Mres[x][46:14] + 1;
 *     Rd.W[x] = Round[x][32:1];
 *   } else {
 *     Rd.W[x] = Mres[x][46:15];
 *   }
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMWT2(long a, unsigned long b)
{
    /* Output-only destination ("=r"); a maps to Rs1 and b to Rs2. */
    long rd;

    __ASM volatile("kmmwt2 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.50.1. KMMWT2 ===== */

/* ===== Inline Function Start for 3.50.2. KMMWT2.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief KMMWT2.u (SIMD Saturating MSW Signed Multiply Word and Top Half & 2 with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMMWT2 Rd, Rs1, Rs2
 * KMMWT2.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the
 * corresponding 32-bit elements of another register, double the multiplication results and write the
 * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u`
 * form rounds up the results from the most significant discarded bit.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15
 * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
 * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit
 * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit
 * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results.
 *
 * **Operations**:\n
 * ~~~
 * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) {
 *   Rd.W[x] = 0x7fffffff;
 *   OV = 1;
 * } else {
 *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1];
 *   if (`.u` form) {
 *     Round[x][32:0] = Mres[x][46:14] + 1;
 *     Rd.W[x] = Round[x][32:1];
 *   } else {
 *     Rd.W[x] = Mres[x][46:15];
 *   }
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMMWT2_U(long a, unsigned long b)
{
    /* Rounding (.u) variant. Rd is written but never read ("=r"). */
    long rd;

    __ASM volatile("kmmwt2.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.50.2. KMMWT2.u ===== */

/* ===== Inline Function Start for 3.51.1. KMSDA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief KMSDA (SIMD Saturating Signed Multiply Two Halfs & Add & Subtract)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMSDA Rd, Rs1, Rs2
 * KMSXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers, and then
 * subtract the two 32-bit results from the corresponding 32-bit elements of a third register. The
 * subtraction result may be saturated.
 * * KMSDA: rd.W[x] - top*top - bottom*bottom (per 32-bit element)
 * * KMSXDA: rd.W[x] - top*bottom - bottom*top (per 32-bit element)
 *
 * **Description**:\n
 * For the `KMSDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of
 * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
 * For the `KMSXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of the
 * 32-bit elements of Rs1 with the bottom 16-bit content of the 32-bit elements of Rs2.
 * The two 32-bit multiplication results are then subtracted from the content of the corresponding 32-
 * bit elements of Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
 * saturated to the range and the OV bit is set to 1. The results after saturation are written to Rd. The
 * 16-bit contents are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * // KMSDA
 * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
 * // KMSXDA
 * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMSDA(long t, unsigned long a, unsigned long b)
{
    /* Rd holds t on entry and the saturated result on exit, so it is an
     * in/out ("+r") operand; a and b are the packed-halfword sources. */
    long acc = t;

    __ASM volatile("kmsda %0, %1, %2" : "+r"(acc) : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.51.1. KMSDA ===== */

/* ===== Inline Function Start for 3.51.2. KMSXDA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief KMSXDA (SIMD Saturating Signed Crossed Multiply Two Halfs & Add & Subtract)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KMSDA Rd, Rs1, Rs2
 * KMSXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers, and then
 * subtract the two 32-bit results from the corresponding 32-bit elements of a third register. The
 * subtraction result may be saturated.
 * * KMSDA: rd.W[x] - top*top - bottom*bottom (per 32-bit element)
 * * KMSXDA: rd.W[x] - top*bottom - bottom*top (per 32-bit element)
 *
 * **Description**:\n
 * For the `KMSDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of
 * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
 * For the `KMSXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of the
 * 32-bit elements of Rs1 with the bottom 16-bit content of the 32-bit elements of Rs2.
 * The two 32-bit multiplication results are then subtracted from the content of the corresponding 32-
 * bit elements of Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
 * saturated to the range and the OV bit is set to 1. The results after saturation are written to Rd. The
 * 16-bit contents are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * // KMSDA
 * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
 * // KMSXDA
 * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMSXDA(long t, unsigned long a, unsigned long b)
{
    /* Crossed-halfword variant. The accumulator register is read and then
     * overwritten by the instruction, hence "+r" on the local copy of t. */
    long acc = t;

    __ASM volatile("kmsxda %0, %1, %2" : "+r"(acc) : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.51.2. KMSXDA ===== */

/* ===== Inline Function Start for 3.52. KMSR64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
 * \brief KMSR64 (Signed Multiply and Saturating Subtract from 64-Bit Data)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * KMSR64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit signed elements in two registers and subtract the 64-bit multiplication
 * results from the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is
 * saturated to the Q63 range and written back to the pair of registers (RV32) or the register (RV64).
 *
 * **RV32 Description**:\n
 * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It
 * subtracts the 64-bit multiplication result from the 64-bit signed data of an even/odd pair of registers
 * specified by Rd(4,1) with unlimited precision. If the 64-bit subtraction result is beyond the Q63
 * number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The saturated
 * result is written back to the even/odd pair of registers specified by Rd(4,1).
 * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It
 * subtracts the 64-bit multiplication results from the 64-bit signed data in Rd with unlimited
 * precision. If the 64-bit subtraction result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is
 * saturated to the range and the OV bit is set to 1. The saturated result is written back to Rd.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * result = R[t_H].R[t_L] - (Rs1 * Rs2);
 * if (result > (2^63)-1) {
 *   result = (2^63)-1; OV = 1;
 * } else if (result < -2^63) {
 *   result = -2^63; OV = 1;
 * }
 * R[t_H].R[t_L] = result;
 * RV64:
 * // `result` has unlimited precision
 * result = Rd - (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]);
 * if (result > (2^63)-1) {
 *   result = (2^63)-1; OV = 1;
 * } else if (result < -2^63) {
 *   result = -2^63; OV = 1;
 * }
 * Rd = result;
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_KMSR64(long long t, long a, long b)
{
    /* 64-bit accumulator: "+r" makes %0 both the minuend and the destination. */
    long long acc = t;

    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("kmsr64 %0, %1, %2"
                   : "+r"(acc)          /* %0: Rd, read-modify-write */
                   : "r"(a), "r"(b));   /* %1: Rs1, %2: Rs2 */
    return acc;
}
/* ===== Inline Function End for 3.52. KMSR64 ===== */

/* ===== Inline Function Start for 3.53. KSLLW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief KSLLW (Saturating Shift Left Logical for Word)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KSLLW Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do logical left shift operation with saturation on a 32-bit word. The shift amount is a
 * variable from a GPR.
 *
 * **Description**:\n
 * The first word data in Rs1 is left-shifted logically. The shifted out bits are filled with
 * zero and the shift amount is specified by the low-order 5-bits of the value in the Rs2 register. Any
 * shifted value greater than 2^31-1 is saturated to 2^31-1. Any shifted value smaller than -2^31 is saturated
 * to -2^31. And the saturated result is sign-extended and written to Rd. If any saturation is performed,
 * set OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[4:0];
 * res[(31+sa):0] = Rs1.W[0] << sa;
 * if (res > (2^31)-1) {
 *   res = 0x7fffffff; OV = 1;
 * } else if (res < -2^31) {
 *   res = 0x80000000; OV = 1;
 * }
 * Rd[31:0] = res[31:0]; // RV32
 * Rd[63:0] = SE(res[31:0]); // RV64
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KSLLW(long a, unsigned int b)
{
    /* Destination register value; written only by the asm statement below. */
    long rd;

    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("ksllw %0, %1, %2"
                   : "=r"(rd)           /* %0: Rd */
                   : "r"(a), "r"(b));   /* %1: Rs1 (word to shift), %2: Rs2 (shift amount) */
    return rd;
}
/* ===== Inline Function End for 3.53. KSLLW ===== */

/* ===== Inline Function Start for 3.54. KSLLIW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief KSLLIW (Saturating Shift Left Logical Immediate for Word)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KSLLIW Rd, Rs1, imm5u
 * ~~~
 *
 * **Purpose**:\n
 * Do logical left shift operation with saturation on a 32-bit word. The shift amount is an
 * immediate value.
 *
 * **Description**:\n
 * The first word data in Rs1 is left-shifted logically. The shifted out bits are filled with
 * zero and the shift amount is specified by the imm5u constant. Any shifted value greater than 2^31-1 is
 * saturated to 2^31-1. Any shifted value smaller than -2^31 is saturated to -2^31. And the saturated result is
 * sign-extended and written to Rd. If any saturation is performed, set OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm5u;
 * res[(31+sa):0] = Rs1.W[0] << sa;
 * if (res > (2^31)-1) {
 *   res = 0x7fffffff; OV = 1;
 * } else if (res < -2^31) {
 *   res = 0x80000000; OV = 1;
 * }
 * Rd[31:0] = res[31:0]; // RV32
 * Rd[63:0] = SE(res[31:0]); // RV64
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    shift amount; must be a compile-time constant in the range 0..31 (imm5u)
 * \return value stored in long type
 */
#define __RV_KSLLIW(a, b)    \
    ({    \
        /* Evaluate the caller's expression before the output temporary is */    \
        /* declared, and give both temporaries reserved-style names, so an */    \
        /* argument that mentions the caller's own `result` is not captured. */    \
        long __a = (long)(a);    \
        long __result;    \
        /* "K" requires b to be a constant 5-bit unsigned immediate (imm5u). */    \
        __ASM volatile("kslliw %0, %1, %2" : "=r"(__result) : "r"(__a), "K"(b));    \
        __result;    \
    })
/* ===== Inline Function End for 3.54. KSLLIW ===== */

/* ===== Inline Function Start for 3.55. KSLL8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief KSLL8 (SIMD 8-bit Saturating Shift Left Logical)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KSLL8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit elements logical left shift operations with saturation simultaneously. The shift
 * amount is a variable from a GPR.
 *
 * **Description**:\n
 * The 8-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
 * with zero and the shift amount is specified by the low-order 3-bits of the value in the Rs2 register.
 * Any shifted value greater than 2^7-1 is saturated to 2^7-1. Any shifted value smaller than -2^7 is
 * saturated to -2^7. And the saturated results are written to Rd. If any saturation is performed, set OV
 * bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[2:0];
 * if (sa != 0) {
 *   res[(7+sa):0] = Rs1.B[x] << sa;
 *   if (res > (2^7)-1) {
 *     res = 0x7f; OV = 1;
 *   } else if (res < -2^7) {
 *     res = 0x80; OV = 1;
 *   }
 *   Rd.B[x] = res[7:0];
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSLL8(unsigned long a, unsigned int b)
{
    /* Destination register value; written only by the asm statement below. */
    unsigned long rd;

    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("ksll8 %0, %1, %2"
                   : "=r"(rd)           /* %0: Rd */
                   : "r"(a), "r"(b));   /* %1: Rs1 (packed bytes), %2: Rs2 (shift amount) */
    return rd;
}
/* ===== Inline Function End for 3.55. KSLL8 ===== */

/* ===== Inline Function Start for 3.56. KSLLI8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief KSLLI8 (SIMD 8-bit Saturating Shift Left Logical Immediate)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KSLLI8 Rd, Rs1, imm3u
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit elements logical left shift operations with saturation simultaneously. The shift
 * amount is an immediate value.
 *
 * **Description**:\n
 * The 8-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
 * with zero and the shift amount is specified by the imm3u constant. Any shifted value greater than
 * 2^7-1 is saturated to 2^7-1. Any shifted value smaller than -2^7 is saturated to -2^7. And the saturated
 * results are written to Rd. If any saturation is performed, set OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm3u[2:0];
 * if (sa != 0) {
 *   res[(7+sa):0] = Rs1.B[x] << sa;
 *   if (res > (2^7)-1) {
 *     res = 0x7f; OV = 1;
 *   } else if (res < -2^7) {
 *     res = 0x80; OV = 1;
 *   }
 *   Rd.B[x] = res[7:0];
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    shift amount; must be a compile-time constant in the range 0..7 (imm3u)
 * \return value stored in unsigned long type
 */
#define __RV_KSLLI8(a, b)    \
    ({    \
        /* Evaluate the caller's expression before the output temporary is */    \
        /* declared, and give both temporaries reserved-style names, so an */    \
        /* argument that mentions the caller's own `result` is not captured. */    \
        unsigned long __a = (unsigned long)(a);    \
        unsigned long __result;    \
        /* "K" only enforces a constant 5-bit unsigned immediate; the */    \
        /* instruction syntax takes imm3u, so b must also be within 0..7. */    \
        __ASM volatile("kslli8 %0, %1, %2" : "=r"(__result) : "r"(__a), "K"(b));    \
        __result;    \
    })
/* ===== Inline Function End for 3.56. KSLLI8 ===== */

/* ===== Inline Function Start for 3.57. KSLL16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief KSLL16 (SIMD 16-bit Saturating Shift Left Logical)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KSLL16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit elements logical left shift operations with saturation simultaneously. The shift
 * amount is a variable from a GPR.
 *
 * **Description**:\n
 * The 16-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
 * with zero and the shift amount is specified by the low-order 4-bits of the value in the Rs2 register.
 * Any shifted value greater than 2^15-1 is saturated to 2^15-1. Any shifted value smaller than -2^15 is
 * saturated to -2^15. And the saturated results are written to Rd. If any saturation is performed, set OV
 * bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[3:0];
 * if (sa != 0) {
 *   res[(15+sa):0] = Rs1.H[x] << sa;
 *   if (res > (2^15)-1) {
 *     res = 0x7fff; OV = 1;
 *   } else if (res < -2^15) {
 *     res = 0x8000; OV = 1;
 *   }
 *   Rd.H[x] = res[15:0];
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSLL16(unsigned long a, unsigned int b)
{
    /* Destination register value; written only by the asm statement below. */
    unsigned long rd;

    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("ksll16 %0, %1, %2"
                   : "=r"(rd)           /* %0: Rd */
                   : "r"(a), "r"(b));   /* %1: Rs1 (packed halfwords), %2: Rs2 (shift amount) */
    return rd;
}
/* ===== Inline Function End for 3.57. KSLL16 ===== */

/* ===== Inline Function Start for 3.58. KSLLI16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief KSLLI16 (SIMD 16-bit Saturating Shift Left Logical Immediate)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KSLLI16 Rd, Rs1, imm4u
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit elements logical left shift operations with saturation simultaneously. The shift
 * amount is an immediate value.
 *
 * **Description**:\n
 * The 16-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
 * with zero and the shift amount is specified by the imm4u constant. Any shifted value greater than
 * 2^15-1 is saturated to 2^15-1. Any shifted value smaller than -2^15 is saturated to -2^15. And the saturated
 * results are written to Rd. If any saturation is performed, set OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm4u[3:0];
 * if (sa != 0) {
 *   res[(15+sa):0] = Rs1.H[x] << sa;
 *   if (res > (2^15)-1) {
 *     res = 0x7fff; OV = 1;
 *   } else if (res < -2^15) {
 *     res = 0x8000; OV = 1;
 *   }
 *   Rd.H[x] = res[15:0];
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    shift amount; must be a compile-time constant in the range 0..15 (imm4u)
 * \return value stored in unsigned long type
 */
#define __RV_KSLLI16(a, b)    \
    ({    \
        /* Evaluate the caller's expression before the output temporary is */    \
        /* declared, and give both temporaries reserved-style names, so an */    \
        /* argument that mentions the caller's own `result` is not captured. */    \
        unsigned long __a = (unsigned long)(a);    \
        unsigned long __result;    \
        /* "K" only enforces a constant 5-bit unsigned immediate; the */    \
        /* instruction syntax takes imm4u, so b must also be within 0..15. */    \
        __ASM volatile("kslli16 %0, %1, %2" : "=r"(__result) : "r"(__a), "K"(b));    \
        __result;    \
    })
/* ===== Inline Function End for 3.58. KSLLI16 ===== */

/* ===== Inline Function Start for 3.59.1. KSLRA8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief KSLRA8 (SIMD 8-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KSLRA8 Rd, Rs1, Rs2
 * KSLRA8.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit elements logical left (positive) or arithmetic right (negative) shift operation with
 * Q7 saturation for the left shift. The `.u` form performs additional rounding up operations for the
 * right shift.
 *
 * **Description**:\n
 * The 8-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
 * based on the value of Rs2[3:0]. Rs2[3:0] is in the signed range of [-2^3, 2^3-1]. A positive Rs2[3:0] means
 * logical left shift and a negative Rs2[3:0] means arithmetic right shift. The shift amount is the
 * absolute value of Rs2[3:0]. However, the behavior of `Rs2[3:0]==-2^3 (0x8)` is defined to be
 * equivalent to the behavior of `Rs2[3:0]==-(2^3-1) (0x9)`.
 * The left-shifted results are saturated to the 8-bit signed integer range of [-2^7, 2^7-1]. For the `.u` form
 * of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
 * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
 * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:4] will not affect
 * this instruction.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs2[3:0] < 0) {
 *   sa = -Rs2[3:0];
 *   sa = (sa == 8)? 7 : sa;
 *   if (`.u` form) {
 *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
 *     Rd.B[x] = res[7:0];
 *   } else {
 *     Rd.B[x] = SE8(Rs1.B[x][7:sa]);
 *   }
 * } else {
 *   sa = Rs2[2:0];
 *   res[(7+sa):0] = Rs1.B[x] <<(logic) sa;
 *   if (res > (2^7)-1) {
 *     res[7:0] = 0x7f; OV = 1;
 *   } else if (res < -2^7) {
 *     res[7:0] = 0x80; OV = 1;
 *   }
 *   Rd.B[x] = res[7:0];
 * }
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSLRA8(unsigned long a, int b)
{
    /* Destination register value; written only by the asm statement below. */
    unsigned long rd;

    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("kslra8 %0, %1, %2"
                   : "=r"(rd)           /* %0: Rd */
                   : "r"(a), "r"(b));   /* %1: Rs1 (packed bytes), %2: Rs2 (signed shift amount) */
    return rd;
}
/* ===== Inline Function End for 3.59.1. KSLRA8 ===== */

/* ===== Inline Function Start for 3.59.2. KSLRA8.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief KSLRA8.u (SIMD 8-bit Shift Left Logical with Saturation or Rounding Shift Right Arithmetic)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KSLRA8 Rd, Rs1, Rs2
 * KSLRA8.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit elements logical left (positive) or arithmetic right (negative) shift operation with
 * Q7 saturation for the left shift. The `.u` form performs additional rounding up operations for the
 * right shift.
 *
 * **Description**:\n
 * The 8-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
 * based on the value of Rs2[3:0]. Rs2[3:0] is in the signed range of [-2^3, 2^3-1]. A positive Rs2[3:0] means
 * logical left shift and a negative Rs2[3:0] means arithmetic right shift. The shift amount is the
 * absolute value of Rs2[3:0]. However, the behavior of `Rs2[3:0]==-2^3 (0x8)` is defined to be
 * equivalent to the behavior of `Rs2[3:0]==-(2^3-1) (0x9)`.
 * The left-shifted results are saturated to the 8-bit signed integer range of [-2^7, 2^7-1]. For the `.u` form
 * of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
 * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
 * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:4] will not affect
 * this instruction.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs2[3:0] < 0) {
 *   sa = -Rs2[3:0];
 *   sa = (sa == 8)? 7 : sa;
 *   if (`.u` form) {
 *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
 *     Rd.B[x] = res[7:0];
 *   } else {
 *     Rd.B[x] = SE8(Rs1.B[x][7:sa]);
 *   }
 * } else {
 *   sa = Rs2[2:0];
 *   res[(7+sa):0] = Rs1.B[x] <<(logic) sa;
 *   if (res > (2^7)-1) {
 *     res[7:0] = 0x7f; OV = 1;
 *   } else if (res < -2^7) {
 *     res[7:0] = 0x80; OV = 1;
 *   }
 *   Rd.B[x] = res[7:0];
 * }
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSLRA8_U(unsigned long a, int b)
{
    /* Destination register value; written only by the asm statement below. */
    unsigned long rd;

    /* Rounding (`.u`) form of kslra8. */
    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("kslra8.u %0, %1, %2"
                   : "=r"(rd)           /* %0: Rd */
                   : "r"(a), "r"(b));   /* %1: Rs1 (packed bytes), %2: Rs2 (signed shift amount) */
    return rd;
}
/* ===== Inline Function End for 3.59.2. KSLRA8.u ===== */

/* ===== Inline Function Start for 3.60.1. KSLRA16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief KSLRA16 (SIMD 16-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KSLRA16 Rd, Rs1, Rs2
 * KSLRA16.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit elements logical left (positive) or arithmetic right (negative) shift operation with
 * Q15 saturation for the left shift. The `.u` form performs additional rounding up operations for the
 * right shift.
 *
 * **Description**:\n
 * The 16-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
 * based on the value of Rs2[4:0]. Rs2[4:0] is in the signed range of [-2^4, 2^4-1]. A positive Rs2[4:0] means
 * logical left shift and a negative Rs2[4:0] means arithmetic right shift. The shift amount is the
 * absolute value of Rs2[4:0]. However, the behavior of `Rs2[4:0]==-2^4 (0x10)` is defined to be
 * equivalent to the behavior of `Rs2[4:0]==-(2^4-1) (0x11)`.
 * The left-shifted results are saturated to the 16-bit signed integer range of [-2^15, 2^15-1]. For the `.u`
 * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
 * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
 * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:5] will not affect
 * this instruction.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs2[4:0] < 0) {
 *   sa = -Rs2[4:0];
 *   sa = (sa == 16)? 15 : sa;
 *   if (`.u` form) {
 *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
 *     Rd.H[x] = res[15:0];
 *   } else {
 *     Rd.H[x] = SE16(Rs1.H[x][15:sa]);
 *   }
 * } else {
 *   sa = Rs2[3:0];
 *   res[(15+sa):0] = Rs1.H[x] <<(logic) sa;
 *   if (res > (2^15)-1) {
 *     res[15:0] = 0x7fff; OV = 1;
 *   } else if (res < -2^15) {
 *     res[15:0] = 0x8000; OV = 1;
 *   }
 *   Rd.H[x] = res[15:0];
 * }
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSLRA16(unsigned long a, int b)
{
    /* Destination register value; written only by the asm statement below. */
    unsigned long rd;

    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("kslra16 %0, %1, %2"
                   : "=r"(rd)           /* %0: Rd */
                   : "r"(a), "r"(b));   /* %1: Rs1 (packed halfwords), %2: Rs2 (signed shift amount) */
    return rd;
}
/* ===== Inline Function End for 3.60.1. KSLRA16 ===== */

/* ===== Inline Function Start for 3.60.2. KSLRA16.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief KSLRA16.u (SIMD 16-bit Shift Left Logical with Saturation or Rounding Shift Right Arithmetic)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KSLRA16 Rd, Rs1, Rs2
 * KSLRA16.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit elements logical left (positive) or arithmetic right (negative) shift operation with
 * Q15 saturation for the left shift. The `.u` form performs additional rounding up operations for the
 * right shift.
 *
 * **Description**:\n
 * The 16-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
 * based on the value of Rs2[4:0]. Rs2[4:0] is in the signed range of [-2^4, 2^4-1]. A positive Rs2[4:0] means
 * logical left shift and a negative Rs2[4:0] means arithmetic right shift. The shift amount is the
 * absolute value of Rs2[4:0]. However, the behavior of `Rs2[4:0]==-2^4 (0x10)` is defined to be
 * equivalent to the behavior of `Rs2[4:0]==-(2^4-1) (0x11)`.
 * The left-shifted results are saturated to the 16-bit signed integer range of [-2^15, 2^15-1]. For the `.u`
 * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
 * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
 * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:5] will not affect
 * this instruction.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs2[4:0] < 0) {
 *   sa = -Rs2[4:0];
 *   sa = (sa == 16)? 15 : sa;
 *   if (`.u` form) {
 *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
 *     Rd.H[x] = res[15:0];
 *   } else {
 *     Rd.H[x] = SE16(Rs1.H[x][15:sa]);
 *   }
 * } else {
 *   sa = Rs2[3:0];
 *   res[(15+sa):0] = Rs1.H[x] <<(logic) sa;
 *   if (res > (2^15)-1) {
 *     res[15:0] = 0x7fff; OV = 1;
 *   } else if (res < -2^15) {
 *     res[15:0] = 0x8000; OV = 1;
 *   }
 *   Rd.H[x] = res[15:0];
 * }
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSLRA16_U(unsigned long a, int b)
{
    /* Destination register value; written only by the asm statement below. */
    unsigned long rd;

    /* Rounding (`.u`) form of kslra16. */
    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("kslra16.u %0, %1, %2"
                   : "=r"(rd)           /* %0: Rd */
                   : "r"(a), "r"(b));   /* %1: Rs1 (packed halfwords), %2: Rs2 (signed shift amount) */
    return rd;
}
/* ===== Inline Function End for 3.60.2. KSLRA16.u ===== */

/* ===== Inline Function Start for 3.61. KSLRAW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief KSLRAW (Shift Left Logical with Q31 Saturation or Shift Right Arithmetic)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KSLRAW Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Perform a logical left (positive) or arithmetic right (negative) shift operation with Q31
 * saturation for the left shift on a 32-bit data.
 *
 * **Description**:\n
 * The lower 32-bit content of Rs1 is left-shifted logically or right-shifted arithmetically
 * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-2^5, 2^5-1]. A positive Rs2[5:0] means
 * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the
 * absolute value of Rs2[5:0] clamped to the actual shift range of [0, 31].
 * The left-shifted result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. After the shift
 * operation, the final result is bit-31 sign-extended and written to Rd. If any saturation happens, this
 * instruction sets the OV flag. The value of Rs2[31:6] will not affect the operation of this instruction.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs2[5:0] < 0) {
 *   sa = -Rs2[5:0];
 *   sa = (sa == 32)? 31 : sa;
 *   res[31:0] = Rs1.W[0] >>(arith) sa;
 * } else {
 *   sa = Rs2[5:0];
 *   tmp = Rs1.W[0] <<(logic) sa;
 *   if (tmp > (2^31)-1) {
 *     res[31:0] = (2^31)-1;
 *     OV = 1;
 *   } else if (tmp < -2^31) {
 *     res[31:0] = -2^31;
 *     OV = 1;
 *   } else {
 *     res[31:0] = tmp[31:0];
 *   }
 * }
 * Rd = res[31:0]; // RV32
 * Rd = SE64(res[31:0]); // RV64
 * ~~~
 *
 * \param [in]  a    int type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KSLRAW(int a, int b)
{
    /* Destination register value; written only by the asm statement below. */
    long rd;

    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("kslraw %0, %1, %2"
                   : "=r"(rd)           /* %0: Rd */
                   : "r"(a), "r"(b));   /* %1: Rs1 (32-bit data), %2: Rs2 (signed shift amount) */
    return rd;
}
/* ===== Inline Function End for 3.61. KSLRAW ===== */

/* ===== Inline Function Start for 3.62. KSLRAW.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief KSLRAW.u (Shift Left Logical with Q31 Saturation or Rounding Shift Right Arithmetic)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KSLRAW.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Perform a logical left (positive) or arithmetic right (negative) shift operation with Q31
 * saturation for the left shift and a rounding up operation for the right shift on a 32-bit data.
 *
 * **Description**:\n
 * The lower 32-bit content of Rs1 is left-shifted logically or right-shifted arithmetically
 * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-2^5, 2^5-1]. A positive Rs2[5:0] means
 * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the
 * absolute value of Rs2[5:0] clamped to the actual shift range of [0, 31].
 * The left-shifted result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. The right-shifted
 * result is added a 1 to the most significant discarded bit position for rounding effect. After the shift,
 * saturation, or rounding, the final result is bit-31 sign-extended and written to Rd. If any saturation
 * happens, this instruction sets the OV flag. The value of Rs2[31:6] will not affect the operation of this
 * instruction.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs2[5:0] < 0) {
 *   sa = -Rs2[5:0];
 *   sa = (sa == 32)? 31 : sa;
 *   res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1;
 *   rst[31:0] = res[31:0];
 * } else {
 *   sa = Rs2[5:0];
 *   tmp = Rs1.W[0] <<(logic) sa;
 *   if (tmp > (2^31)-1) {
 *     rst[31:0] = (2^31)-1;
 *     OV = 1;
 *   } else if (tmp < -2^31) {
 *     rst[31:0] = -2^31;
 *     OV = 1;
 *   } else {
 *     rst[31:0] = tmp[31:0];
 *   }
 * }
 * Rd = rst[31:0]; // RV32
 * Rd = SE64(rst[31:0]); // RV64
 * ~~~
 *
 * \param [in]  a    int type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KSLRAW_U(int a, int b)
{
    /* Destination register value; written only by the asm statement below. */
    long rd;

    /* Rounding (`.u`) form of kslraw. */
    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("kslraw.u %0, %1, %2"
                   : "=r"(rd)           /* %0: Rd */
                   : "r"(a), "r"(b));   /* %1: Rs1 (32-bit data), %2: Rs2 (signed shift amount) */
    return rd;
}
/* ===== Inline Function End for 3.62. KSLRAW.u ===== */

/* ===== Inline Function Start for 3.63. KSTAS16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief KSTAS16 (SIMD 16-bit Signed Saturating Straight Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KSTAS16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element saturating addition and 16-bit signed integer element
 * saturating subtraction in a 32-bit chunk simultaneously. Operands are from corresponding
 * positions in 32-bit chunks.
 *
 * **Description**:\n
 * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in
 * Rs1 with the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2; at the same time, it
 * subtracts the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2 from the 16-bit signed
 * integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number
 * range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated
 * results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit chunks in Rd for
 * subtraction.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[x][31:16] + Rs2.W[x][31:16];
 * res2 = Rs1.W[x][15:0] - Rs2.W[x][15:0];
 * for (res in [res1, res2]) {
 *   if (res > (2^15)-1) {
 *     res = (2^15)-1;
 *     OV = 1;
 *   } else if (res < -2^15) {
 *     res = -2^15;
 *     OV = 1;
 *   }
 * }
 * Rd.W[x][31:16] = res1;
 * Rd.W[x][15:0] = res2;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSTAS16(unsigned long a, unsigned long b)
{
    /* Destination register value; written only by the asm statement below. */
    unsigned long rd;

    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("kstas16 %0, %1, %2"
                   : "=r"(rd)           /* %0: Rd */
                   : "r"(a), "r"(b));   /* %1: Rs1, %2: Rs2 (packed halfwords) */
    return rd;
}
/* ===== Inline Function End for 3.63. KSTAS16 ===== */

/* ===== Inline Function Start for 3.64. KSTSA16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief KSTSA16 (SIMD 16-bit Signed Saturating Straight Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KSTSA16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element saturating subtraction and 16-bit signed integer element
 * saturating addition in a 32-bit chunk simultaneously. Operands are from corresponding positions in
 * 32-bit chunks.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit signed integer element in [31:16] of 32-bit chunks
 * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1; at the same time, it
 * adds the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2 with the 16-bit signed integer
 * element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number range (-2^15
 * <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated results are
 * written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of 32-bit chunks in Rd for
 * addition.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[x][31:16] - Rs2.W[x][31:16];
 * res2 = Rs1.W[x][15:0] + Rs2.W[x][15:0];
 * for (res in [res1, res2]) {
 *   if (res > (2^15)-1) {
 *     res = (2^15)-1;
 *     OV = 1;
 *   } else if (res < -2^15) {
 *     res = -2^15;
 *     OV = 1;
 *   }
 * }
 * Rd.W[x][31:16] = res1;
 * Rd.W[x][15:0] = res2;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSTSA16(unsigned long a, unsigned long b)
{
    /* Destination register value; written only by the asm statement below. */
    unsigned long rd;

    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("kstsa16 %0, %1, %2"
                   : "=r"(rd)           /* %0: Rd */
                   : "r"(a), "r"(b));   /* %1: Rs1, %2: Rs2 (packed halfwords) */
    return rd;
}
/* ===== Inline Function End for 3.64. KSTSA16 ===== */

/* ===== Inline Function Start for 3.65. KSUB8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
 * \brief KSUB8 (SIMD 8-bit Signed Saturating Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KSUB8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit signed elements saturating subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 8-bit signed integer elements in Rs2 from the 8-bit
 * signed integer elements in Rs1. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <=
 * 2^7-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.B[x] - Rs2.B[x];
 * if (res[x] > (2^7)-1) {
 *   res[x] = (2^7)-1;
 *   OV = 1;
 * } else if (res[x] < -2^7) {
 *   res[x] = -2^7;
 *   OV = 1;
 * }
 * Rd.B[x] = res[x];
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSUB8(unsigned long a, unsigned long b)
{
    /* Destination register value; written only by the asm statement below. */
    unsigned long rd;

    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("ksub8 %0, %1, %2"
                   : "=r"(rd)           /* %0: Rd */
                   : "r"(a), "r"(b));   /* %1: Rs1 (minuend bytes), %2: Rs2 (subtrahend bytes) */
    return rd;
}
/* ===== Inline Function End for 3.65. KSUB8 ===== */

/* ===== Inline Function Start for 3.66. KSUB16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief KSUB16 (SIMD 16-bit Signed Saturating Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KSUB16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer elements saturating subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit signed integer elements in Rs2 from the 16-bit
 * signed integer elements in Rs1. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <=
 * 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.H[x] - Rs2.H[x];
 * if (res[x] > (2^15)-1) {
 *   res[x] = (2^15)-1;
 *   OV = 1;
 * } else if (res[x] < -2^15) {
 *   res[x] = -2^15;
 *   OV = 1;
 * }
 * Rd.H[x] = res[x];
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSUB16(unsigned long a, unsigned long b)
{
    /* Destination register value; written only by the asm statement below. */
    unsigned long rd;

    /* Kept volatile: the instruction may set the OV flag when it saturates. */
    __ASM volatile("ksub16 %0, %1, %2"
                   : "=r"(rd)           /* %0: Rd */
                   : "r"(a), "r"(b));   /* %1: Rs1 (minuend halfwords), %2: Rs2 (subtrahend halfwords) */
    return rd;
}
/* ===== Inline Function End for 3.66. KSUB16 ===== */

/* ===== Inline Function Start for 3.67. KSUB64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
 * \brief KSUB64 (64-bit Signed Saturating Subtraction)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * KSUB64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Perform a 64-bit signed integer subtraction. The result is saturated to the Q63 range.
 *
 * **RV32 Description**:\n
 * This instruction subtracts the 64-bit signed integer of an even/odd pair of
 * registers specified by Rs2(4,1) from the 64-bit signed integer of an even/odd pair of registers
 * specified by Rs1(4,1). If the 64-bit result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is
 * saturated to the range and the OV bit is set to 1. The saturated result is then written to an even/odd
 * pair of registers specified by Rd(4,1).
 * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
 * register of the pair contains the low 32-bit of the operand.
 *
 * **RV64 Description**:\n
 * This instruction subtracts the 64-bit signed integer of Rs2 from the 64-bit signed
 * integer of Rs1. If the 64-bit result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated
 * to the range and the OV bit is set to 1. The saturated result is then written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
 * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
 * result = R[a_H].R[a_L] - R[b_H].R[b_L];
 * if (result > (2^63)-1) {
 *   result = (2^63)-1; OV = 1;
 * } else if (result < -2^63) {
 *   result = -2^63; OV = 1;
 * }
 * R[t_H].R[t_L] = result;
 * RV64:
 * result = Rs1 - Rs2;
 * if (result > (2^63)-1) {
 *   result = (2^63)-1; OV = 1;
 * } else if (result < -2^63) {
 *   result = -2^63; OV = 1;
 * }
 * Rd = result;
 * ~~~
 *
 * \param [in]  a    long long type of value stored in a
 * \param [in]  b    long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_KSUB64(long long a, long long b)
{
    /* 64-bit destination operand (%0); a is the minuend, b the subtrahend. */
    long long rd;

    __ASM volatile("ksub64 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.67. KSUB64 ===== */

/* ===== Inline Function Start for 3.68. KSUBH ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
 * \brief KSUBH (Signed Subtraction with Q15 Saturation)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KSUBH Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Subtract the signed lower 32-bit content of two registers with Q15 saturation.
 *
 * **Description**:\n
 * The signed lower 32-bit content of Rs2 is subtracted from the signed lower 32-bit
 * content of Rs1. And the result is saturated to the 16-bit signed integer range of [-2^15, 2^15-1] and then
 * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
 *
 * **Operations**:\n
 * ~~~
 * tmp = Rs1.W[0] - Rs2.W[0];
 * if (tmp > (2^15)-1) {
 *   res = (2^15)-1;
 *   OV = 1;
 * } else if (tmp < -2^15) {
 *   res = -2^15;
 *   OV = 1;
 * } else {
 *   res = tmp;
 * }
 * Rd = SE(res[15:0]);
 * ~~~
 *
 * \param [in]  a    int type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KSUBH(int a, int b)
{
    /* Destination operand (%0), returned as a long; a and b are the sources. */
    long rd;

    __ASM volatile("ksubh %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.68. KSUBH ===== */

/* ===== Inline Function Start for 3.69. KSUBW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief KSUBW (Signed Subtraction with Q31 Saturation)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * KSUBW Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Subtract the signed lower 32-bit content of two registers with Q31 saturation.
 *
 * **Description**:\n
 * The signed lower 32-bit content of Rs2 is subtracted from the signed lower 32-bit
 * content of Rs1. And the result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1] and then
 * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
 *
 * **Operations**:\n
 * ~~~
 * tmp = Rs1.W[0] - Rs2.W[0];
 * if (tmp > (2^31)-1) {
 *   res = (2^31)-1;
 *   OV = 1;
 * } else if (tmp < -2^31) {
 *   res = -2^31;
 *   OV = 1;
 * } else {
 *   res = tmp;
 * }
 * Rd = res[31:0]; // RV32
 * Rd = SE(res[31:0]); // RV64
 * ~~~
 *
 * \param [in]  a    int type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KSUBW(int a, int b)
{
    /* Destination operand (%0), returned as a long; a and b are the sources. */
    long rd;

    __ASM volatile("ksubw %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.69. KSUBW ===== */

/* ===== Inline Function Start for 3.70.1. KWMMUL ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
 * \brief KWMMUL (SIMD Saturating MSW Signed Multiply Word & Double)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KWMMUL Rd, Rs1, Rs2
 * KWMMUL.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of two registers, shift the results left 1-bit,
 * saturate, and write the most significant 32-bit results to a register. The `.u` form additionally
 * rounds up the multiplication results from the most significant discarded bit.
 *
 * **Description**:\n
 * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2. It then shifts
 * the multiplication results one bit to the left and takes the most significant 32-bit results. If the
 * shifted result is greater than 2^31-1, it is saturated to 2^31-1 and the OV flag is set to 1. The final element
 * result is written to Rd. The 32-bit elements of Rs1 and Rs2 are treated as signed integers. The `.u`
 * form of the instruction additionally rounds up the 64-bit multiplication results by adding a 1 to bit
 * 30 before the shift and saturation operations.
 *
 * **Operations**:\n
 * ~~~
 * if ((0x80000000 != Rs1.W[x]) | (0x80000000 != Rs2.W[x])) {
 *   Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
 *   if (`.u` form) {
 *     Round[x][33:0] = Mres[x][63:30] + 1;
 *     Rd.W[x] = Round[x][32:1];
 *   } else {
 *     Rd.W[x] = Mres[x][62:31];
 *   }
 * } else {
 *   Rd.W[x] = 0x7fffffff;
 *   OV = 1;
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KWMMUL(long a, long b)
{
    /* Destination operand (%0); a and b are the two multiplicands. */
    long rd;

    __ASM volatile("kwmmul %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.70.1. KWMMUL ===== */

/* ===== Inline Function Start for 3.70.2. KWMMUL.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
 * \brief KWMMUL.u (SIMD Saturating MSW Signed Multiply Word & Double with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * KWMMUL Rd, Rs1, Rs2
 * KWMMUL.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of two registers, shift the results left 1-bit,
 * saturate, and write the most significant 32-bit results to a register. The `.u` form additionally
 * rounds up the multiplication results from the most significant discarded bit.
 *
 * **Description**:\n
 * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2. It then shifts
 * the multiplication results one bit to the left and takes the most significant 32-bit results. If the
 * shifted result is greater than 2^31-1, it is saturated to 2^31-1 and the OV flag is set to 1. The final element
 * result is written to Rd. The 32-bit elements of Rs1 and Rs2 are treated as signed integers. The `.u`
 * form of the instruction additionally rounds up the 64-bit multiplication results by adding a 1 to bit
 * 30 before the shift and saturation operations.
 *
 * **Operations**:\n
 * ~~~
 * if ((0x80000000 != Rs1.W[x]) | (0x80000000 != Rs2.W[x])) {
 *   Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
 *   if (`.u` form) {
 *     Round[x][33:0] = Mres[x][63:30] + 1;
 *     Rd.W[x] = Round[x][32:1];
 *   } else {
 *     Rd.W[x] = Mres[x][62:31];
 *   }
 * } else {
 *   Rd.W[x] = 0x7fffffff;
 *   OV = 1;
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KWMMUL_U(long a, long b)
{
    /* Destination operand (%0) of the `.u` form; a and b are the multiplicands. */
    long rd;

    __ASM volatile("kwmmul.u %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.70.2. KWMMUL.u ===== */

/* ===== Inline Function Start for 3.71. MADDR32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
 * \brief MADDR32 (Multiply and Add to 32-Bit Word)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * MADDR32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit contents of two registers and add the lower 32-bit multiplication result
 * to the 32-bit content of a destination register. Write the final result back to the destination register.
 *
 * **Description**:\n
 * This instruction multiplies the lower 32-bit content of Rs1 with that of Rs2. It adds the
 * lower 32-bit multiplication result to the lower 32-bit content of Rd and writes the final result (RV32)
 * or sign-extended result (RV64) back to Rd. The contents of Rs1 and Rs2 can be either signed or
 * unsigned integers.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * Mresult = Rs1 * Rs2;
 * Rd = Rd + Mresult.W[0];
 * RV64:
 * Mresult = Rs1.W[0] * Rs2.W[0];
 * tres[31:0] = Rd.W[0] + Mresult.W[0];
 * Rd = SE64(tres[31:0]);
 * ~~~
 *
 * \param [in]  t    unsigned long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_MADDR32(unsigned long t, unsigned long a, unsigned long b)
{
    /* Operand %0 is read-write ("+r"): it starts as t and carries the result back. */
    unsigned long acc = t;

    __ASM volatile("maddr32 %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.71. MADDR32 ===== */

/* ===== Inline Function Start for 3.72. MAXW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
 * \brief MAXW (32-bit Signed Word Maximum)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * MAXW Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Get the larger value from the 32-bit contents of two general registers.
 *
 * **Description**:\n
 * This instruction compares two signed 32-bit integers stored in Rs1 and Rs2, picks the
 * larger value as the result, and writes the result to Rd.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs1.W[0] >= Rs2.W[0]) {
 *   Rd = SE(Rs1.W[0]);
 * } else {
 *   Rd = SE(Rs2.W[0]);
 * }
 * ~~~
 *
 * \param [in]  a    int type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_MAXW(int a, int b)
{
    /* Destination operand (%0), returned as a long; a and b are compared. */
    long rd;

    __ASM volatile("maxw %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.72. MAXW ===== */

/* ===== Inline Function Start for 3.73. MINW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
 * \brief MINW (32-bit Signed Word Minimum)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * MINW Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Get the smaller value from the 32-bit contents of two general registers.
 *
 * **Description**:\n
 * This instruction compares two signed 32-bit integers stored in Rs1 and Rs2, picks the
 * smaller value as the result, and writes the result to Rd.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs1.W[0] >= Rs2.W[0]) { Rd = SE(Rs2.W[0]); } else { Rd = SE(Rs1.W[0]); }
 * ~~~
 *
 * \param [in]  a    int type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_MINW(int a, int b)
{
    /* Destination operand (%0), returned as a long; a and b are compared. */
    long rd;

    __ASM volatile("minw %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.73. MINW ===== */

/* ===== Inline Function Start for 3.74. MSUBR32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
 * \brief MSUBR32 (Multiply and Subtract from 32-Bit Word)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * MSUBR32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit contents of two registers and subtract the lower 32-bit multiplication
 * result from the 32-bit content of a destination register. Write the final result back to the destination
 * register.
 *
 * **Description**:\n
 * This instruction multiplies the lower 32-bit content of Rs1 with that of Rs2, subtracts
 * the lower 32-bit multiplication result from the lower 32-bit content of Rd, then writes the final
 * result (RV32) or sign-extended result (RV64) back to Rd. The contents of Rs1 and Rs2 can be either
 * signed or unsigned integers.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * Mresult = Rs1 * Rs2;
 * Rd = Rd - Mresult.W[0];
 * RV64:
 * Mresult = Rs1.W[0] * Rs2.W[0];
 * tres[31:0] = Rd.W[0] - Mresult.W[0];
 * Rd = SE64(tres[31:0]);
 * ~~~
 *
 * \param [in]  t    unsigned long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_MSUBR32(unsigned long t, unsigned long a, unsigned long b)
{
    /* Operand %0 is read-write ("+r"): it starts as t and carries the result back. */
    unsigned long acc = t;

    __ASM volatile("msubr32 %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.74. MSUBR32 ===== */

/* ===== Inline Function Start for 3.75. MULR64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
 * \brief MULR64 (Multiply Word Unsigned to 64-bit Data)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * MULR64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit unsigned integer contents of two registers and write the 64-bit result.
 *
 * **RV32 Description**:\n
 * This instruction multiplies the 32-bit content of Rs1 with that of Rs2 and writes the 64-bit
 * multiplication result to an even/odd pair of registers containing Rd. Rd(4,1) index d determines the
 * even/odd pair group of the two registers. Specifically, the register pair includes register 2d and
 * 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 * The lower 32-bit contents of Rs1 and Rs2 are treated as unsigned integers.
 *
 * **RV64 Description**:\n
 * This instruction multiplies the lower 32-bit content of Rs1 with that of Rs2 and writes the 64-bit
 * multiplication result to Rd.
 * The lower 32-bit contents of Rs1 and Rs2 are treated as unsigned integers.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * Mresult = CONCAT(1`b0,Rs1) u* CONCAT(1`b0,Rs2);
 * R[Rd(4,1).1(0)][31:0] = Mresult[63:32];
 * R[Rd(4,1).0(0)][31:0] = Mresult[31:0];
 * RV64:
 * Mresult = CONCAT(1`b0,Rs1.W[0]) u* CONCAT(1`b0,Rs2.W[0]);
 * Rd = Mresult[63:0];
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_MULR64(unsigned long a, unsigned long b)
{
    /* 64-bit destination operand (%0); a and b are the two multiplicands. */
    unsigned long long product;

    __ASM volatile("mulr64 %0, %1, %2"
                   : "=r"(product)
                   : "r"(a), "r"(b));
    return product;
}
/* ===== Inline Function End for 3.75. MULR64 ===== */

/* ===== Inline Function Start for 3.76. MULSR64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
 * \brief MULSR64 (Multiply Word Signed to 64-bit Data)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * MULSR64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit signed integer contents of two registers and write the 64-bit result.
 *
 * **RV32 Description**:\n
 * This instruction multiplies the lower 32-bit content of Rs1 with the lower 32-bit content of Rs2 and
 * writes the 64-bit multiplication result to an even/odd pair of registers containing Rd. Rd(4,1) index d
 * determines the even/odd pair group of the two registers. Specifically, the register pair includes
 * register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 * The lower 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **RV64 Description**:\n
 * This instruction multiplies the lower 32-bit content of Rs1 with the lower 32-bit content of Rs2 and
 * writes the 64-bit multiplication result to Rd.
 * The lower 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * Mresult = Ra s* Rb;
 * R[Rd(4,1).1(0)][31:0] = Mresult[63:32];
 * R[Rd(4,1).0(0)][31:0] = Mresult[31:0];
 * RV64:
 * Mresult = Ra.W[0] s* Rb.W[0];
 * Rd = Mresult[63:0];
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_MULSR64(long a, long b)
{
    /* 64-bit destination operand (%0); a and b are the two multiplicands. */
    long long product;

    __ASM volatile("mulsr64 %0, %1, %2"
                   : "=r"(product)
                   : "r"(a), "r"(b));
    return product;
}
/* ===== Inline Function End for 3.76. MULSR64 ===== */

/* ===== Inline Function Start for 3.77. PBSAD ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
 * \brief PBSAD (Parallel Byte Sum of Absolute Difference)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * PBSAD Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Calculate the sum of absolute difference of unsigned 8-bit data elements.
 *
 * **Description**:\n
 * This instruction subtracts the un-signed 8-bit elements of Rs2 from those of Rs1. Then
 * it adds the absolute value of each difference together and writes the result to Rd.
 *
 * **Operations**:\n
 * ~~~
 * absdiff[x] = ABS(Rs1.B[x] - Rs2.B[x]);
 * Rd = SUM(absdiff[x]);
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_PBSAD(unsigned long a, unsigned long b)
{
    /* Destination operand (%0); a and b are the two source operands. */
    unsigned long rd;

    __ASM volatile("pbsad %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.77. PBSAD ===== */

/* ===== Inline Function Start for 3.78. PBSADA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
 * \brief PBSADA (Parallel Byte Sum of Absolute Difference Accum)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * PBSADA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Calculate the sum of absolute difference of four unsigned 8-bit data elements and
 * accumulate it into a register.
 *
 * **Description**:\n
 * This instruction subtracts the un-signed 8-bit elements of Rs2 from those of Rs1. It
 * then adds the absolute value of each difference together along with the content of Rd and writes the
 * accumulated result back to Rd.
 *
 * **Operations**:\n
 * ~~~
 * absdiff[x] = ABS(Rs1.B[x] - Rs2.B[x]);
 * Rd = Rd + SUM(absdiff[x]);
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  t    unsigned long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_PBSADA(unsigned long t, unsigned long a, unsigned long b)
{
    /* Operand %0 is read-write ("+r"): it starts as t and carries the result back. */
    unsigned long acc = t;

    __ASM volatile("pbsada %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.78. PBSADA ===== */

/* ===== Inline Function Start for 3.79.1. PKBB16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK
 * \brief PKBB16 (Pack Two 16-bit Data from Both Bottom Half)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * PKBB16 Rd, Rs1, Rs2
 * PKBT16 Rd, Rs1, Rs2
 * PKTT16 Rd, Rs1, Rs2
 * PKTB16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Pack 16-bit data from 32-bit chunks in two registers.
 * * PKBB16: bottom.bottom
 * * PKBT16: bottom.top
 * * PKTT16: top.top
 * * PKTB16: top.bottom
 *
 * **Description**:\n
 * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to
 * Rd.W[x] [15:0].
 * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
 * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
 * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16
 * for RV32: x=0,
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_PKBB16(unsigned long a, unsigned long b)
{
    /* Destination operand (%0); a is passed as Rs1 and b as Rs2. */
    unsigned long packed;

    __ASM volatile("pkbb16 %0, %1, %2"
                   : "=r"(packed)
                   : "r"(a), "r"(b));
    return packed;
}
/* ===== Inline Function End for 3.79.1. PKBB16 ===== */

/* ===== Inline Function Start for 3.79.2. PKBT16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK
 * \brief PKBT16 (Pack Two 16-bit Data from Bottom and Top Half)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * PKBB16 Rd, Rs1, Rs2
 * PKBT16 Rd, Rs1, Rs2
 * PKTT16 Rd, Rs1, Rs2
 * PKTB16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Pack 16-bit data from 32-bit chunks in two registers.
 * * PKBB16: bottom.bottom
 * * PKBT16: bottom.top
 * * PKTT16: top.top
 * * PKTB16: top.bottom
 *
 * **Description**:\n
 * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to
 * Rd.W[x] [15:0].
 * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
 * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
 * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16
 * for RV32: x=0,
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_PKBT16(unsigned long a, unsigned long b)
{
    /* Destination operand (%0); a is passed as Rs1 and b as Rs2. */
    unsigned long packed;

    __ASM volatile("pkbt16 %0, %1, %2"
                   : "=r"(packed)
                   : "r"(a), "r"(b));
    return packed;
}
/* ===== Inline Function End for 3.79.2. PKBT16 ===== */

/* ===== Inline Function Start for 3.79.3. PKTT16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK
 * \brief PKTT16 (Pack Two 16-bit Data from Both Top Half)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * PKBB16 Rd, Rs1, Rs2
 * PKBT16 Rd, Rs1, Rs2
 * PKTT16 Rd, Rs1, Rs2
 * PKTB16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Pack 16-bit data from 32-bit chunks in two registers.
 * * PKBB16: bottom.bottom
 * * PKBT16: bottom.top
 * * PKTT16: top.top
 * * PKTB16: top.bottom
 *
 * **Description**:\n
 * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to
 * Rd.W[x] [15:0].
 * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
 * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
 * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16
 * for RV32: x=0,
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_PKTT16(unsigned long a, unsigned long b)
{
    /* Destination operand (%0); a is passed as Rs1 and b as Rs2. */
    unsigned long packed;

    __ASM volatile("pktt16 %0, %1, %2"
                   : "=r"(packed)
                   : "r"(a), "r"(b));
    return packed;
}
/* ===== Inline Function End for 3.79.3. PKTT16 ===== */

/* ===== Inline Function Start for 3.79.4. PKTB16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK
 * \brief PKTB16 (Pack Two 16-bit Data from Top and Bottom Half)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * PKBB16 Rd, Rs1, Rs2
 * PKBT16 Rd, Rs1, Rs2
 * PKTT16 Rd, Rs1, Rs2
 * PKTB16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Pack 16-bit data from 32-bit chunks in two registers.
 * * PKBB16: bottom.bottom
 * * PKBT16: bottom.top
 * * PKTT16: top.top
 * * PKTB16: top.bottom
 *
 * **Description**:\n
 * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to
 * Rd.W[x] [15:0].
 * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
 * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
 * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16
 * for RV32: x=0,
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_PKTB16(unsigned long a, unsigned long b)
{
    /* Destination operand (%0); a is passed as Rs1 and b as Rs2. */
    unsigned long packed;

    __ASM volatile("pktb16 %0, %1, %2"
                   : "=r"(packed)
                   : "r"(a), "r"(b));
    return packed;
}
/* ===== Inline Function End for 3.79.4. PKTB16 ===== */

/* ===== Inline Function Start for 3.80. RADD8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
 * \brief RADD8 (SIMD 8-bit Signed Halving Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * RADD8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit signed integer element additions simultaneously. The element results are halved
 * to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 8-bit signed integer elements in Rs1 with the 8-bit signed
 * integer elements in Rs2. The results are first arithmetically right-shifted by 1 bit and then written to
 * Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Rs1 = 0x7F, Rs2 = 0x7F, Rd = 0x7F
 * * Rs1 = 0x80, Rs2 = 0x80, Rd = 0x80
 * * Rs1 = 0x40, Rs2 = 0x80, Rd = 0xE0
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = (Rs1.B[x] + Rs2.B[x]) s>> 1; for RV32: x=3...0, for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RADD8(unsigned long a, unsigned long b)
{
    /* Destination operand (%0); a and b are the two addends. */
    unsigned long rd;

    __ASM volatile("radd8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.80. RADD8 ===== */

/* ===== Inline Function Start for 3.81. RADD16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief RADD16 (SIMD 16-bit Signed Halving Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * RADD16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element additions simultaneously. The results are halved to avoid
 * overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 16-bit signed integer elements in Rs1 with the 16-bit signed
 * integer elements in Rs2. The results are first arithmetically right-shifted by 1 bit and then written to
 * Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Rs1 = 0x7FFF, Rs2 = 0x7FFF, Rd = 0x7FFF
 * * Rs1 = 0x8000, Rs2 = 0x8000, Rd = 0x8000
 * * Rs1 = 0x4000, Rs2 = 0x8000, Rd = 0xE000
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] + Rs2.H[x]) s>> 1; for RV32: x=1...0, for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RADD16(unsigned long a, unsigned long b)
{
    /* Destination operand (%0); a and b are the two addends. */
    unsigned long rd;

    __ASM volatile("radd16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.81. RADD16 ===== */

/* ===== Inline Function Start for 3.82. RADD64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
 * \brief RADD64 (64-bit Signed Halving Addition)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * RADD64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Add two 64-bit signed integers. The result is halved to avoid overflow or saturation.
 *
 * **RV32 Description**:\n
 * This instruction adds the 64-bit signed integer of an even/odd pair of registers
 * specified by Rs1(4,1) with the 64-bit signed integer of an even/odd pair of registers specified by
 * Rs2(4,1). The 64-bit addition result is first arithmetically right-shifted by 1 bit and then written to an
 * even/odd pair of registers specified by Rd(4,1).
 * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
 * pair includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction adds the 64-bit signed integer in Rs1 with the 64-bit signed
 * integer in Rs2. The 64-bit addition result is first arithmetically right-shifted by 1 bit and then
 * written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
 * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
 * R[t_H].R[t_L] = (R[a_H].R[a_L] + R[b_H].R[b_L]) s>> 1;
 * RV64:
 * Rd = (Rs1 + Rs2) s>> 1;
 * ~~~
 *
 * \param [in]  a    long long type of value stored in a
 * \param [in]  b    long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_RADD64(long long a, long long b)
{
    /* 64-bit destination operand (%0); a and b are the two addends. */
    long long rd;

    __ASM volatile("radd64 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.82. RADD64 ===== */

/* ===== Inline Function Start for 3.83. RADDW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
 * \brief RADDW (32-bit Signed Halving Addition)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * RADDW Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Add 32-bit signed integers and the results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the first 32-bit signed integer in Rs1 with the first 32-bit signed
 * integer in Rs2. The result is first arithmetically right-shifted by 1 bit and then sign-extended and
 * written to Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Rs1 = 0x7FFFFFFF, Rs2 = 0x7FFFFFFF, Rd = 0x7FFFFFFF
 * * Rs1 = 0x80000000, Rs2 = 0x80000000, Rd = 0x80000000
 * * Rs1 = 0x40000000, Rs2 = 0x80000000, Rd = 0xE0000000
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * Rd[31:0] = (Rs1[31:0] + Rs2[31:0]) s>> 1;
 * RV64:
 * resw[31:0] = (Rs1[31:0] + Rs2[31:0]) s>> 1;
 * Rd[63:0] = SE(resw[31:0]);
 * ~~~
 *
 * \param [in]  a    int type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_RADDW(int a, int b)
{
    long halved_sum;

    /*
     * raddw rd, rs1, rs2: add the 32-bit signed words, arithmetically shift
     * the sum right by 1 and sign-extend the result into rd.
     */
    __ASM volatile("raddw %0, %1, %2"
                   : "=r"(halved_sum)
                   : "r"(a), "r"(b));

    return halved_sum;
}
/* ===== Inline Function End for 3.83. RADDW ===== */

/* ===== Inline Function Start for 3.84. RCRAS16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief RCRAS16 (SIMD 16-bit Signed Halving Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * RCRAS16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element addition and 16-bit signed integer element subtraction in
 * a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks. The results
 * are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in
 * Rs1 with the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2, and subtracts the 16-bit
 * signed integer element in [31:16] of 32-bit chunks in Rs2 from the 16-bit signed integer element in
 * [15:0] of 32-bit chunks in Rs1. The element results are first arithmetically right-shifted by 1 bit and
 * then written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
 *
 * **Examples**:\n
 * ~~~
 * Please see `RADD16` and `RSUB16` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][15:0]) s>> 1;
 * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][31:16]) s>> 1;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RCRAS16(unsigned long a, unsigned long b)
{
    unsigned long packed;

    /*
     * rcras16 rd, rs1, rs2 (per 32-bit chunk, signed 16-bit elements):
     *   rd[31:16] = (rs1[31:16] + rs2[15:0])  s>> 1
     *   rd[15:0]  = (rs1[15:0]  - rs2[31:16]) s>> 1
     */
    __ASM volatile("rcras16 %0, %1, %2"
                   : "=r"(packed)
                   : "r"(a), "r"(b));

    return packed;
}
/* ===== Inline Function End for 3.84. RCRAS16 ===== */

/* ===== Inline Function Start for 3.85. RCRSA16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief RCRSA16 (SIMD 16-bit Signed Halving Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * RCRSA16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element subtraction and 16-bit signed integer element addition in
 * a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks. The results
 * are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit signed integer element in [15:0] of 32-bit chunks
 * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit
 * signed integer element in [15:0] of 32-bit chunks in Rs1 with the 16-bit signed integer element in
 * [31:16] of 32-bit chunks in Rs2. The two results are first arithmetically right-shifted by 1 bit and
 * then written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
 *
 * **Examples**:\n
 * ~~~
 * Please see `RADD16` and `RSUB16` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][15:0]) s>> 1;
 * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][31:16]) s>> 1;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RCRSA16(unsigned long a, unsigned long b)
{
    unsigned long packed;

    /*
     * rcrsa16 rd, rs1, rs2 (per 32-bit chunk, signed 16-bit elements):
     *   rd[31:16] = (rs1[31:16] - rs2[15:0])  s>> 1
     *   rd[15:0]  = (rs1[15:0]  + rs2[31:16]) s>> 1
     */
    __ASM volatile("rcrsa16 %0, %1, %2"
                   : "=r"(packed)
                   : "r"(a), "r"(b));

    return packed;
}
/* ===== Inline Function End for 3.85. RCRSA16 ===== */

/* ===== Inline Function Start for 3.86. RDOV ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_OV_FLAG_SC
 * \brief RDOV (Read OV flag)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * RDOV Rd  # pseudo mnemonic
 * ~~~
 *
 * **Purpose**:\n
 * This pseudo instruction is an alias to `CSRR Rd, ucode` instruction which maps to the real
 * instruction of `CSRRS Rd, ucode, x0`.
 *
 *
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RDOV(void)
{
    unsigned long ov_flag;

    /* rdov rd: pseudo mnemonic, an alias of `csrr rd, ucode` */
    __ASM volatile("rdov %0"
                   : "=r"(ov_flag));

    return ov_flag;
}
/* ===== Inline Function End for 3.86. RDOV ===== */

/* ===== Inline Function Start for 3.87. RSTAS16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief RSTAS16 (SIMD 16-bit Signed Halving Straight Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * RSTAS16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element addition and 16-bit signed integer element subtraction in
 * a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit chunks. The
 * results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in
 * Rs1 with the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2, and subtracts the 16-bit
 * signed integer element in [15:0] of 32-bit chunks in Rs2 from the 16-bit signed integer element in
 * [15:0] of 32-bit chunks in Rs1. The element results are first arithmetically right-shifted by 1 bit and
 * then written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
 *
 * **Examples**:\n
 * ~~~
 * Please see `RADD16` and `RSUB16` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][31:16]) s>> 1;
 * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][15:0]) s>> 1;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RSTAS16(unsigned long a, unsigned long b)
{
    unsigned long packed;

    /*
     * rstas16 rd, rs1, rs2 (per 32-bit chunk, signed 16-bit elements):
     *   rd[31:16] = (rs1[31:16] + rs2[31:16]) s>> 1
     *   rd[15:0]  = (rs1[15:0]  - rs2[15:0])  s>> 1
     */
    __ASM volatile("rstas16 %0, %1, %2"
                   : "=r"(packed)
                   : "r"(a), "r"(b));

    return packed;
}
/* ===== Inline Function End for 3.87. RSTAS16 ===== */

/* ===== Inline Function Start for 3.88. RSTSA16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief RSTSA16 (SIMD 16-bit Signed Halving Straight Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * RSTSA16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element subtraction and 16-bit signed integer element addition in
 * a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit chunks. The
 * results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit signed integer element in [31:16] of 32-bit chunks
 * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit
 * signed integer element in [15:0] of 32-bit chunks in Rs1 with the 16-bit signed integer element in
 * [15:0] of 32-bit chunks in Rs2. The two results are first arithmetically right-shifted by 1 bit and then
 * written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
 *
 * **Examples**:\n
 * ~~~
 * Please see `RADD16` and `RSUB16` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][31:16]) s>> 1;
 * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][15:0]) s>> 1;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RSTSA16(unsigned long a, unsigned long b)
{
    unsigned long packed;

    /*
     * rstsa16 rd, rs1, rs2 (per 32-bit chunk, signed 16-bit elements):
     *   rd[31:16] = (rs1[31:16] - rs2[31:16]) s>> 1
     *   rd[15:0]  = (rs1[15:0]  + rs2[15:0])  s>> 1
     */
    __ASM volatile("rstsa16 %0, %1, %2"
                   : "=r"(packed)
                   : "r"(a), "r"(b));

    return packed;
}
/* ===== Inline Function End for 3.88. RSTSA16 ===== */

/* ===== Inline Function Start for 3.89. RSUB8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
 * \brief RSUB8 (SIMD 8-bit Signed Halving Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * RSUB8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit signed integer element subtractions simultaneously. The results are halved to
 * avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 8-bit signed integer elements in Rs2 from the 8-bit
 * signed integer elements in Rs1. The results are first arithmetically right-shifted by 1 bit and then
 * written to Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Rs1 = 0x7F, Rs2 = 0x80, Rd = 0x7F
 * * Rs1 = 0x80, Rs2 = 0x7F, Rd = 0x80
 * * Rs1 = 0x80, Rs2 = 0x40, Rd = 0xA0
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = (Rs1.B[x] - Rs2.B[x]) s>> 1;
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RSUB8(unsigned long a, unsigned long b)
{
    unsigned long halved_diff;

    /* rsub8 rd, rs1, rs2: per signed byte, rd.B[x] = (rs1.B[x] - rs2.B[x]) s>> 1 */
    __ASM volatile("rsub8 %0, %1, %2"
                   : "=r"(halved_diff)
                   : "r"(a), "r"(b));

    return halved_diff;
}
/* ===== Inline Function End for 3.89. RSUB8 ===== */

/* ===== Inline Function Start for 3.90. RSUB16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief RSUB16 (SIMD 16-bit Signed Halving Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * RSUB16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element subtractions simultaneously. The results are halved to
 * avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit signed integer elements in Rs2 from the 16-bit
 * signed integer elements in Rs1. The results are first arithmetically right-shifted by 1 bit and then
 * written to Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Rs1 = 0x7FFF, Rs2 = 0x8000, Rd = 0x7FFF
 * * Rs1 = 0x8000, Rs2 = 0x7FFF, Rd = 0x8000
 * * Rs1 = 0x8000, Rs2 = 0x4000, Rd = 0xA000
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] - Rs2.H[x]) s>> 1;
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RSUB16(unsigned long a, unsigned long b)
{
    unsigned long halved_diff;

    /* rsub16 rd, rs1, rs2: per signed halfword, rd.H[x] = (rs1.H[x] - rs2.H[x]) s>> 1 */
    __ASM volatile("rsub16 %0, %1, %2"
                   : "=r"(halved_diff)
                   : "r"(a), "r"(b));

    return halved_diff;
}
/* ===== Inline Function End for 3.90. RSUB16 ===== */

/* ===== Inline Function Start for 3.91. RSUB64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
 * \brief RSUB64 (64-bit Signed Halving Subtraction)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * RSUB64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Perform a 64-bit signed integer subtraction. The result is halved to avoid overflow or
 * saturation.
 *
 * **RV32 Description**:\n
 * This instruction subtracts the 64-bit signed integer of an even/odd pair of
 * registers specified by Rs2(4,1) from the 64-bit signed integer of an even/odd pair of registers
 * specified by Rs1(4,1). The subtraction result is first arithmetically right-shifted by 1 bit and then
 * written to an even/odd pair of registers specified by Rd(4,1).
 * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
 * pair includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction subtracts the 64-bit signed integer in Rs2 from the 64-bit signed
 * integer in Rs1. The 64-bit subtraction result is first arithmetically right-shifted by 1 bit and then
 * written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
 * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
 * R[t_H].R[t_L] = (R[a_H].R[a_L] - R[b_H].R[b_L]) s>> 1;
 * RV64:
 * Rd = (Rs1 - Rs2) s>> 1;
 * ~~~
 *
 * \param [in]  a    long long type of value stored in a
 * \param [in]  b    long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_RSUB64(long long a, long long b)
{
    long long halved_diff;

    /* rsub64 rd, rs1, rs2: rd = (rs1 - rs2) s>> 1 on signed 64-bit operands */
    __ASM volatile("rsub64 %0, %1, %2"
                   : "=r"(halved_diff)
                   : "r"(a), "r"(b));

    return halved_diff;
}
/* ===== Inline Function End for 3.91. RSUB64 ===== */

/* ===== Inline Function Start for 3.92. RSUBW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
 * \brief RSUBW (32-bit Signed Halving Subtraction)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * RSUBW Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Subtract 32-bit signed integers and the result is halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the first 32-bit signed integer in Rs2 from the first 32-bit
 * signed integer in Rs1. The result is first arithmetically right-shifted by 1 bit and then sign-extended
 * and written to Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Rs1 = 0x7FFFFFFF, Rs2 = 0x80000000, Rd = 0x7FFFFFFF
 * * Rs1 = 0x80000000, Rs2 = 0x7FFFFFFF, Rd = 0x80000000
 * * Rs1 = 0x80000000, Rs2 = 0x40000000, Rd = 0xA0000000
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * Rd[31:0] = (Rs1[31:0] - Rs2[31:0]) s>> 1;
 * RV64:
 * resw[31:0] = (Rs1[31:0] - Rs2[31:0]) s>> 1;
 * Rd[63:0] = SE(resw[31:0]);
 * ~~~
 *
 * \param [in]  a    int type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_RSUBW(int a, int b)
{
    long halved_diff;

    /*
     * rsubw rd, rs1, rs2: subtract the 32-bit signed words (rs1 - rs2),
     * arithmetically shift the difference right by 1 and sign-extend it into rd.
     */
    __ASM volatile("rsubw %0, %1, %2"
                   : "=r"(halved_diff)
                   : "r"(a), "r"(b));

    return halved_diff;
}
/* ===== Inline Function End for 3.92. RSUBW ===== */

/* ===== Inline Function Start for 3.93. SCLIP8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
 * \brief SCLIP8 (SIMD 8-bit Signed Clip Value)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SCLIP8 Rd, Rs1, imm3u[2:0]
 * ~~~
 *
 * **Purpose**:\n
 * Limit the 8-bit signed integer elements of a register into a signed range simultaneously.
 *
 * **Description**:\n
 * This instruction limits the 8-bit signed integer elements stored in Rs1 into a signed
 * integer range between 2^imm3u-1 and -2^imm3u, and writes the limited results to Rd. For example, if
 * imm3u is 3, the 8-bit input values should be saturated between 7 and -8. If saturation is performed,
 * set OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.B[x];
 * if (src > (2^imm3u)-1) {
 *   src = (2^imm3u)-1;
 *   OV = 1;
 * } else if (src < -2^imm3u) {
 *   src = -2^imm3u;
 *   OV = 1;
 * }
 * Rd.B[x] = src
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SCLIP8(a, b)    \
    ({    \
        /* Evaluate `a` before the output temporary is declared, so that a */    \
        /* caller expression naming its own `result` is not shadowed here. */    \
        unsigned long __a = (unsigned long)(a);    \
        unsigned long __res;    \
        /* `b` must be a compile-time constant: it is encoded as the immediate. */    \
        __ASM volatile("sclip8 %0, %1, %2" : "=r"(__res) : "r"(__a), "K"(b));    \
        __res;    \
    })
/* ===== Inline Function End for 3.93. SCLIP8 ===== */

/* ===== Inline Function Start for 3.94. SCLIP16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
 * \brief SCLIP16 (SIMD 16-bit Signed Clip Value)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SCLIP16 Rd, Rs1, imm4u[3:0]
 * ~~~
 *
 * **Purpose**:\n
 * Limit the 16-bit signed integer elements of a register into a signed range simultaneously.
 *
 * **Description**:\n
 * This instruction limits the 16-bit signed integer elements stored in Rs1 into a signed
 * integer range between 2^imm4u-1 and -2^imm4u, and writes the limited results to Rd. For example, if
 * imm4u is 3, the 16-bit input values should be saturated between 7 and -8. If saturation is performed,
 * set OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.H[x];
 * if (src > (2^imm4u)-1) {
 *   src = (2^imm4u)-1;
 *   OV = 1;
 * } else if (src < -2^imm4u) {
 *   src = -2^imm4u;
 *   OV = 1;
 * }
 * Rd.H[x] = src
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SCLIP16(a, b)    \
    ({    \
        /* Evaluate `a` before the output temporary is declared, so that a */    \
        /* caller expression naming its own `result` is not shadowed here. */    \
        unsigned long __a = (unsigned long)(a);    \
        unsigned long __res;    \
        /* `b` must be a compile-time constant: it is encoded as the immediate. */    \
        __ASM volatile("sclip16 %0, %1, %2" : "=r"(__res) : "r"(__a), "K"(b));    \
        __res;    \
    })
/* ===== Inline Function End for 3.94. SCLIP16 ===== */

/* ===== Inline Function Start for 3.95. SCLIP32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
 * \brief SCLIP32 (SIMD 32-bit Signed Clip Value)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * SCLIP32 Rd, Rs1, imm5u[4:0]
 * ~~~
 *
 * **Purpose**:\n
 * Limit the 32-bit signed integer elements of a register into a signed range simultaneously.
 *
 * **Description**:\n
 * This instruction limits the 32-bit signed integer elements stored in Rs1 into a signed
 * integer range between 2^imm5u-1 and -2^imm5u, and writes the limited results to Rd. For example, if
 * imm5u is 3, the 32-bit input values should be saturated between 7 and -8. If saturation is performed,
 * set OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.W[x];
 * if (src > (2^imm5u)-1) {
 *   src = (2^imm5u)-1;
 *   OV = 1;
 * } else if (src < -2^imm5u) {
 *   src = -2^imm5u;
 *   OV = 1;
 * }
 * Rd.W[x] = src
 * for RV32: x=0,
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in long type
 */
#define __RV_SCLIP32(a, b)    \
    ({    \
        /* Evaluate `a` before the output temporary is declared, so that a */    \
        /* caller expression naming its own `result` is not shadowed here. */    \
        long __a = (long)(a);    \
        long __res;    \
        /* `b` must be a compile-time constant: it is encoded as the immediate. */    \
        __ASM volatile("sclip32 %0, %1, %2" : "=r"(__res) : "r"(__a), "K"(b));    \
        __res;    \
    })
/* ===== Inline Function End for 3.95. SCLIP32 ===== */

/* ===== Inline Function Start for 3.96. SCMPLE8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
 * \brief SCMPLE8 (SIMD 8-bit Signed Compare Less Than & Equal)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SCMPLE8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit signed integer elements less than & equal comparisons simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit
 * signed integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it is
 * true, the result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = (Rs1.B[x] {le} Rs2.B[x])? 0xff : 0x0;
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SCMPLE8(unsigned long a, unsigned long b)
{
    unsigned long mask;

    /* scmple8 rd, rs1, rs2: per signed byte, 0xff when rs1.B[x] <= rs2.B[x], else 0x0 */
    __ASM volatile("scmple8 %0, %1, %2"
                   : "=r"(mask)
                   : "r"(a), "r"(b));

    return mask;
}
/* ===== Inline Function End for 3.96. SCMPLE8 ===== */

/* ===== Inline Function Start for 3.97. SCMPLE16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
 * \brief SCMPLE16 (SIMD 16-bit Signed Compare Less Than & Equal)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SCMPLE16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer elements less than & equal comparisons simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit
 * signed integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it is
 * true, the result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are written
 * to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] {le} Rs2.H[x])? 0xffff : 0x0;
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SCMPLE16(unsigned long a, unsigned long b)
{
    unsigned long mask;

    /* scmple16 rd, rs1, rs2: per signed halfword, 0xffff when rs1.H[x] <= rs2.H[x], else 0x0 */
    __ASM volatile("scmple16 %0, %1, %2"
                   : "=r"(mask)
                   : "r"(a), "r"(b));

    return mask;
}
/* ===== Inline Function End for 3.97. SCMPLE16 ===== */

/* ===== Inline Function Start for 3.98. SCMPLT8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
 * \brief SCMPLT8 (SIMD 8-bit Signed Compare Less Than)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SCMPLT8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit signed integer elements less than comparisons simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit
 * signed integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the
 * result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = (Rs1.B[x] < Rs2.B[x])? 0xff : 0x0;
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SCMPLT8(unsigned long a, unsigned long b)
{
    unsigned long mask;

    /* scmplt8 rd, rs1, rs2: per signed byte, 0xff when rs1.B[x] < rs2.B[x], else 0x0 */
    __ASM volatile("scmplt8 %0, %1, %2"
                   : "=r"(mask)
                   : "r"(a), "r"(b));

    return mask;
}
/* ===== Inline Function End for 3.98. SCMPLT8 ===== */

/* ===== Inline Function Start for 3.99. SCMPLT16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
 * \brief SCMPLT16 (SIMD 16-bit Signed Compare Less Than)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SCMPLT16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer elements less than comparisons simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit
 * signed integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the
 * result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] < Rs2.H[x])? 0xffff : 0x0;
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SCMPLT16(unsigned long a, unsigned long b)
{
    unsigned long mask;

    /* scmplt16 rd, rs1, rs2: per signed halfword, 0xffff when rs1.H[x] < rs2.H[x], else 0x0 */
    __ASM volatile("scmplt16 %0, %1, %2"
                   : "=r"(mask)
                   : "r"(a), "r"(b));

    return mask;
}
/* ===== Inline Function End for 3.99. SCMPLT16 ===== */

/* ===== Inline Function Start for 3.100. SLL8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief SLL8 (SIMD 8-bit Shift Left Logical)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SLL8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit elements logical left shift operations simultaneously. The shift amount is a
 * variable from a GPR.
 *
 * **Description**:\n
 * The 8-bit elements in Rs1 are left-shifted logically. And the results are written to Rd.
 * The shifted out bits are filled with zero and the shift amount is specified by the low-order 3-bits of
 * the value in the Rs2 register.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[2:0];
 * Rd.B[x] = Rs1.B[x] << sa;
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SLL8(unsigned long a, unsigned int b)
{
    unsigned long shifted;

    /* sll8 rd, rs1, rs2: rd.B[x] = rs1.B[x] << rs2[2:0] for every byte element */
    __ASM volatile("sll8 %0, %1, %2"
                   : "=r"(shifted)
                   : "r"(a), "r"(b));

    return shifted;
}
/* ===== Inline Function End for 3.100. SLL8 ===== */

/* ===== Inline Function Start for 3.101. SLLI8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief SLLI8 (SIMD 8-bit Shift Left Logical Immediate)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SLLI8 Rd, Rs1, imm3u
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit elements logical left shift operations simultaneously. The shift amount is an
 * immediate value.
 *
 * **Description**:\n
 * The 8-bit elements in Rs1 are left-shifted logically. And the results are written to Rd.
 * The shifted out bits are filled with zero and the shift amount is specified by the imm3u constant.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm3u[2:0];
 * Rd.B[x] = Rs1.B[x] << sa;
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SLLI8(a, b)    \
    ({    \
        /* Evaluate `a` before the output temporary is declared, so that a */    \
        /* caller expression naming its own `result` is not shadowed here. */    \
        unsigned long __a = (unsigned long)(a);    \
        unsigned long __res;    \
        /* `b` must be a compile-time constant: it is encoded as the immediate. */    \
        __ASM volatile("slli8 %0, %1, %2" : "=r"(__res) : "r"(__a), "K"(b));    \
        __res;    \
    })
/* ===== Inline Function End for 3.101. SLLI8 ===== */

/* ===== Inline Function Start for 3.102. SLL16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief SLL16 (SIMD 16-bit Shift Left Logical)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SLL16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit elements logical left shift operations simultaneously. The shift amount is a
 * variable from a GPR.
 *
 * **Description**:\n
 * The 16-bit elements in Rs1 are left-shifted logically. And the results are written to Rd.
 * The shifted out bits are filled with zero and the shift amount is specified by the low-order 4-bits of
 * the value in the Rs2 register.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[3:0];
 * Rd.H[x] = Rs1.H[x] << sa;
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SLL16(unsigned long a, unsigned int b)
{
    unsigned long shifted;

    /* sll16 rd, rs1, rs2: rd.H[x] = rs1.H[x] << rs2[3:0] for every halfword element */
    __ASM volatile("sll16 %0, %1, %2"
                   : "=r"(shifted)
                   : "r"(a), "r"(b));

    return shifted;
}
/* ===== Inline Function End for 3.102. SLL16 ===== */

/* ===== Inline Function Start for 3.103. SLLI16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief SLLI16 (SIMD 16-bit Shift Left Logical Immediate)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SLLI16 Rd, Rs1, imm4[3:0]
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit element logical left shift operations simultaneously. The shift amount is an
 * immediate value.
 *
 * **Description**:\n
 * The 16-bit elements in Rs1 are left-shifted logically. The shifted out bits are filled with
 * zero and the shift amount is specified by the imm4[3:0] constant. And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm4[3:0];
 * Rd.H[x] = Rs1.H[x] << sa;
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SLLI16(a, b)    \
    ({    \
        /* Evaluate the caller's expression exactly once and BEFORE any other */    \
        /* local is declared, so an argument that mentions a caller variable  */    \
        /* named like one of our locals cannot bind to an uninitialized one.  */    \
        unsigned long __rv_slli16_src = (unsigned long)(a);    \
        /* Destination of the instruction (%0); written only, hence "=r". */    \
        unsigned long __rv_slli16_res;    \
        /* (b) is bound with the immediate constraint "K", so it must be a */    \
        /* constant expression; that is why this is a macro, not a function. */    \
        __ASM volatile("slli16 %0, %1, %2"    \
                       : "=r"(__rv_slli16_res)    \
                       : "r"(__rv_slli16_src), "K"(b));    \
        /* Value of the statement expression: the shifted packed halfwords. */    \
        __rv_slli16_res;    \
    })
/* ===== Inline Function End for 3.103. SLLI16 ===== */

/* ===== Inline Function Start for 3.104. SMAL ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
 * \brief SMAL (Signed Multiply Halfs & Add 64-bit)
 * \details
 * **Type**: Partial-SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMAL Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed bottom 16-bit content of the 32-bit elements of a register with the top
 * 16-bit content of the same 32-bit elements of the same register, and add the results with a 64-bit
 * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back
 * to another even/odd pair of registers (RV32) or a register (RV64).
 *
 * **RV32 Description**:\n
 * This instruction multiplies the bottom 16-bit content of the lower 32-bit of Rs2 with the top 16-bit
 * content of the lower 32-bit of Rs2 and adds the result with the 64-bit value of an even/odd pair of
 * registers specified by Rs1(4,1). The 64-bit addition result is written back to an even/odd pair of
 * registers specified by Rd(4,1). The 16-bit values of Rs2, and the 64-bit value of the Rs1(4,1) register-
 * pair are treated as signed integers.
 * Rx(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
 * register of the pair contains the low 32-bit of the operand.
 *
 * **RV64 Description**:\n
 * This instruction multiplies the bottom 16-bit content of the 32-bit elements of Rs2 with the top 16-bit
 * content of the same 32-bit elements of Rs2 and adds the results with the 64-bit value of Rs1. The 64-
 * bit addition result is written back to Rd. The 16-bit values of Rs2, and the 64-bit value of Rs1 are
 * treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * Mres[31:0] = Rs2.H[1] * Rs2.H[0];
 * Idx0 = CONCAT(Rs1(4,1),1'b0); Idx1 = CONCAT(Rs1(4,1),1'b1);
 * Idx2 = CONCAT(Rd(4,1),1'b0); Idx3 = CONCAT(Rd(4,1),1'b1);
 * R[Idx3].R[Idx2] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
 * RV64:
 * Mres[0][31:0] = Rs2.W[0].H[1] * Rs2.W[0].H[0];
 * Mres[1][31:0] = Rs2.W[1].H[1] * Rs2.W[1].H[0];
 * Rd = Rs1 + SE64(Mres[1][31:0]) + SE64(Mres[0][31:0]);
 * ~~~
 *
 * \param [in]  a    long long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_SMAL(long long a, unsigned long b)
{
    /* Receives the result of the instruction (%0); output-only operand. */
    long long sum;

    /* %1 = 64-bit addend (a), %2 = register whose halfwords are multiplied (b). */
    __ASM volatile(
        "smal %0, %1, %2"
        : "=r"(sum)
        : "r"(a), "r"(b)
    );

    return sum;
}
/* ===== Inline Function End for 3.104. SMAL ===== */

/* ===== Inline Function Start for 3.105.1. SMALBB ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
 * \brief SMALBB (Signed Multiply Bottom Halfs & Add 64-bit)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * SMALBB Rd, Rs1, Rs2
 * SMALBT Rd, Rs1, Rs2
 * SMALTT Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit
 * content of the corresponding 32-bit elements of another register and add the results with a 64-bit
 * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back
 * to the register-pair (RV32) or the register (RV64).
 * * SMALBB: rt pair + bottom*bottom (all 32-bit elements)
 * * SMALBT: rt pair + bottom*top (all 32-bit elements)
 * * SMALTT: rt pair + top*top (all 32-bit elements)
 *
 * **RV32 Description**:\n
 * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2.
 * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
 * content of Rs2.
 * For the `SMALTT` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
 * of Rs2.
 * The multiplication result is added with the 64-bit value of an even/odd pair of registers specified by
 * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
 * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
 * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
 * register of the pair contains the low 32-bit of the operand.
 *
 * **RV64 Description**:\n
 * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2.
 * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2.
 * For the `SMALTT` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the top 16-bit content of the 32-bit elements of Rs2.
 * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written
 * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * Mres[31:0] = Rs1.H[0] * Rs2.H[0]; // SMALBB
 * Mres[31:0] = Rs1.H[0] * Rs2.H[1]; // SMALBT
 * Mres[31:0] = Rs1.H[1] * Rs2.H[1]; // SMALTT
 * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
 * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
 * RV64:
 * // SMALBB
 * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[0];
 * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[0];
 * // SMALBT
 * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[1];
 * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[1];
 * // SMALTT
 * Mres[0][31:0] = Rs1.W[0].H[1] * Rs2.W[0].H[1];
 * Mres[1][31:0] = Rs1.W[1].H[1] * Rs2.W[1].H[1];
 * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_SMALBB(long long t, unsigned long a, unsigned long b)
{
    /* Working copy of the accumulator; "+r" makes it both input and output (%0). */
    long long acc = t;

    __ASM volatile(
        "smalbb %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );

    return acc;
}
/* ===== Inline Function End for 3.105.1. SMALBB ===== */

/* ===== Inline Function Start for 3.105.2. SMALBT ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
 * \brief SMALBT (Signed Multiply Bottom Half & Top Half & Add 64-bit)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * SMALBB Rd, Rs1, Rs2
 * SMALBT Rd, Rs1, Rs2
 * SMALTT Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit
 * content of the corresponding 32-bit elements of another register and add the results with a 64-bit
 * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back
 * to the register-pair (RV32) or the register (RV64).
 * * SMALBB: rt pair + bottom*bottom (all 32-bit elements)
 * * SMALBT: rt pair + bottom*top (all 32-bit elements)
 * * SMALTT: rt pair + top*top (all 32-bit elements)
 *
 * **RV32 Description**:\n
 * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2.
 * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
 * content of Rs2.
 * For the `SMALTT` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
 * of Rs2.
 * The multiplication result is added with the 64-bit value of an even/odd pair of registers specified by
 * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
 * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
 * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
 * register of the pair contains the low 32-bit of the operand.
 *
 * **RV64 Description**:\n
 * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2.
 * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2.
 * For the `SMALTT` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the top 16-bit content of the 32-bit elements of Rs2.
 * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written
 * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * Mres[31:0] = Rs1.H[0] * Rs2.H[0]; // SMALBB
 * Mres[31:0] = Rs1.H[0] * Rs2.H[1]; // SMALBT
 * Mres[31:0] = Rs1.H[1] * Rs2.H[1]; // SMALTT
 * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
 * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
 * RV64:
 * // SMALBB
 * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[0];
 * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[0];
 * // SMALBT
 * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[1];
 * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[1];
 * // SMALTT
 * Mres[0][31:0] = Rs1.W[0].H[1] * Rs2.W[0].H[1];
 * Mres[1][31:0] = Rs1.W[1].H[1] * Rs2.W[1].H[1];
 * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_SMALBT(long long t, unsigned long a, unsigned long b)
{
    /* Working copy of the accumulator; "+r" makes it both input and output (%0). */
    long long acc = t;

    __ASM volatile(
        "smalbt %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );

    return acc;
}
/* ===== Inline Function End for 3.105.2. SMALBT ===== */

/* ===== Inline Function Start for 3.105.3. SMALTT ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
 * \brief SMALTT (Signed Multiply Top Halfs & Add 64-bit)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * SMALBB Rd, Rs1, Rs2
 * SMALBT Rd, Rs1, Rs2
 * SMALTT Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit
 * content of the corresponding 32-bit elements of another register and add the results with a 64-bit
 * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back
 * to the register-pair (RV32) or the register (RV64).
 * * SMALBB: rt pair + bottom*bottom (all 32-bit elements)
 * * SMALBT: rt pair + bottom*top (all 32-bit elements)
 * * SMALTT: rt pair + top*top (all 32-bit elements)
 *
 * **RV32 Description**:\n
 * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2.
 * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
 * content of Rs2.
 * For the `SMALTT` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
 * of Rs2.
 * The multiplication result is added with the 64-bit value of an even/odd pair of registers specified by
 * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
 * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
 * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
 * register of the pair contains the low 32-bit of the operand.
 *
 * **RV64 Description**:\n
 * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2.
 * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2.
 * For the `SMALTT` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the top 16-bit content of the 32-bit elements of Rs2.
 * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written
 * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * Mres[31:0] = Rs1.H[0] * Rs2.H[0]; // SMALBB
 * Mres[31:0] = Rs1.H[0] * Rs2.H[1]; // SMALBT
 * Mres[31:0] = Rs1.H[1] * Rs2.H[1]; // SMALTT
 * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
 * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
 * RV64:
 * // SMALBB
 * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[0];
 * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[0];
 * // SMALBT
 * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[1];
 * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[1];
 * // SMALTT
 * Mres[0][31:0] = Rs1.W[0].H[1] * Rs2.W[0].H[1];
 * Mres[1][31:0] = Rs1.W[1].H[1] * Rs2.W[1].H[1];
 * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_SMALTT(long long t, unsigned long a, unsigned long b)
{
    /* Working copy of the accumulator; "+r" makes it both input and output (%0). */
    long long acc = t;

    __ASM volatile(
        "smaltt %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );

    return acc;
}
/* ===== Inline Function End for 3.105.3. SMALTT ===== */

/* ===== Inline Function Start for 3.106.1. SMALDA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
 * \brief SMALDA (Signed Multiply Two Halfs and Two Adds 64-bit)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * SMALDA Rd, Rs1, Rs2
 * SMALXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
 * adds the two 32-bit results and the 64-bit value of an even/odd pair of registers together.
 * * SMALDA: rt pair + top*top + bottom*bottom (all 32-bit elements)
 * * SMALXDA: rt pair + top*bottom + bottom*top (all 32-bit elements)
 *
 * **RV32 Description**:\n
 * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and then adds the result to the result of multiplying the top 16-bit content of Rs1 with
 * the top 16-bit content of Rs2 with unlimited precision.
 * For the `SMALXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and then adds the result to the result of multiplying the bottom 16-bit content of Rs1
 * with the top 16-bit content of Rs2 with unlimited precision.
 * The result is added to the 64-bit value of an even/odd pair of registers specified by Rd(4,1). The 64-
 * bit addition result is written back to the register-pair. The 16-bit values of Rs1 and Rs2, and the 64-
 * bit value of the register-pair are treated as signed integers.
 * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
 * register of the pair contains the low 32-bit of the operand.
 *
 * **RV64 Description**:\n
 * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
 * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32-
 * bit elements of Rs2 with unlimited precision.
 * For the `SMALXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
 * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
 * 32-bit elements of Rs2 with unlimited precision.
 * The results are added to the 64-bit value of Rd. The 64-bit addition result is written back to Rd. The
 * 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * // SMALDA
 * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]);
 * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]);
 * // SMALXDA
 * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]);
 * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]);
 * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
 * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres0[31:0]) + SE64(Mres1[31:0]);
 * RV64:
 * // SMALDA
 * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
 * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
 * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
 * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
 * // SMALXDA
 * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
 * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]);
 * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
 * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]);
 * Rd = Rd + SE64(Mres0[0][31:0]) + SE64(Mres1[0][31:0]) + SE64(Mres0[1][31:0]) +
 * SE64(Mres1[1][31:0]);
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_SMALDA(long long t, unsigned long a, unsigned long b)
{
    /* The instruction reads and rewrites %0, so keep the running sum in a local. */
    long long running = t;

    __ASM volatile(
        "smalda %0, %1, %2"
        : "+r"(running)
        : "r"(a), "r"(b)
    );

    return running;
}
/* ===== Inline Function End for 3.106.1. SMALDA ===== */

/* ===== Inline Function Start for 3.106.2. SMALXDA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
 * \brief SMALXDA (Signed Crossed Multiply Two Halfs and Two Adds 64-bit)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * SMALDA Rd, Rs1, Rs2
 * SMALXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
 * adds the two 32-bit results and the 64-bit value of an even/odd pair of registers together.
 * * SMALDA: rt pair + top*top + bottom*bottom (all 32-bit elements)
 * * SMALXDA: rt pair + top*bottom + bottom*top (all 32-bit elements)
 *
 * **RV32 Description**:\n
 * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and then adds the result to the result of multiplying the top 16-bit content of Rs1 with
 * the top 16-bit content of Rs2 with unlimited precision.
 * For the `SMALXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and then adds the result to the result of multiplying the bottom 16-bit content of Rs1
 * with the top 16-bit content of Rs2 with unlimited precision.
 * The result is added to the 64-bit value of an even/odd pair of registers specified by Rd(4,1). The 64-
 * bit addition result is written back to the register-pair. The 16-bit values of Rs1 and Rs2, and the 64-
 * bit value of the register-pair are treated as signed integers.
 * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
 * register of the pair contains the low 32-bit of the operand.
 *
 * **RV64 Description**:\n
 * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
 * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32-
 * bit elements of Rs2 with unlimited precision.
 * For the `SMALXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
 * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
 * 32-bit elements of Rs2 with unlimited precision.
 * The results are added to the 64-bit value of Rd. The 64-bit addition result is written back to Rd. The
 * 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * RV32:
 * // SMALDA
 * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]);
 * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]);
 * // SMALXDA
 * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]);
 * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]);
 * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
 * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres0[31:0]) + SE64(Mres1[31:0]);
 * RV64:
 * // SMALDA
 * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
 * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
 * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
 * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
 * // SMALXDA
 * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
 * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]);
 * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
 * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]);
 * Rd = Rd + SE64(Mres0[0][31:0]) + SE64(Mres1[0][31:0]) + SE64(Mres0[1][31:0]) +
 * SE64(Mres1[1][31:0]);
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_SMALXDA(long long t, unsigned long a, unsigned long b)
{
    /* The instruction reads and rewrites %0, so keep the running sum in a local. */
    long long running = t;

    __ASM volatile(
        "smalxda %0, %1, %2"
        : "+r"(running)
        : "r"(a), "r"(b)
    );

    return running;
}
/* ===== Inline Function End for 3.106.2. SMALXDA ===== */

/* ===== Inline Function Start for 3.107.1. SMALDS ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
 * \brief SMALDS (Signed Multiply Two Halfs & Subtract & Add 64-bit)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * SMALDS Rd, Rs1, Rs2
 * SMALDRS Rd, Rs1, Rs2
 * SMALXDS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
 * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
 * the 64-bit value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is
 * written back to the register-pair.
 * * SMALDS: rt pair + (top*top - bottom*bottom) (all 32-bit elements)
 * * SMALDRS: rt pair + (bottom*bottom - top*top) (all 32-bit elements)
 * * SMALXDS: rt pair + (top*bottom - bottom*top) (all 32-bit elements)
 *
 * **RV32 Description**:\n
 * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
 * Rs1 with the top 16-bit content of Rs2.
 * For the `SMALDRS` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
 * of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of Rs1
 * with the bottom 16-bit content of Rs2.
 * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
 * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
 * Rs1 with the bottom 16-bit content of Rs2.
 * The subtraction result is then added to the 64-bit value of an even/odd pair of registers specified by
 * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
 * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
 * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
 * register of the pair contains the low 32-bit of the operand.
 *
 * **RV64 Description**:\n
 * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
 * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content
 * of the 32-bit elements of Rs2.
 * For the `SMALDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
 * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
 * the 32-bit elements of Rs2.
 * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
 * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
 * content of the 32-bit elements of Rs2.
 * The subtraction results are then added to the 64-bit value of Rd. The 64-bit addition result is written
 * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * Mres[31:0] = (Rs1.H[1] * Rs2.H[1]) - (Rs1.H[0] * Rs2.H[0]); // SMALDS
 * Mres[31:0] = (Rs1.H[0] * Rs2.H[0]) - (Rs1.H[1] * Rs2.H[1]); // SMALDRS
 * Mres[31:0] = (Rs1.H[1] * Rs2.H[0]) - (Rs1.H[0] * Rs2.H[1]); // SMALXDS
 * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
 * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
 * * RV64:
 * // SMALDS
 * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]) - (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
 * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]) - (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
 * // SMALDRS
 * Mres[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]) - (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
 * Mres[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]) - (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
 * // SMALXDS
 * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]) - (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
 * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]) - (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
 * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_SMALDS(long long t, unsigned long a, unsigned long b)
{
    /* "+r": %0 is consumed as the addend and overwritten with the new total. */
    long long total = t;

    __ASM volatile(
        "smalds %0, %1, %2"
        : "+r"(total)
        : "r"(a), "r"(b)
    );

    return total;
}
/* ===== Inline Function End for 3.107.1. SMALDS ===== */

/* ===== Inline Function Start for 3.107.2. SMALDRS ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
 * \brief SMALDRS (Signed Multiply Two Halfs & Reverse Subtract & Add 64-bit)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * SMALDS Rd, Rs1, Rs2
 * SMALDRS Rd, Rs1, Rs2
 * SMALXDS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
 * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
 * the 64-bit value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is
 * written back to the register-pair.
 * * SMALDS: rt pair + (top*top - bottom*bottom) (all 32-bit elements)
 * * SMALDRS: rt pair + (bottom*bottom - top*top) (all 32-bit elements)
 * * SMALXDS: rt pair + (top*bottom - bottom*top) (all 32-bit elements)
 *
 * **RV32 Description**:\n
 * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
 * Rs1 with the top 16-bit content of Rs2.
 * For the `SMALDRS` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
 * of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of Rs1
 * with the bottom 16-bit content of Rs2.
 * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
 * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
 * Rs1 with the bottom 16-bit content of Rs2.
 * The subtraction result is then added to the 64-bit value of an even/odd pair of registers specified by
 * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
 * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
 * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
 * register of the pair contains the low 32-bit of the operand.
 *
 * **RV64 Description**:\n
 * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
 * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content
 * of the 32-bit elements of Rs2.
 * For the `SMALDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
 * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
 * the 32-bit elements of Rs2.
 * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
 * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
 * content of the 32-bit elements of Rs2.
 * The subtraction results are then added to the 64-bit value of Rd. The 64-bit addition result is written
 * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * Mres[31:0] = (Rs1.H[1] * Rs2.H[1]) - (Rs1.H[0] * Rs2.H[0]); // SMALDS
 * Mres[31:0] = (Rs1.H[0] * Rs2.H[0]) - (Rs1.H[1] * Rs2.H[1]); // SMALDRS
 * Mres[31:0] = (Rs1.H[1] * Rs2.H[0]) - (Rs1.H[0] * Rs2.H[1]); // SMALXDS
 * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
 * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
 * * RV64:
 * // SMALDS
 * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]) - (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
 * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[0].H[1]) - (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
 * // SMALDRS
 * Mres[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]) - (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
 * Mres[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]) - (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
 * // SMALXDS
 * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]) - (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
 * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]) - (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
 * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_SMALDRS(long long t, unsigned long a, unsigned long b)
{
    /* 64-bit accumulator: bound as a read-write ("+r") operand, so it is
     * both an input to and the output of the instruction. */
    long long acc = t;

    __ASM volatile("smaldrs %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.107.2. SMALDRS ===== */

/* ===== Inline Function Start for 3.107.3. SMALXDS ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
 * \brief SMALXDS (Signed Crossed Multiply Two Halfs & Subtract & Add 64-bit)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * SMALDS Rd, Rs1, Rs2
 * SMALDRS Rd, Rs1, Rs2
 * SMALXDS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
 * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
 * the 64-bit value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is
 * written back to the register-pair.
 * * SMALDS: rt pair + (top*top - bottom*bottom) (all 32-bit elements)
 * * SMALDRS: rt pair + (bottom*bottom - top*top) (all 32-bit elements)
 * * SMALXDS: rt pair + (top*bottom - bottom*top) (all 32-bit elements)
 *
 * **RV32 Description**:\n
 * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
 * Rs1 with the top 16-bit content of Rs2.
 * For the `SMALDRS` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
 * of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of Rs1
 * with the bottom 16-bit content of Rs2.
 * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
 * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
 * Rs1 with the bottom 16-bit content of Rs2.
 * The subtraction result is then added to the 64-bit value of an even/odd pair of registers specified by
 * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
 * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
 * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
 * register of the pair contains the low 32-bit of the operand.
 *
 * **RV64 Description**:\n
 * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
 * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content
 * of the 32-bit elements of Rs2.
 * For the `SMALDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
 * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
 * the 32-bit elements of Rs2.
 * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
 * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
 * content of the 32-bit elements of Rs2.
 * The subtraction results are then added to the 64-bit value of Rd. The 64-bit addition result is written
 * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * Mres[31:0] = (Rs1.H[1] * Rs2.H[1]) - (Rs1.H[0] * Rs2.H[0]); // SMALDS
 * Mres[31:0] = (Rs1.H[0] * Rs2.H[0]) - (Rs1.H[1] * Rs2.H[1]); // SMALDRS
 * Mres[31:0] = (Rs1.H[1] * Rs2.H[0]) - (Rs1.H[0] * Rs2.H[1]); // SMALXDS
 * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
 * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
 * * RV64:
 * // SMALDS
 * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]) - (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
 * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]) - (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
 * // SMALDRS
 * Mres[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]) - (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
 * Mres[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]) - (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
 * // SMALXDS
 * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]) - (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
 * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]) - (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
 * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_SMALXDS(long long t, unsigned long a, unsigned long b)
{
    /* 64-bit accumulator: bound as a read-write ("+r") operand, so it is
     * both an input to and the output of the instruction. */
    long long acc = t;

    __ASM volatile("smalxds %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.107.3. SMALXDS ===== */

/* ===== Inline Function Start for 3.108. SMAR64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
 * \brief SMAR64 (Signed Multiply and Add to 64-Bit Data)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * SMAR64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit signed elements in two registers and add the 64-bit multiplication
 * result to the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is written
 * back to the pair of registers (RV32) or a register (RV64).
 *
 * **RV32 Description**:\n
 * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It adds
 * the 64-bit multiplication result to the 64-bit signed data of an even/odd pair of registers specified by
 * Rd(4,1). The addition result is written back to the even/odd pair of registers specified by Rd(4,1).
 * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It
 * adds the 64-bit multiplication results to the 64-bit signed data of Rd. The addition result is written
 * back to Rd.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * R[t_H].R[t_L] = R[t_H].R[t_L] + (Rs1 * Rs2);
 * * RV64:
 * Rd = Rd + (Rs1.W[0] * Rs2.W[0]) + (Rs1.W[1] * Rs2.W[1]);
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_SMAR64(long long t, long a, long b)
{
    /* 64-bit accumulator: read-write ("+r") operand that receives the
     * multiply-and-add result in place. */
    long long acc = t;

    __ASM volatile("smar64 %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.108. SMAR64 ===== */

/* ===== Inline Function Start for 3.109. SMAQA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD
 * \brief SMAQA (Signed Multiply Four Bytes with 32-bit Adds)
 * \details
 * **Type**: Partial-SIMD (Reduction)
 *
 * **Syntax**:\n
 * ~~~
 * SMAQA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do four signed 8-bit multiplications from 32-bit chunks of two registers; and then adds
 * the four 16-bit results and the content of corresponding 32-bit chunks of a third register together.
 *
 * **Description**:\n
 * This instruction multiplies the four signed 8-bit elements of 32-bit chunks of Rs1 with the four
 * signed 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the signed
 * content of the corresponding 32-bit chunks of Rd. The final results are written back to the
 * corresponding 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rd.W[x] +
 *    (Rs1.W[x].B[3] s* Rs2.W[x].B[3]) + (Rs1.W[x].B[2] s* Rs2.W[x].B[2]) +
 *    (Rs1.W[x].B[1] s* Rs2.W[x].B[1]) + (Rs1.W[x].B[0] s* Rs2.W[x].B[0]);
 * Rd.W[x] = res[x];
 * for RV32: x=0,
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMAQA(long t, unsigned long a, unsigned long b)
{
    /* Accumulator: read-write ("+r") operand, updated in place by the instruction. */
    long acc = t;

    __ASM volatile("smaqa %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.109. SMAQA ===== */

/* ===== Inline Function Start for 3.110. SMAQA.SU ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD
 * \brief SMAQA.SU (Signed and Unsigned Multiply Four Bytes with 32-bit Adds)
 * \details
 * **Type**: Partial-SIMD (Reduction)
 *
 * **Syntax**:\n
 * ~~~
 * SMAQA.SU Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do four `signed x unsigned` 8-bit multiplications from 32-bit chunks of two registers; and
 * then adds the four 16-bit results and the content of corresponding 32-bit chunks of a third register
 * together.
 *
 * **Description**:\n
 * This instruction multiplies the four signed 8-bit elements of 32-bit chunks of Rs1 with the four
 * unsigned 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the
 * signed content of the corresponding 32-bit chunks of Rd. The final results are written back to the
 * corresponding 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rd.W[x] +
 *    (Rs1.W[x].B[3] su* Rs2.W[x].B[3]) + (Rs1.W[x].B[2] su* Rs2.W[x].B[2]) +
 *    (Rs1.W[x].B[1] su* Rs2.W[x].B[1]) + (Rs1.W[x].B[0] su* Rs2.W[x].B[0]);
 * Rd.W[x] = res[x];
 * for RV32: x=0,
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMAQA_SU(long t, unsigned long a, unsigned long b)
{
    /* Accumulator: read-write ("+r") operand, updated in place by the instruction. */
    long acc = t;

    __ASM volatile("smaqa.su %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.110. SMAQA.SU ===== */

/* ===== Inline Function Start for 3.111. SMAX8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
 * \brief SMAX8 (SIMD 8-bit Signed Maximum)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMAX8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit signed integer elements finding maximum operations simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit
 * signed integer elements in Rs2 and selects, for each pair, the number that is greater. The
 * selected results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = (Rs1.B[x] > Rs2.B[x])? Rs1.B[x] : Rs2.B[x];
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SMAX8(unsigned long a, unsigned long b)
{
    unsigned long rd;   /* write-only ("=r") destination operand */

    __ASM volatile("smax8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.111. SMAX8 ===== */

/* ===== Inline Function Start for 3.112. SMAX16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
 * \brief SMAX16 (SIMD 16-bit Signed Maximum)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMAX16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer elements finding maximum operations simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit
 * signed integer elements in Rs2 and selects, for each pair, the number that is greater. The
 * selected results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] > Rs2.H[x])? Rs1.H[x] : Rs2.H[x];
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SMAX16(unsigned long a, unsigned long b)
{
    unsigned long rd;   /* write-only ("=r") destination operand */

    __ASM volatile("smax16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.112. SMAX16 ===== */

/* ===== Inline Function Start for 3.113.1. SMBB16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief SMBB16 (SIMD Signed Multiply Bottom Half & Bottom Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMBB16 Rd, Rs1, Rs2
 * SMBT16 Rd, Rs1, Rs2
 * SMTT16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed
 * 16-bit content of the 32-bit elements of another register and write the result to a third register.
 * * SMBB16: W[x].bottom * W[x].bottom
 * * SMBT16: W[x].bottom * W[x].top
 * * SMTT16: W[x].top * W[x].top
 *
 * **Description**:\n
 * For the `SMBB16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2.
 * For the `SMBT16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2.
 * For the `SMTT16` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the top 16-bit content of the 32-bit elements of Rs2.
 * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[0]; // SMBB16
 * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[1]; // SMBT16
 * Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[1]; // SMTT16
 * for RV32: x=0,
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMBB16(unsigned long a, unsigned long b)
{
    long product;   /* write-only ("=r") destination operand */

    __ASM volatile("smbb16 %0, %1, %2"
                   : "=r"(product)
                   : "r"(a), "r"(b));
    return product;
}
/* ===== Inline Function End for 3.113.1. SMBB16 ===== */

/* ===== Inline Function Start for 3.113.2. SMBT16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief SMBT16 (SIMD Signed Multiply Bottom Half & Top Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMBB16 Rd, Rs1, Rs2
 * SMBT16 Rd, Rs1, Rs2
 * SMTT16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed
 * 16-bit content of the 32-bit elements of another register and write the result to a third register.
 * * SMBB16: W[x].bottom * W[x].bottom
 * * SMBT16: W[x].bottom * W[x].top
 * * SMTT16: W[x].top * W[x].top
 *
 * **Description**:\n
 * For the `SMBB16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2.
 * For the `SMBT16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2.
 * For the `SMTT16` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the top 16-bit content of the 32-bit elements of Rs2.
 * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[0]; // SMBB16
 * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[1]; // SMBT16
 * Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[1]; // SMTT16
 * for RV32: x=0,
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMBT16(unsigned long a, unsigned long b)
{
    long product;   /* write-only ("=r") destination operand */

    __ASM volatile("smbt16 %0, %1, %2"
                   : "=r"(product)
                   : "r"(a), "r"(b));
    return product;
}
/* ===== Inline Function End for 3.113.2. SMBT16 ===== */

/* ===== Inline Function Start for 3.113.3. SMTT16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief SMTT16 (SIMD Signed Multiply Top Half & Top Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMBB16 Rd, Rs1, Rs2
 * SMBT16 Rd, Rs1, Rs2
 * SMTT16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed
 * 16-bit content of the 32-bit elements of another register and write the result to a third register.
 * * SMBB16: W[x].bottom * W[x].bottom
 * * SMBT16: W[x].bottom * W[x].top
 * * SMTT16: W[x].top * W[x].top
 *
 * **Description**:\n
 * For the `SMBB16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2.
 * For the `SMBT16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2.
 * For the `SMTT16` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the top 16-bit content of the 32-bit elements of Rs2.
 * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[0]; // SMBB16
 * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[1]; // SMBT16
 * Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[1]; // SMTT16
 * for RV32: x=0,
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMTT16(unsigned long a, unsigned long b)
{
    long product;   /* write-only ("=r") destination operand */

    __ASM volatile("smtt16 %0, %1, %2"
                   : "=r"(product)
                   : "r"(a), "r"(b));
    return product;
}
/* ===== Inline Function End for 3.113.3. SMTT16 ===== */

/* ===== Inline Function Start for 3.114.1. SMDS ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief SMDS (SIMD Signed Multiply Two Halfs and Subtract)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMDS Rd, Rs1, Rs2
 * SMDRS Rd, Rs1, Rs2
 * SMXDS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
 * perform a subtraction operation between the two 32-bit results.
 * * SMDS: top*top - bottom*bottom (per 32-bit element)
 * * SMDRS: bottom*bottom - top*top (per 32-bit element)
 * * SMXDS: top*bottom - bottom*top (per 32-bit element)
 *
 * **Description**:\n
 * For the `SMDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with
 * the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result
 * of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
 * 32-bit elements of Rs2.
 * For the `SMDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
 * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
 * the 32-bit elements of Rs2.
 * For the `SMXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
 * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
 * content of the 32-bit elements of Rs2.
 * The subtraction result is written to the corresponding 32-bit element of Rd. The 16-bit contents of
 * multiplication are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * * SMDS:
 * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
 * * SMDRS:
 * Rd.W[x] = (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
 * * SMXDS:
 * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMDS(unsigned long a, unsigned long b)
{
    long diff;   /* write-only ("=r") destination operand */

    __ASM volatile("smds %0, %1, %2"
                   : "=r"(diff)
                   : "r"(a), "r"(b));
    return diff;
}
/* ===== Inline Function End for 3.114.1. SMDS ===== */

/* ===== Inline Function Start for 3.114.2. SMDRS ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief SMDRS (SIMD Signed Multiply Two Halfs and Reverse Subtract)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMDS Rd, Rs1, Rs2
 * SMDRS Rd, Rs1, Rs2
 * SMXDS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
 * perform a subtraction operation between the two 32-bit results.
 * * SMDS: top*top - bottom*bottom (per 32-bit element)
 * * SMDRS: bottom*bottom - top*top (per 32-bit element)
 * * SMXDS: top*bottom - bottom*top (per 32-bit element)
 *
 * **Description**:\n
 * For the `SMDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with
 * the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result
 * of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
 * 32-bit elements of Rs2.
 * For the `SMDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
 * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
 * the 32-bit elements of Rs2.
 * For the `SMXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
 * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
 * content of the 32-bit elements of Rs2.
 * The subtraction result is written to the corresponding 32-bit element of Rd. The 16-bit contents of
 * multiplication are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * * SMDS:
 * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
 * * SMDRS:
 * Rd.W[x] = (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
 * * SMXDS:
 * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMDRS(unsigned long a, unsigned long b)
{
    long diff;   /* write-only ("=r") destination operand */

    __ASM volatile("smdrs %0, %1, %2"
                   : "=r"(diff)
                   : "r"(a), "r"(b));
    return diff;
}
/* ===== Inline Function End for 3.114.2. SMDRS ===== */

/* ===== Inline Function Start for 3.114.3. SMXDS ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
 * \brief SMXDS (SIMD Signed Crossed Multiply Two Halfs and Subtract)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMDS Rd, Rs1, Rs2
 * SMDRS Rd, Rs1, Rs2
 * SMXDS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
 * perform a subtraction operation between the two 32-bit results.
 * * SMDS: top*top - bottom*bottom (per 32-bit element)
 * * SMDRS: bottom*bottom - top*top (per 32-bit element)
 * * SMXDS: top*bottom - bottom*top (per 32-bit element)
 *
 * **Description**:\n
 * For the `SMDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with
 * the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result
 * of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
 * 32-bit elements of Rs2.
 * For the `SMDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
 * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
 * the 32-bit elements of Rs2.
 * For the `SMXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
 * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
 * content of the 32-bit elements of Rs2.
 * The subtraction result is written to the corresponding 32-bit element of Rd. The 16-bit contents of
 * multiplication are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * * SMDS:
 * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
 * * SMDRS:
 * Rd.W[x] = (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
 * * SMXDS:
 * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMXDS(unsigned long a, unsigned long b)
{
    long diff;   /* write-only ("=r") destination operand */

    __ASM volatile("smxds %0, %1, %2"
                   : "=r"(diff)
                   : "r"(a), "r"(b));
    return diff;
}
/* ===== Inline Function End for 3.114.3. SMXDS ===== */

/* ===== Inline Function Start for 3.115. SMIN8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
 * \brief SMIN8 (SIMD 8-bit Signed Minimum)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMIN8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit signed integer elements finding minimum operations simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit
 * signed integer elements in Rs2 and selects, for each pair, the number that is less. The selected
 * results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = (Rs1.B[x] < Rs2.B[x])? Rs1.B[x] : Rs2.B[x];
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SMIN8(unsigned long a, unsigned long b)
{
    unsigned long rd;   /* write-only ("=r") destination operand */

    __ASM volatile("smin8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.115. SMIN8 ===== */

/* ===== Inline Function Start for 3.116. SMIN16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
 * \brief SMIN16 (SIMD 16-bit Signed Minimum)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMIN16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer elements finding minimum operations simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit
 * signed integer elements in Rs2 and selects, for each pair, the number that is less. The selected
 * results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] < Rs2.H[x])? Rs1.H[x] : Rs2.H[x];
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SMIN16(unsigned long a, unsigned long b)
{
    unsigned long rd;   /* write-only ("=r") destination operand */

    __ASM volatile("smin16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.116. SMIN16 ===== */

/* ===== Inline Function Start for 3.117.1. SMMUL ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
 * \brief SMMUL (SIMD MSW Signed Multiply Word)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMMUL Rd, Rs1, Rs2
 * SMMUL.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit signed integer elements of two registers and write the most significant
 * 32-bit results to the corresponding 32-bit elements of a register. The `.u` form performs an
 * additional rounding up operation on the multiplication results before taking the most significant
 * 32-bit part of the results.
 *
 * **Description**:\n
 * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2 and writes the
 * most significant 32-bit multiplication results to the corresponding 32-bit elements of Rd. The 32-bit
 * elements of Rs1 and Rs2 are treated as signed integers. The `.u` form of the instruction rounds up
 * the most significant 32-bit of the 64-bit multiplication results by adding a 1 to bit 31 of the results.
 * * For `smmul/RV32` instruction, it is an alias to `mulh/RV32` instruction.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][63:31] + 1;
 *   Rd.W[x] = Round[x][32:1];
 * } else {
 *   Rd.W[x] = Mres[x][63:32];
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMMUL(long a, long b)
{
    long msw;   /* write-only ("=r") destination operand */

    __ASM volatile("smmul %0, %1, %2"
                   : "=r"(msw)
                   : "r"(a), "r"(b));
    return msw;
}
/* ===== Inline Function End for 3.117.1. SMMUL ===== */

/* ===== Inline Function Start for 3.117.2. SMMUL.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
 * \brief SMMUL.u (SIMD MSW Signed Multiply Word with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMMUL Rd, Rs1, Rs2
 * SMMUL.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit signed integer elements of two registers and write the most significant
 * 32-bit results to the corresponding 32-bit elements of a register. The `.u` form performs an
 * additional rounding up operation on the multiplication results before taking the most significant
 * 32-bit part of the results.
 *
 * **Description**:\n
 * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2 and writes the
 * most significant 32-bit multiplication results to the corresponding 32-bit elements of Rd. The 32-bit
 * elements of Rs1 and Rs2 are treated as signed integers. The `.u` form of the instruction rounds up
 * the most significant 32-bit of the 64-bit multiplication results by adding a 1 to bit 31 of the results.
 * * For `smmul/RV32` instruction, it is an alias to `mulh/RV32` instruction.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][63:31] + 1;
 *   Rd.W[x] = Round[x][32:1];
 * } else {
 *   Rd.W[x] = Mres[x][63:32];
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMMUL_U(long a, long b)
{
    long rd;    /* write-only output operand (%0) of the instruction */

    /* Operand map: %0 = rd, %1 = a, %2 = b; all three live in general registers. */
    __ASM volatile(
        "smmul.u %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.117.2. SMMUL.u ===== */

/* ===== Inline Function Start for 3.118.1. SMMWB ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief SMMWB (SIMD MSW Signed Multiply Word and Bottom Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMMWB Rd, Rs1, Rs2
 * SMMWB.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
 * corresponding 32-bit elements of another register, and write the most significant 32-bit results to
 * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most
 * significant discarded bit.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content
 * of the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication
 * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the
 * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][47:15] + 1;
 *   Rd.W[x] = Round[x][32:1];
 * } else {
 *   Rd.W[x] = Mres[x][47:16];
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMMWB(long a, unsigned long b)
{
    long rd;    /* write-only output operand (%0) of the instruction */

    /* Operand map: %0 = rd, %1 = a (signed word operand), %2 = b (halfword source). */
    __ASM volatile(
        "smmwb %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.118.1. SMMWB ===== */

/* ===== Inline Function Start for 3.118.2. SMMWB.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief SMMWB.u (SIMD MSW Signed Multiply Word and Bottom Half with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMMWB Rd, Rs1, Rs2
 * SMMWB.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
 * corresponding 32-bit elements of another register, and write the most significant 32-bit results to
 * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most
 * significant discarded bit.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content
 * of the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication
 * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the
 * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][47:15] + 1;
 *   Rd.W[x] = Round[x][32:1];
 * } else {
 *   Rd.W[x] = Mres[x][47:16];
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMMWB_U(long a, unsigned long b)
{
    long rd;    /* write-only output operand (%0) of the instruction */

    /* Operand map: %0 = rd, %1 = a (signed word operand), %2 = b (halfword source). */
    __ASM volatile(
        "smmwb.u %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.118.2. SMMWB.u ===== */

/* ===== Inline Function Start for 3.119.1. SMMWT ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief SMMWT (SIMD MSW Signed Multiply Word and Top Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMMWT Rd, Rs1, Rs2
 * SMMWT.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the
 * corresponding 32-bit elements of another register, and write the most significant 32-bit results to
 * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most
 * significant discarded bit.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the top signed 16-bit content of
 * the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication
 * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the
 * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][47:15] + 1;
 *   Rd.W[x] = Round[x][32:1];
 * } else {
 *   Rd.W[x] = Mres[x][47:16];
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMMWT(long a, unsigned long b)
{
    long rd;    /* write-only output operand (%0) of the instruction */

    /* Operand map: %0 = rd, %1 = a (signed word operand), %2 = b (halfword source). */
    __ASM volatile(
        "smmwt %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.119.1. SMMWT ===== */

/* ===== Inline Function Start for 3.119.2. SMMWT.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
 * \brief SMMWT.u (SIMD MSW Signed Multiply Word and Top Half with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMMWT Rd, Rs1, Rs2
 * SMMWT.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the
 * corresponding 32-bit elements of another register, and write the most significant 32-bit results to
 * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most
 * significant discarded bit.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the top signed 16-bit content of
 * the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication
 * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the
 * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1];
 * if (`.u` form) {
 *   Round[x][32:0] = Mres[x][47:15] + 1;
 *   Rd.W[x] = Round[x][32:1];
 * } else {
 *   Rd.W[x] = Mres[x][47:16];
 * }
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMMWT_U(long a, unsigned long b)
{
    long rd;    /* write-only output operand (%0) of the instruction */

    /* Operand map: %0 = rd, %1 = a (signed word operand), %2 = b (halfword source). */
    __ASM volatile(
        "smmwt.u %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.119.2. SMMWT.u ===== */

/* ===== Inline Function Start for 3.120.1. SMSLDA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
 * \brief SMSLDA (Signed Multiply Two Halfs & Add & Subtract 64-bit)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * SMSLDA Rd, Rs1, Rs2
 * SMSLXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
 * subtracts the two 32-bit results from the 64-bit value of an even/odd pair of registers (RV32) or a
 * register (RV64). The subtraction result is written back to the register-pair.
 * * SMSLDA: rd pair - top*top - bottom*bottom (all 32-bit elements)
 * * SMSLXDA: rd pair - top*bottom - bottom*top (all 32-bit elements)
 *
 * **RV32 Description**:\n
 * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
 * content Rs2 and multiplies the top 16-bit content of Rs1 with the top 16-bit content of Rs2.
 * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and multiplies the bottom 16-bit content of Rs1 with the top 16-bit content of Rs2.
 * The two multiplication results are subtracted from the 64-bit value of an even/odd pair of registers
 * specified by Rd(4,1). The 64-bit subtraction result is written back to the register-pair. The 16-bit
 * values of Rs1 and Rs2, and the 64-bit value of the register-pair are treated as signed integers.
 * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of
 * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
 * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the bottom 16-bit content of
 * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
 * The four multiplication results are subtracted from the 64-bit value of Rd. The 64-bit subtraction
 * result is written back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated
 * as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * // SMSLDA
 * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]);
 * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]);
 * // SMSLXDA
 * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]);
 * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]);
 * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
 * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] - SE64(Mres0[31:0]) - SE64(Mres1[31:0]);
 * * RV64:
 * // SMSLDA
 * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
 * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
 * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
 * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
 * // SMSLXDA
 * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
 * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]);
 * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
 * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]);
 * Rd = Rd - SE64(Mres0[0][31:0]) - SE64(Mres1[0][31:0]) - SE64(Mres0[1][31:0]) -
 * SE64(Mres1[1][31:0]);
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_SMSLDA(long long t, unsigned long a, unsigned long b)
{
    /* 64-bit accumulator: "+r" makes it both an input and the output of the instruction. */
    long long acc = t;

    /* Operand map: %0 = acc (read-modify-write), %1 = a, %2 = b. */
    __ASM volatile(
        "smslda %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );

    return acc;
}
/* ===== Inline Function End for 3.120.1. SMSLDA ===== */

/* ===== Inline Function Start for 3.120.2. SMSLXDA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
 * \brief SMSLXDA (Signed Crossed Multiply Two Halfs & Add & Subtract 64-bit)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * SMSLDA Rd, Rs1, Rs2
 * SMSLXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
 * subtracts the two 32-bit results from the 64-bit value of an even/odd pair of registers (RV32) or a
 * register (RV64). The subtraction result is written back to the register-pair.
 * * SMSLDA: rd pair - top*top - bottom*bottom (all 32-bit elements)
 * * SMSLXDA: rd pair - top*bottom - bottom*top (all 32-bit elements)
 *
 * **RV32 Description**:\n
 * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
 * content Rs2 and multiplies the top 16-bit content of Rs1 with the top 16-bit content of Rs2.
 * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and multiplies the bottom 16-bit content of Rs1 with the top 16-bit content of Rs2.
 * The two multiplication results are subtracted from the 64-bit value of an even/odd pair of registers
 * specified by Rd(4,1). The 64-bit subtraction result is written back to the register-pair. The 16-bit
 * values of Rs1 and Rs2, and the 64-bit value of the register-pair are treated as signed integers.
 * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of
 * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
 * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
 * the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the bottom 16-bit content of
 * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
 * The four multiplication results are subtracted from the 64-bit value of Rd. The 64-bit subtraction
 * result is written back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated
 * as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * // SMSLDA
 * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]);
 * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]);
 * // SMSLXDA
 * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]);
 * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]);
 * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
 * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] - SE64(Mres0[31:0]) - SE64(Mres1[31:0]);
 * * RV64:
 * // SMSLDA
 * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
 * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
 * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
 * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
 * // SMSLXDA
 * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
 * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]);
 * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
 * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]);
 * Rd = Rd - SE64(Mres0[0][31:0]) - SE64(Mres1[0][31:0]) - SE64(Mres0[1][31:0]) -
 * SE64(Mres1[1][31:0]);
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_SMSLXDA(long long t, unsigned long a, unsigned long b)
{
    /* 64-bit accumulator: "+r" makes it both an input and the output of the instruction. */
    long long acc = t;

    /* Operand map: %0 = acc (read-modify-write), %1 = a, %2 = b. */
    __ASM volatile(
        "smslxda %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );

    return acc;
}
/* ===== Inline Function End for 3.120.2. SMSLXDA ===== */

/* ===== Inline Function Start for 3.121. SMSR64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
 * \brief SMSR64 (Signed Multiply and Subtract from 64-Bit Data)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * SMSR64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit signed elements in two registers and subtract the 64-bit multiplication
 * results from the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is
 * written back to the pair of registers (RV32) or a register (RV64).
 *
 * **RV32 Description**:\n
 * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It
 * subtracts the 64-bit multiplication result from the 64-bit signed data of an even/odd pair of registers
 * specified by Rd(4,1). The subtraction result is written back to the even/odd pair of registers
 * specified by Rd(4,1).
 * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It
 * subtracts the 64-bit multiplication results from the 64-bit signed data of Rd. The subtraction result is
 * written back to Rd.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * R[t_H].R[t_L] = R[t_H].R[t_L] - (Rs1 * Rs2);
 * * RV64:
 * Rd = Rd - (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]);
 * ~~~
 *
 * \param [in]  t    long long type of value stored in t
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_SMSR64(long long t, long a, long b)
{
    /* 64-bit accumulator: "+r" makes it both an input and the output of the instruction. */
    long long acc = t;

    /* Operand map: %0 = acc (read-modify-write), %1 = a, %2 = b. */
    __ASM volatile(
        "smsr64 %0, %1, %2"
        : "+r"(acc)
        : "r"(a), "r"(b)
    );

    return acc;
}
/* ===== Inline Function End for 3.121. SMSR64 ===== */

/* ===== Inline Function Start for 3.122.1. SMUL8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
 * \brief SMUL8 (SIMD Signed 8-bit Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMUL8 Rd, Rs1, Rs2
 * SMULX8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do signed 8-bit multiplications and generate four 16-bit results simultaneously.
 *
 * **RV32 Description**:\n
 * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the
 * corresponding 8-bit data elements of Rs2.
 * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the
 * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data
 * elements of Rs1 with the fourth and third 8-bit data elements of Rs2.
 * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1).
 * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of
 * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom
 * part of Rs1.
 *
 * **RV64 Description**:\n
 * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the
 * corresponding 8-bit data elements of Rs2.
 * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the
 * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data
 * elements of Rs1 with the fourth and third 8-bit data elements of Rs2.
 * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results
 * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from
 * the bottom part of Rs1.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * if (is `SMUL8`) {
 *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
 *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
 * } else if (is `SMULX8`) {
 *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
 *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
 * }
 * rest[x/2] = op1t[x/2] s* op2t[x/2];
 * resb[x/2] = op1b[x/2] s* op2b[x/2];
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1];
 * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0];
 * x = 0 and 2
 * * RV64:
 * if (is `SMUL8`) {
 *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
 *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
 * } else if (is `SMULX8`) {
 *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
 *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
 * }
 * rest[x/2] = op1t[x/2] s* op2t[x/2];
 * resb[x/2] = op1b[x/2] s* op2b[x/2];
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1];
 * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0];
 * x = 0 and 2
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_SMUL8(unsigned int a, unsigned int b)
{
    unsigned long long products;    /* 64-bit write-only output operand (%0) */

    /* Operand map: %0 = products, %1 = a, %2 = b. */
    __ASM volatile(
        "smul8 %0, %1, %2"
        : "=r"(products)
        : "r"(a), "r"(b)
    );

    return products;
}
/* ===== Inline Function End for 3.122.1. SMUL8 ===== */

/* ===== Inline Function Start for 3.122.2. SMULX8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
 * \brief SMULX8 (SIMD Signed Crossed 8-bit Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMUL8 Rd, Rs1, Rs2
 * SMULX8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do signed 8-bit multiplications and generate four 16-bit results simultaneously.
 *
 * **RV32 Description**:\n
 * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the
 * corresponding 8-bit data elements of Rs2.
 * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the
 * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data
 * elements of Rs1 with the fourth and third 8-bit data elements of Rs2.
 * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1).
 * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of
 * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom
 * part of Rs1.
 *
 * **RV64 Description**:\n
 * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the
 * corresponding 8-bit data elements of Rs2.
 * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the
 * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data
 * elements of Rs1 with the fourth and third 8-bit data elements of Rs2.
 * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results
 * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from
 * the bottom part of Rs1.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * if (is `SMUL8`) {
 *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
 *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
 * } else if (is `SMULX8`) {
 *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
 *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
 * }
 * rest[x/2] = op1t[x/2] s* op2t[x/2];
 * resb[x/2] = op1b[x/2] s* op2b[x/2];
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1];
 * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0];
 * x = 0 and 2
 * * RV64:
 * if (is `SMUL8`) {
 *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
 *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
 * } else if (is `SMULX8`) {
 *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
 *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
 * }
 * rest[x/2] = op1t[x/2] s* op2t[x/2];
 * resb[x/2] = op1b[x/2] s* op2b[x/2];
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1];
 * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0];
 * x = 0 and 2
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_SMULX8(unsigned int a, unsigned int b)
{
    unsigned long long products;    /* 64-bit write-only output operand (%0) */

    /* Operand map: %0 = products, %1 = a, %2 = b. */
    __ASM volatile(
        "smulx8 %0, %1, %2"
        : "=r"(products)
        : "r"(a), "r"(b)
    );

    return products;
}
/* ===== Inline Function End for 3.122.2. SMULX8 ===== */

/* ===== Inline Function Start for 3.123.1. SMUL16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
 * \brief SMUL16 (SIMD Signed 16-bit Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMUL16 Rd, Rs1, Rs2
 * SMULX16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do signed 16-bit multiplications and generate two 32-bit results simultaneously.
 *
 * **RV32 Description**:\n
 * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of Rs1 with
 * the top 16-bit Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1
 * with the bottom 16-bit Q15 content of Rs2.
 * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of Rs1 with the bottom 16-bit
 * Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1 with the top
 * 16-bit Q15 content of Rs2.
 * The two Q30 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1),
 * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes
 * register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and
 * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1.
 *
 * **RV64 Description**:\n
 * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of the lower
 * 32-bit word in Rs1 with the top 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time,
 * multiply the bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the bottom 16-bit Q15
 * content of the lower 32-bit word in Rs2.
 * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of the lower 32-bit word in Rs1
 * with the bottom 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time, multiply the
 * bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the top 16-bit Q15 content of the
 * lower 32-bit word in Rs2.
 * The two 32-bit Q30 results are then written into Rd. The result calculated from the top 16-bit of the
 * lower 32-bit word in Rs1 is written to Rd.W[1]. And the result calculated from the bottom 16-bit of
 * the lower 32-bit word in Rs1 is written to Rd.W[0].
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * if (is `SMUL16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
 * } else if (is `SMULX16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
 * }
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   res = aop s* bop;
 * }
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * R[t_H] = rest;
 * R[t_L] = resb;
 * * RV64:
 * if (is `SMUL16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
 * } else if (is `SMULX16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
 * }
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   res = aop s* bop;
 * }
 * Rd.W[1] = rest;
 * Rd.W[0] = resb;
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_SMUL16(unsigned int a, unsigned int b)
{
    unsigned long long products;    /* 64-bit write-only output operand (%0) */

    /* Operand map: %0 = products, %1 = a, %2 = b. */
    __ASM volatile(
        "smul16 %0, %1, %2"
        : "=r"(products)
        : "r"(a), "r"(b)
    );

    return products;
}
/* ===== Inline Function End for 3.123.1. SMUL16 ===== */

/* ===== Inline Function Start for 3.123.2. SMULX16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
 * \brief SMULX16 (SIMD Signed Crossed 16-bit Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SMUL16 Rd, Rs1, Rs2
 * SMULX16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do signed 16-bit multiplications and generate two 32-bit results simultaneously.
 *
 * **RV32 Description**:\n
 * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of Rs1 with
 * the top 16-bit Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1
 * with the bottom 16-bit Q15 content of Rs2.
 * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of Rs1 with the bottom 16-bit
 * Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1 with the top
 * 16-bit Q15 content of Rs2.
 * The two Q30 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1),
 * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes
 * register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and
 * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1.
 *
 * **RV64 Description**:\n
 * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of the lower
 * 32-bit word in Rs1 with the top 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time,
 * multiply the bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the bottom 16-bit Q15
 * content of the lower 32-bit word in Rs2.
 * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of the lower 32-bit word in Rs1
 * with the bottom 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time, multiply the
 * bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the top 16-bit Q15 content of the
 * lower 32-bit word in Rs2.
 * The two 32-bit Q30 results are then written into Rd. The result calculated from the top 16-bit of the
 * lower 32-bit word in Rs1 is written to Rd.W[1]. And the result calculated from the bottom 16-bit of
 * the lower 32-bit word in Rs1 is written to Rd.W[0].
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * if (is `SMUL16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
 * } else if (is `SMULX16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
 * }
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   res = aop s* bop;
 * }
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * R[t_H] = rest;
 * R[t_L] = resb;
 * * RV64:
 * if (is `SMUL16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
 * } else if (is `SMULX16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
 * }
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   res = aop s* bop;
 * }
 * Rd.W[1] = rest;
 * Rd.W[0] = resb;
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_SMULX16(unsigned int a, unsigned int b)
{
    unsigned long long products;    /* 64-bit write-only output operand (%0) */

    /* Operand map: %0 = products, %1 = a, %2 = b. */
    __ASM volatile(
        "smulx16 %0, %1, %2"
        : "=r"(products)
        : "r"(a), "r"(b)
    );

    return products;
}
/* ===== Inline Function End for 3.123.2. SMULX16 ===== */

/* ===== Inline Function Start for 3.124. SRA.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
 * \brief SRA.u (Rounding Shift Right Arithmetic)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * SRA.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Perform an arithmetic right shift operation with rounding. The shift amount is a variable
 * from a GPR.
 *
 * **Description**:\n
 * This instruction right-shifts the content of Rs1 arithmetically. The shifted out bits are
 * filled with the sign-bit and the shift amount is specified by the low-order 5-bits (RV32) or 6-bits
 * (RV64) of the Rs2 register. For the rounding operation, a value of 1 is added to the most significant
 * discarded bit of the data to calculate the final result. And the result is written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * sa = Rs2[4:0];
 * if (sa > 0) {
 *   res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1;
 *   Rd = res[31:0];
 * } else {
 *   Rd = Rs1;
 * }
 * * RV64:
 * sa = Rs2[5:0];
 * if (sa > 0) {
 *   res[63:-1] = SE65(Rs1[63:(sa-1)]) + 1;
 *   Rd = res[63:0];
 * } else {
 *   Rd = Rs1;
 * }
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SRA_U(long a, unsigned int b)
{
    long shifted;   /* write-only output operand (%0) of the instruction */

    /* Operand map: %0 = shifted, %1 = a (value to shift), %2 = b (shift amount, from a register). */
    __ASM volatile(
        "sra.u %0, %1, %2"
        : "=r"(shifted)
        : "r"(a), "r"(b)
    );

    return shifted;
}
/* ===== Inline Function End for 3.124. SRA.u ===== */

/* ===== Inline Function Start for 3.125. SRAI.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
 * \brief SRAI.u (Rounding Shift Right Arithmetic Immediate)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * SRAI.u Rd, Rs1, imm6u[4:0] (RV32)
 * SRAI.u Rd, Rs1, imm6u[5:0] (RV64)
 * ~~~
 *
 * **Purpose**:\n
 * Perform an arithmetic right shift operation with rounding. The shift amount is an
 * immediate value.
 *
 * **Description**:\n
 * This instruction right-shifts the content of Rs1 arithmetically. The shifted out bits are
 * filled with the sign-bit and the shift amount is specified by the imm6u[4:0] (RV32) or imm6u[5:0]
 * (RV64) constant. For the rounding operation, a value of 1 is added to the most significant discarded
 * bit of the data to calculate the final result. And the result is written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * sa = imm6u[4:0];
 * if (sa > 0) {
 *   res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1;
 *   Rd = res[31:0];
 * } else {
 *   Rd = Rs1;
 * }
 * * RV64:
 * sa = imm6u[5:0];
 * if (sa > 0) {
 *   res[63:-1] = SE65(Rs1[63:(sa-1)]) + 1;
 *   Rd = res[63:0];
 * } else {
 *   Rd = Rs1;
 * }
 * ~~~
 *
 * \param [in]  a    long type of value stored in a
 * \param [in]  b    unsigned shift amount; must be a compile-time constant because it is encoded as an immediate operand
 * \return value stored in long type
 */
#define __RV_SRAI_U(a, b)    \
    ({    \
        /* Evaluate (a) before any other local is declared, and keep the */    \
        /* local names macro-specific, so that a caller expression which */    \
        /* mentions its own `result` or `__a` variable is not shadowed.  */    \
        long __rv_srai_u_rs1 = (long)(a);    \
        long __rv_srai_u_rd;    \
        /* (b) is bound with the "K" immediate constraint, so it must be */    \
        /* an integer constant expression, not a run-time value.         */    \
        /* NOTE(review): GCC documents "K" as a 5-bit unsigned immediate, */    \
        /* which would not admit RV64 shift amounts 32..63 - confirm.     */    \
        __ASM volatile("srai.u %0, %1, %2"    \
                       : "=r"(__rv_srai_u_rd)    \
                       : "r"(__rv_srai_u_rs1), "K"(b));    \
        __rv_srai_u_rd;    \
    })
/* ===== Inline Function End for 3.125. SRAI.u ===== */

/* ===== Inline Function Start for 3.126.1. SRA8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief SRA8 (SIMD 8-bit Shift Right Arithmetic)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRA8 Rd, Rs1, Rs2
 * SRA8.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is a
 * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
 * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
 * 3-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
 * added to the most significant discarded bit of each 8-bit data element to calculate the final results.
 * And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[2:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRA8.u
 *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
 *     Rd.B[x] = res[7:0];
 *   } else { // SRA8
 *     Rd.B[x] = SE8(Rs1.B[x][7:sa])
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SRA8(unsigned long a, unsigned int b)
{
    unsigned long rd;

    /* Operand mapping: %0 <- rd (output), %1 <- a, %2 <- b (shift amount register). */
    __ASM volatile("sra8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 3.126.1. SRA8 ===== */

/* ===== Inline Function Start for 3.126.2. SRA8.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief SRA8.u (SIMD 8-bit Rounding Shift Right Arithmetic)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRA8 Rd, Rs1, Rs2
 * SRA8.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is a
 * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
 * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
 * 3-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
 * added to the most significant discarded bit of each 8-bit data element to calculate the final results.
 * And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[2:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRA8.u
 *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
 *     Rd.B[x] = res[7:0];
 *   } else { // SRA8
 *     Rd.B[x] = SE8(Rs1.B[x][7:sa])
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SRA8_U(unsigned long a, unsigned int b)
{
    unsigned long rd;

    /* Operand mapping: %0 <- rd (output), %1 <- a, %2 <- b (shift amount register). */
    __ASM volatile("sra8.u %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 3.126.2. SRA8.u ===== */

/* ===== Inline Function Start for 3.127.1. SRAI8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief SRAI8 (SIMD 8-bit Shift Right Arithmetic Immediate)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRAI8 Rd, Rs1, imm3u
 * SRAI8.u Rd, Rs1, imm3u
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is an
 * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
 *
 * **Description**:\n
 * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
 * bits are filled with the sign-bit of the data elements. The shift amount is specified by the imm3u
 * constant. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
 * discarded bit of each 8-bit data element to calculate the final results. And the results are written to
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm3u[2:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRAI8.u
 *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
 *     Rd.B[x] = res[7:0];
 *   } else { // SRAI8
 *     Rd.B[x] = SE8(Rs1.B[x][7:sa])
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SRAI8(a, b)    \
    ({    \
        /* Evaluate (a) before any other local is declared, and keep the */    \
        /* local names macro-specific, so that a caller expression which */    \
        /* mentions its own `result` or `__a` variable is not shadowed.  */    \
        unsigned long __rv_srai8_rs1 = (unsigned long)(a);    \
        unsigned long __rv_srai8_rd;    \
        /* (b) is bound with the "K" immediate constraint, so it must be */    \
        /* an integer constant expression, not a run-time value.         */    \
        __ASM volatile("srai8 %0, %1, %2"    \
                       : "=r"(__rv_srai8_rd)    \
                       : "r"(__rv_srai8_rs1), "K"(b));    \
        __rv_srai8_rd;    \
    })
/* ===== Inline Function End for 3.127.1. SRAI8 ===== */

/* ===== Inline Function Start for 3.127.2. SRAI8.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief SRAI8.u (SIMD 8-bit Rounding Shift Right Arithmetic Immediate)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRAI8 Rd, Rs1, imm3u
 * SRAI8.u Rd, Rs1, imm3u
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is an
 * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
 *
 * **Description**:\n
 * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
 * bits are filled with the sign-bit of the data elements. The shift amount is specified by the imm3u
 * constant. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
 * discarded bit of each 8-bit data element to calculate the final results. And the results are written to
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm3u[2:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRAI8.u
 *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
 *     Rd.B[x] = res[7:0];
 *   } else { // SRAI8
 *     Rd.B[x] = SE8(Rs1.B[x][7:sa])
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SRAI8_U(a, b)    \
    ({    \
        /* Evaluate (a) before any other local is declared, and keep the */    \
        /* local names macro-specific, so that a caller expression which */    \
        /* mentions its own `result` or `__a` variable is not shadowed.  */    \
        unsigned long __rv_srai8_u_rs1 = (unsigned long)(a);    \
        unsigned long __rv_srai8_u_rd;    \
        /* (b) is bound with the "K" immediate constraint, so it must be */    \
        /* an integer constant expression, not a run-time value.         */    \
        __ASM volatile("srai8.u %0, %1, %2"    \
                       : "=r"(__rv_srai8_u_rd)    \
                       : "r"(__rv_srai8_u_rs1), "K"(b));    \
        __rv_srai8_u_rd;    \
    })
/* ===== Inline Function End for 3.127.2. SRAI8.u ===== */

/* ===== Inline Function Start for 3.128.1. SRA16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief SRA16 (SIMD 16-bit Shift Right Arithmetic)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRA16 Rd, Rs1, Rs2
 * SRA16.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit element arithmetic right shift operations simultaneously. The shift amount is a
 * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
 * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
 * 4-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
 * added to the most significant discarded bit of each 16-bit data element to calculate the final results.
 * And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[3:0];
 * if (sa != 0) {
 *   if (`.u` form) { // SRA16.u
 *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
 *     Rd.H[x] = res[15:0];
 *   } else { // SRA16
 *     Rd.H[x] = SE16(Rs1.H[x][15:sa])
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SRA16(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand mapping: %0 <- rd (output), %1 <- a, %2 <- b (shift amount register). */
    __ASM volatile("sra16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 3.128.1. SRA16 ===== */

/* ===== Inline Function Start for 3.128.2. SRA16.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief SRA16.u (SIMD 16-bit Rounding Shift Right Arithmetic)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRA16 Rd, Rs1, Rs2
 * SRA16.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit element arithmetic right shift operations simultaneously. The shift amount is a
 * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
 * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
 * 4-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
 * added to the most significant discarded bit of each 16-bit data element to calculate the final results.
 * And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[3:0];
 * if (sa != 0) {
 *   if (`.u` form) { // SRA16.u
 *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
 *     Rd.H[x] = res[15:0];
 *   } else { // SRA16
 *     Rd.H[x] = SE16(Rs1.H[x][15:sa])
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SRA16_U(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand mapping: %0 <- rd (output), %1 <- a, %2 <- b (shift amount register). */
    __ASM volatile("sra16.u %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 3.128.2. SRA16.u ===== */

/* ===== Inline Function Start for 3.129.1. SRAI16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief SRAI16 (SIMD 16-bit Shift Right Arithmetic Immediate)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRAI16 Rd, Rs1, imm4u
 * SRAI16.u Rd, Rs1, imm4u
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit elements arithmetic right shift operations simultaneously. The shift amount is
 * an immediate value. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
 * bits are filled with the sign-bit of the 16-bit data elements. The shift amount is specified by the
 * imm4u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most
 * significant discarded bit of each 16-bit data to calculate the final results. And the results are written
 * to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm4u[3:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRAI16.u
 *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
 *     Rd.H[x] = res[15:0];
 *   } else { // SRAI16
 *     Rd.H[x] = SE16(Rs1.H[x][15:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SRAI16(a, b)    \
    ({    \
        /* Evaluate (a) before any other local is declared, and keep the */    \
        /* local names macro-specific, so that a caller expression which */    \
        /* mentions its own `result` or `__a` variable is not shadowed.  */    \
        unsigned long __rv_srai16_rs1 = (unsigned long)(a);    \
        unsigned long __rv_srai16_rd;    \
        /* (b) is bound with the "K" immediate constraint, so it must be */    \
        /* an integer constant expression, not a run-time value.         */    \
        __ASM volatile("srai16 %0, %1, %2"    \
                       : "=r"(__rv_srai16_rd)    \
                       : "r"(__rv_srai16_rs1), "K"(b));    \
        __rv_srai16_rd;    \
    })
/* ===== Inline Function End for 3.129.1. SRAI16 ===== */

/* ===== Inline Function Start for 3.129.2. SRAI16.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief SRAI16.u (SIMD 16-bit Rounding Shift Right Arithmetic Immediate)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRAI16 Rd, Rs1, imm4u
 * SRAI16.u Rd, Rs1, imm4u
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit elements arithmetic right shift operations simultaneously. The shift amount is
 * an immediate value. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
 * bits are filled with the sign-bit of the 16-bit data elements. The shift amount is specified by the
 * imm4u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most
 * significant discarded bit of each 16-bit data to calculate the final results. And the results are written
 * to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm4u[3:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRAI16.u
 *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
 *     Rd.H[x] = res[15:0];
 *   } else { // SRAI16
 *     Rd.H[x] = SE16(Rs1.H[x][15:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SRAI16_U(a, b)    \
    ({    \
        /* Evaluate (a) before any other local is declared, and keep the */    \
        /* local names macro-specific, so that a caller expression which */    \
        /* mentions its own `result` or `__a` variable is not shadowed.  */    \
        unsigned long __rv_srai16_u_rs1 = (unsigned long)(a);    \
        unsigned long __rv_srai16_u_rd;    \
        /* (b) is bound with the "K" immediate constraint, so it must be */    \
        /* an integer constant expression, not a run-time value.         */    \
        __ASM volatile("srai16.u %0, %1, %2"    \
                       : "=r"(__rv_srai16_u_rd)    \
                       : "r"(__rv_srai16_u_rs1), "K"(b));    \
        __rv_srai16_u_rd;    \
    })
/* ===== Inline Function End for 3.129.2. SRAI16.u ===== */

/* ===== Inline Function Start for 3.130.1. SRL8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief SRL8 (SIMD 8-bit Shift Right Logical)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRL8 Rd, Rs1, Rs2
 * SRL8.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit elements logical right shift operations simultaneously. The shift amount is a
 * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are
 * filled with zero. The shift amount is specified by the low-order 3-bits of the value in the Rs2 register.
 * For the rounding operation of the `.u` form, a value of 1 is added to the most significant discarded
 * bit of each 8-bit data element to calculate the final results. And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[2:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRL8.u
 *     res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1;
 *     Rd.B[x] = res[8:1];
 *   } else { // SRL8
 *     Rd.B[x] = ZE8(Rs1.B[x][7:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SRL8(unsigned long a, unsigned int b)
{
    unsigned long rd;

    /* Operand mapping: %0 <- rd (output), %1 <- a, %2 <- b (shift amount register). */
    __ASM volatile("srl8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 3.130.1. SRL8 ===== */

/* ===== Inline Function Start for 3.130.2. SRL8.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief SRL8.u (SIMD 8-bit Rounding Shift Right Logical)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRL8 Rd, Rs1, Rs2
 * SRL8.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit elements logical right shift operations simultaneously. The shift amount is a
 * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are
 * filled with zero. The shift amount is specified by the low-order 3-bits of the value in the Rs2 register.
 * For the rounding operation of the `.u` form, a value of 1 is added to the most significant discarded
 * bit of each 8-bit data element to calculate the final results. And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[2:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRL8.u
 *     res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1;
 *     Rd.B[x] = res[8:1];
 *   } else { // SRL8
 *     Rd.B[x] = ZE8(Rs1.B[x][7:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SRL8_U(unsigned long a, unsigned int b)
{
    unsigned long rd;

    /* Operand mapping: %0 <- rd (output), %1 <- a, %2 <- b (shift amount register). */
    __ASM volatile("srl8.u %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 3.130.2. SRL8.u ===== */

/* ===== Inline Function Start for 3.131.1. SRLI8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief SRLI8 (SIMD 8-bit Shift Right Logical Immediate)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRLI8 Rd, Rs1, imm3u
 * SRLI8.u Rd, Rs1, imm3u
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit elements logical right shift operations simultaneously. The shift amount is an
 * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
 *
 * **Description**:\n
 * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are
 * filled with zero. The shift amount is specified by the imm3u constant. For the rounding operation of
 * the `.u` form, a value of 1 is added to the most significant discarded bit of each 8-bit data element to
 * calculate the final results. And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm3u[2:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRLI8.u
 *     res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1;
 *     Rd.B[x] = res[8:1];
 *   } else { // SRLI8
 *     Rd.B[x] = ZE8(Rs1.B[x][7:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SRLI8(a, b)    \
    ({    \
        /* Evaluate (a) before any other local is declared, and keep the */    \
        /* local names macro-specific, so that a caller expression which */    \
        /* mentions its own `result` or `__a` variable is not shadowed.  */    \
        unsigned long __rv_srli8_rs1 = (unsigned long)(a);    \
        unsigned long __rv_srli8_rd;    \
        /* (b) is bound with the "K" immediate constraint, so it must be */    \
        /* an integer constant expression, not a run-time value.         */    \
        __ASM volatile("srli8 %0, %1, %2"    \
                       : "=r"(__rv_srli8_rd)    \
                       : "r"(__rv_srli8_rs1), "K"(b));    \
        __rv_srli8_rd;    \
    })
/* ===== Inline Function End for 3.131.1. SRLI8 ===== */

/* ===== Inline Function Start for 3.131.2. SRLI8.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
 * \brief SRLI8.u (SIMD 8-bit Rounding Shift Right Logical Immediate)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRLI8 Rd, Rs1, imm3u
 * SRLI8.u Rd, Rs1, imm3u
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit elements logical right shift operations simultaneously. The shift amount is an
 * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
 *
 * **Description**:\n
 * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are
 * filled with zero. The shift amount is specified by the imm3u constant. For the rounding operation of
 * the `.u` form, a value of 1 is added to the most significant discarded bit of each 8-bit data element to
 * calculate the final results. And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm3u[2:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRLI8.u
 *     res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1;
 *     Rd.B[x] = res[8:1];
 *   } else { // SRLI8
 *     Rd.B[x] = ZE8(Rs1.B[x][7:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SRLI8_U(a, b)    \
    ({    \
        /* Evaluate (a) before any other local is declared, and keep the */    \
        /* local names macro-specific, so that a caller expression which */    \
        /* mentions its own `result` or `__a` variable is not shadowed.  */    \
        unsigned long __rv_srli8_u_rs1 = (unsigned long)(a);    \
        unsigned long __rv_srli8_u_rd;    \
        /* (b) is bound with the "K" immediate constraint, so it must be */    \
        /* an integer constant expression, not a run-time value.         */    \
        __ASM volatile("srli8.u %0, %1, %2"    \
                       : "=r"(__rv_srli8_u_rd)    \
                       : "r"(__rv_srli8_u_rs1), "K"(b));    \
        __rv_srli8_u_rd;    \
    })
/* ===== Inline Function End for 3.131.2. SRLI8.u ===== */

/* ===== Inline Function Start for 3.132.1. SRL16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief SRL16 (SIMD 16-bit Shift Right Logical)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRL16 Rd, Rs1, Rs2
 * SRL16.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit elements logical right shift operations simultaneously. The shift amount is a
 * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
 * are filled with zero. The shift amount is specified by the low-order 4-bits of the value in the Rs2
 * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
 * discarded bit of each 16-bit data element to calculate the final results. And the results are written to
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[3:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRL16.u
 *     res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1;
 *     Rd.H[x] = res[16:1];
 *   } else { // SRL16
 *     Rd.H[x] = ZE16(Rs1.H[x][15:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SRL16(unsigned long a, unsigned int b)
{
    unsigned long rd;

    /* Operand mapping: %0 <- rd (output), %1 <- a, %2 <- b (shift amount register). */
    __ASM volatile("srl16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 3.132.1. SRL16 ===== */

/* ===== Inline Function Start for 3.132.2. SRL16.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief SRL16.u (SIMD 16-bit Rounding Shift Right Logical)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRL16 Rd, Rs1, Rs2
 * SRL16.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit elements logical right shift operations simultaneously. The shift amount is a
 * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
 * are filled with zero. The shift amount is specified by the low-order 4-bits of the value in the Rs2
 * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
 * discarded bit of each 16-bit data element to calculate the final results. And the results are written to
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[3:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRL16.u
 *     res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1;
 *     Rd.H[x] = res[16:1];
 *   } else { // SRL16
 *     Rd.H[x] = ZE16(Rs1.H[x][15:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SRL16_U(unsigned long a, unsigned int b)
{
    unsigned long rd;

    /* Operand mapping: %0 <- rd (output), %1 <- a, %2 <- b (shift amount register). */
    __ASM volatile("srl16.u %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 3.132.2. SRL16.u ===== */

/* ===== Inline Function Start for 3.133.1. SRLI16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief SRLI16 (SIMD 16-bit Shift Right Logical Immediate)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRLI16 Rd, Rs1, imm4u
 * SRLI16.u Rd, Rs1, imm4u
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit elements logical right shift operations simultaneously. The shift amount is an
 * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
 *
 * **Description**:\n
 * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
 * are filled with zero. The shift amount is specified by the imm4u constant. For the rounding
 * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 16-bit
 * data element to calculate the final results. And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm4u;
 * if (sa > 0) {
 *   if (`.u` form) { // SRLI16.u
 *     res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1;
 *     Rd.H[x] = res[16:1];
 *   } else { // SRLI16
 *     Rd.H[x] = ZE16(Rs1.H[x][15:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SRLI16(a, b)    \
    ({    \
        /* Evaluate (a) before any other local is declared, and keep the */    \
        /* local names macro-specific, so that a caller expression which */    \
        /* mentions its own `result` or `__a` variable is not shadowed.  */    \
        unsigned long __rv_srli16_rs1 = (unsigned long)(a);    \
        unsigned long __rv_srli16_rd;    \
        /* (b) is bound with the "K" immediate constraint, so it must be */    \
        /* an integer constant expression, not a run-time value.         */    \
        __ASM volatile("srli16 %0, %1, %2"    \
                       : "=r"(__rv_srli16_rd)    \
                       : "r"(__rv_srli16_rs1), "K"(b));    \
        __rv_srli16_rd;    \
    })
/* ===== Inline Function End for 3.133.1. SRLI16 ===== */

/* ===== Inline Function Start for 3.133.2. SRLI16.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
 * \brief SRLI16.u (SIMD 16-bit Rounding Shift Right Logical Immediate)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SRLI16 Rd, Rs1, imm4u
 * SRLI16.u Rd, Rs1, imm4u
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit elements logical right shift operations simultaneously. The shift amount is an
 * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
 *
 * **Description**:\n
 * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
 * are filled with zero. The shift amount is specified by the imm4u constant. For the rounding
 * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 16-bit
 * data element to calculate the final results. And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm4u;
 * if (sa > 0) {
 *   if (`.u` form) { // SRLI16.u
 *     res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1;
 *     Rd.H[x] = res[16:1];
 *   } else { // SRLI16
 *     Rd.H[x] = ZE16(Rs1.H[x][15:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SRLI16_U(a, b)    \
    ({    \
        /* `a` is captured before any other macro-local exists, so a caller */    \
        /* variable named `result` is no longer shadowed while it is read.  */    \
        unsigned long __a = (unsigned long)(a);    \
        unsigned long __rd;    \
        /* `b` is handed to the instruction as an immediate ("K" constraint). */    \
        __ASM volatile("srli16.u %0, %1, %2" : "=r"(__rd) : "r"(__a), "K"(b));    \
        __rd;    \
    })
/* ===== Inline Function End for 3.133.2. SRLI16.u ===== */

/* ===== Inline Function Start for 3.134. STAS16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief STAS16 (SIMD 16-bit Straight Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * STAS16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit integer element addition and 16-bit integer element subtraction in a 32-bit
 * chunk simultaneously. Operands are from corresponding positions in 32-bit chunks.
 *
 * **Description**:\n
 * This instruction adds the 16-bit integer element in [31:16] of 32-bit chunks in Rs1 with
 * the 16-bit integer element in [31:16] of 32-bit chunks in Rs2, and writes the result to [31:16] of 32-bit
 * chunks in Rd; at the same time, it subtracts the 16-bit integer element in [15:0] of 32-bit chunks in
 * Rs2 from the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to [15:0] of
 * 32-bit chunks in Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned operations.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = Rs1.W[x][31:16] + Rs2.W[x][31:16];
 * Rd.W[x][15:0] = Rs1.W[x][15:0] - Rs2.W[x][15:0];
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_STAS16(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand mapping: %0 = rd (output), %1 = a, %2 = b. */
    __ASM volatile(
        "stas16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.134. STAS16 ===== */

/* ===== Inline Function Start for 3.135. STSA16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief STSA16 (SIMD 16-bit Straight Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * STSA16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit integer element subtraction and 16-bit integer element addition in a 32-bit
 * chunk simultaneously. Operands are from corresponding positions in 32-bit chunks.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit integer element in [31:16] of 32-bit chunks in Rs2
 * from the 16-bit integer element in [31:16] of 32-bit chunks in Rs1, and writes the result to [31:16] of
 * 32-bit chunks in Rd; at the same time, it adds the 16-bit integer element in [15:0] of 32-bit chunks in
 * Rs2 with the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to [15:0] of
 * 32-bit chunks in Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned operations.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = Rs1.W[x][31:16] - Rs2.W[x][31:16];
 * Rd.W[x][15:0] = Rs1.W[x][15:0] + Rs2.W[x][15:0];
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_STSA16(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand mapping: %0 = rd (output), %1 = a, %2 = b. */
    __ASM volatile(
        "stsa16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.135. STSA16 ===== */

/* ===== Inline Function Start for 3.136. SUB8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
 * \brief SUB8 (SIMD 8-bit Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SUB8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit integer element subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 8-bit integer elements in Rs2 from the 8-bit integer
 * elements in Rs1, and then writes the result to Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned subtraction.
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = Rs1.B[x] - Rs2.B[x];
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SUB8(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand mapping: %0 = rd (output), %1 = a, %2 = b. */
    __ASM volatile(
        "sub8 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.136. SUB8 ===== */

/* ===== Inline Function Start for 3.137. SUB16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief SUB16 (SIMD 16-bit Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * SUB16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit integer element subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit integer elements in Rs2 from the 16-bit integer
 * elements in Rs1, and then writes the result to Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned subtraction.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = Rs1.H[x] - Rs2.H[x];
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SUB16(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand mapping: %0 = rd (output), %1 = a, %2 = b. */
    __ASM volatile(
        "sub16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.137. SUB16 ===== */

/* ===== Inline Function Start for 3.138. SUB64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
 * \brief SUB64 (64-bit Subtraction)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * SUB64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Perform a 64-bit signed or unsigned integer subtraction.
 *
 * **RV32 Description**:\n
 * This instruction subtracts the 64-bit integer of an even/odd pair of registers
 * specified by Rs2(4,1) from the 64-bit integer of an even/odd pair of registers specified by Rs1(4,1),
 * and then writes the 64-bit result to an even/odd pair of registers specified by Rd(4,1).
 * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
 * register of the pair contains the low 32-bit of the operand.
 *
 * **RV64 Description**:\n
 * This instruction subtracts the 64-bit integer of Rs2 from the 64-bit integer of Rs1,
 * and then writes the 64-bit result to Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned subtraction.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
 * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
 * R[t_H].R[t_L] = R[a_H].R[a_L] - R[b_H].R[b_L];
 * * RV64:
 * Rd = Rs1 - Rs2;
 * ~~~
 *
 * \param [in]  a    unsigned long long type of value stored in a
 * \param [in]  b    unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_SUB64(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;

    /* All three operands are 64-bit values; %0 = rd, %1 = a, %2 = b. */
    __ASM volatile(
        "sub64 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.138. SUB64 ===== */

/* ===== Inline Function Start for 3.139.1. SUNPKD810 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
 * \brief SUNPKD810 (Signed Unpacking Bytes 1 & 0)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * SUNPKD8xy Rd, Rs1
 * xy = {10, 20, 30, 31, 32}
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte *x* and byte *y* of 32-bit chunks in a register into two 16-bit signed halfwords
 * of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `SUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
 * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
 * chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
 * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
 * // SUNPKD810, x=1,y=0
 * // SUNPKD820, x=2,y=0
 * // SUNPKD830, x=3,y=0
 * // SUNPKD831, x=3,y=1
 * // SUNPKD832, x=3,y=2
 * for RV32: m=0,
 * for RV64: m=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SUNPKD810(unsigned long a)
{
    unsigned long rd;

    /* Single-source form: %0 = rd (output), %1 = a. */
    __ASM volatile(
        "sunpkd810 %0, %1"
        : "=r"(rd)
        : "r"(a)
    );

    return rd;
}
/* ===== Inline Function End for 3.139.1. SUNPKD810 ===== */

/* ===== Inline Function Start for 3.139.2. SUNPKD820 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
 * \brief SUNPKD820 (Signed Unpacking Bytes 2 & 0)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * SUNPKD8xy Rd, Rs1
 * xy = {10, 20, 30, 31, 32}
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte *x* and byte *y* of 32-bit chunks in a register into two 16-bit signed halfwords
 * of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `SUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
 * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
 * chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
 * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
 * // SUNPKD810, x=1,y=0
 * // SUNPKD820, x=2,y=0
 * // SUNPKD830, x=3,y=0
 * // SUNPKD831, x=3,y=1
 * // SUNPKD832, x=3,y=2
 * for RV32: m=0,
 * for RV64: m=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SUNPKD820(unsigned long a)
{
    unsigned long rd;

    /* Single-source form: %0 = rd (output), %1 = a. */
    __ASM volatile(
        "sunpkd820 %0, %1"
        : "=r"(rd)
        : "r"(a)
    );

    return rd;
}
/* ===== Inline Function End for 3.139.2. SUNPKD820 ===== */

/* ===== Inline Function Start for 3.139.3. SUNPKD830 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
 * \brief SUNPKD830 (Signed Unpacking Bytes 3 & 0)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * SUNPKD8xy Rd, Rs1
 * xy = {10, 20, 30, 31, 32}
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte *x* and byte *y* of 32-bit chunks in a register into two 16-bit signed halfwords
 * of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `SUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
 * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
 * chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
 * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
 * // SUNPKD810, x=1,y=0
 * // SUNPKD820, x=2,y=0
 * // SUNPKD830, x=3,y=0
 * // SUNPKD831, x=3,y=1
 * // SUNPKD832, x=3,y=2
 * for RV32: m=0,
 * for RV64: m=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SUNPKD830(unsigned long a)
{
    unsigned long rd;

    /* Single-source form: %0 = rd (output), %1 = a. */
    __ASM volatile(
        "sunpkd830 %0, %1"
        : "=r"(rd)
        : "r"(a)
    );

    return rd;
}
/* ===== Inline Function End for 3.139.3. SUNPKD830 ===== */

/* ===== Inline Function Start for 3.139.4. SUNPKD831 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
 * \brief SUNPKD831 (Signed Unpacking Bytes 3 & 1)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * SUNPKD8xy Rd, Rs1
 * xy = {10, 20, 30, 31, 32}
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte *x* and byte *y* of 32-bit chunks in a register into two 16-bit signed halfwords
 * of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `SUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
 * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
 * chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
 * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
 * // SUNPKD810, x=1,y=0
 * // SUNPKD820, x=2,y=0
 * // SUNPKD830, x=3,y=0
 * // SUNPKD831, x=3,y=1
 * // SUNPKD832, x=3,y=2
 * for RV32: m=0,
 * for RV64: m=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SUNPKD831(unsigned long a)
{
    unsigned long rd;

    /* Single-source form: %0 = rd (output), %1 = a. */
    __ASM volatile(
        "sunpkd831 %0, %1"
        : "=r"(rd)
        : "r"(a)
    );

    return rd;
}
/* ===== Inline Function End for 3.139.4. SUNPKD831 ===== */

/* ===== Inline Function Start for 3.139.5. SUNPKD832 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
 * \brief SUNPKD832 (Signed Unpacking Bytes 3 & 2)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * SUNPKD8xy Rd, Rs1
 * xy = {10, 20, 30, 31, 32}
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte *x* and byte *y* of 32-bit chunks in a register into two 16-bit signed halfwords
 * of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `SUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
 * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
 * chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
 * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
 * // SUNPKD810, x=1,y=0
 * // SUNPKD820, x=2,y=0
 * // SUNPKD830, x=3,y=0
 * // SUNPKD831, x=3,y=1
 * // SUNPKD832, x=3,y=2
 * for RV32: m=0,
 * for RV64: m=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SUNPKD832(unsigned long a)
{
    unsigned long rd;

    /* Single-source form: %0 = rd (output), %1 = a. */
    __ASM volatile(
        "sunpkd832 %0, %1"
        : "=r"(rd)
        : "r"(a)
    );

    return rd;
}
/* ===== Inline Function End for 3.139.5. SUNPKD832 ===== */

/* ===== Inline Function Start for 3.140. SWAP8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
 * \brief SWAP8 (Swap Byte within Halfword)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * SWAP8 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Swap the bytes within each halfword of a register.
 *
 * **Description**:\n
 * This instruction swaps the bytes within each halfword of Rs1 and writes the result to
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = CONCAT(Rs1.H[x][7:0],Rs1.H[x][15:8]);
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SWAP8(unsigned long a)
{
    unsigned long rd;

    /* Single-source form: %0 = rd (output), %1 = a. */
    __ASM volatile(
        "swap8 %0, %1"
        : "=r"(rd)
        : "r"(a)
    );

    return rd;
}
/* ===== Inline Function End for 3.140. SWAP8 ===== */

/* ===== Inline Function Start for 3.141. SWAP16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
 * \brief SWAP16 (Swap Halfword within Word)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * SWAP16 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Swap the 16-bit halfwords within each word of a register.
 *
 * **Description**:\n
 * This instruction swaps the 16-bit halfwords within each word of Rs1 and writes the
 * result to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = CONCAT(Rs1.W[x][15:0],Rs1.W[x][31:16]);
 * for RV32: x=0,
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SWAP16(unsigned long a)
{
    unsigned long rd;

    /* Single-source form: %0 = rd (output), %1 = a. */
    __ASM volatile(
        "swap16 %0, %1"
        : "=r"(rd)
        : "r"(a)
    );

    return rd;
}
/* ===== Inline Function End for 3.141. SWAP16 ===== */

/* ===== Inline Function Start for 3.142. UCLIP8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
 * \brief UCLIP8 (SIMD 8-bit Unsigned Clip Value)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UCLIP8 Rt, Ra, imm3u
 * ~~~
 *
 * **Purpose**:\n
 * Limit the 8-bit signed elements of a register into an unsigned range simultaneously.
 *
 * **Description**:\n
 * This instruction limits the 8-bit signed elements stored in Rs1 into an unsigned integer
 * range between 2^imm3u-1 and 0, and writes the limited results to Rd. For example, if imm3u is 3, the 8-
 * bit input values should be saturated between 7 and 0. If saturation is performed, set OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.B[x];
 * if (src > (2^imm3u)-1) {
 *   src = (2^imm3u)-1;
 *   OV = 1;
 * } else if (src < 0) {
 *   src = 0;
 *   OV = 1;
 * }
 * Rd.B[x] = src;
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_UCLIP8(a, b)    \
    ({    \
        /* `a` is captured before any other macro-local exists, so a caller */    \
        /* variable named `result` is no longer shadowed while it is read.  */    \
        unsigned long __a = (unsigned long)(a);    \
        unsigned long __rd;    \
        /* `b` is handed to the instruction as an immediate ("K" constraint). */    \
        __ASM volatile("uclip8 %0, %1, %2" : "=r"(__rd) : "r"(__a), "K"(b));    \
        __rd;    \
    })
/* ===== Inline Function End for 3.142. UCLIP8 ===== */

/* ===== Inline Function Start for 3.143. UCLIP16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
 * \brief UCLIP16 (SIMD 16-bit Unsigned Clip Value)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UCLIP16 Rt, Ra, imm4u
 * ~~~
 *
 * **Purpose**:\n
 * Limit the 16-bit signed elements of a register into an unsigned range simultaneously.
 *
 * **Description**:\n
 * This instruction limits the 16-bit signed elements stored in Rs1 into an unsigned
 * integer range between 2^imm4u-1 and 0, and writes the limited results to Rd. For example, if imm4u is
 * 3, the 16-bit input values should be saturated between 7 and 0. If saturation is performed, set OV bit
 * to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.H[x];
 * if (src > (2^imm4u)-1) {
 *   src = (2^imm4u)-1;
 *   OV = 1;
 * } else if (src < 0) {
 *   src = 0;
 *   OV = 1;
 * }
 * Rd.H[x] = src;
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_UCLIP16(a, b)    \
    ({    \
        /* `a` is captured before any other macro-local exists, so a caller */    \
        /* variable named `result` is no longer shadowed while it is read.  */    \
        unsigned long __a = (unsigned long)(a);    \
        unsigned long __rd;    \
        /* `b` is handed to the instruction as an immediate ("K" constraint). */    \
        __ASM volatile("uclip16 %0, %1, %2" : "=r"(__rd) : "r"(__a), "K"(b));    \
        __rd;    \
    })
/* ===== Inline Function End for 3.143. UCLIP16 ===== */

/* ===== Inline Function Start for 3.144. UCLIP32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
 * \brief UCLIP32 (SIMD 32-bit Unsigned Clip Value)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UCLIP32 Rd, Rs1, imm5u[4:0]
 * ~~~
 *
 * **Purpose**:\n
 * Limit the 32-bit signed integer elements of a register into an unsigned range
 * simultaneously.
 *
 * **Description**:\n
 * This instruction limits the 32-bit signed integer elements stored in Rs1 into an
 * unsigned integer range between 2^imm5u-1 and 0, and writes the limited results to Rd. For example, if
 * imm5u is 3, the 32-bit input values should be saturated between 7 and 0. If saturation is performed,
 * set OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.W[x];
 * if (src > (2^imm5u)-1) {
 *   src = (2^imm5u)-1;
 *   OV = 1;
 * } else if (src < 0) {
 *   src = 0;
 *   OV = 1;
 * }
 * Rd.W[x] = src
 * for RV32: x=0,
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_UCLIP32(a, b)    \
    ({    \
        /* `a` is captured before any other macro-local exists, so a caller */    \
        /* variable named `result` is no longer shadowed while it is read.  */    \
        unsigned long __a = (unsigned long)(a);    \
        unsigned long __rd;    \
        /* `b` is handed to the instruction as an immediate ("K" constraint). */    \
        __ASM volatile("uclip32 %0, %1, %2" : "=r"(__rd) : "r"(__a), "K"(b));    \
        __rd;    \
    })
/* ===== Inline Function End for 3.144. UCLIP32 ===== */

/* ===== Inline Function Start for 3.145. UCMPLE8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
 * \brief UCMPLE8 (SIMD 8-bit Unsigned Compare Less Than & Equal)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UCMPLE8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit unsigned integer elements less than & equal comparisons simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit
 * unsigned integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it
 * is true, the result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = (Rs1.B[x] <=u Rs2.B[x])? 0xff : 0x0;
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UCMPLE8(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand mapping: %0 = rd (output), %1 = a, %2 = b. */
    __ASM volatile(
        "ucmple8 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.145. UCMPLE8 ===== */

/* ===== Inline Function Start for 3.146. UCMPLE16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
 * \brief UCMPLE16 (SIMD 16-bit Unsigned Compare Less Than & Equal)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UCMPLE16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit unsigned integer elements less than & equal comparisons simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit
 * unsigned integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it
 * is true, the result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are
 * written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] <=u Rs2.H[x])? 0xffff : 0x0;
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UCMPLE16(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand mapping: %0 = rd (output), %1 = a, %2 = b. */
    __ASM volatile(
        "ucmple16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.146. UCMPLE16 ===== */

/* ===== Inline Function Start for 3.147. UCMPLT8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
 * \brief UCMPLT8 (SIMD 8-bit Unsigned Compare Less Than)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UCMPLT8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit unsigned integer elements less than comparisons simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit
 * unsigned integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the
 * result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = (Rs1.B[x] <u Rs2.B[x])? 0xff : 0x0;
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UCMPLT8(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand mapping: %0 = rd (output), %1 = a, %2 = b. */
    __ASM volatile(
        "ucmplt8 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.147. UCMPLT8 ===== */

/* ===== Inline Function Start for 3.148. UCMPLT16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
 * \brief UCMPLT16 (SIMD 16-bit Unsigned Compare Less Than)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UCMPLT16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit unsigned integer elements less than comparisons simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit
 * unsigned integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the
 * result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] <u Rs2.H[x])? 0xffff : 0x0;
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UCMPLT16(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand mapping: %0 = rd (output), %1 = a, %2 = b. */
    __ASM volatile(
        "ucmplt16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.148. UCMPLT16 ===== */

/* ===== Inline Function Start for 3.149. UKADD8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
 * \brief UKADD8 (SIMD 8-bit Unsigned Saturating Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UKADD8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit unsigned integer element saturating additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 8-bit unsigned integer elements in Rs1 with the 8-bit
 * unsigned integer elements in Rs2. If any of the results are beyond the 8-bit unsigned number range
 * (0 <= RES <= 2^8-1), they are saturated to the range and the OV bit is set to 1. The saturated results are
 * written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.B[x] + Rs2.B[x];
 * if (res[x] > (2^8)-1) {
 *   res[x] = (2^8)-1;
 *   OV = 1;
 * }
 * Rd.B[x] = res[x];
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKADD8(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand mapping: %0 = rd (output), %1 = a, %2 = b. */
    __ASM volatile(
        "ukadd8 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.149. UKADD8 ===== */

/* ===== Inline Function Start for 3.150. UKADD16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief UKADD16 (SIMD 16-bit Unsigned Saturating Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UKADD16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit unsigned integer element saturating additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 16-bit unsigned integer elements in Rs1 with the 16-bit
 * unsigned integer elements in Rs2. If any of the results are beyond the 16-bit unsigned number
 * range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1. The saturated
 * results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.H[x] + Rs2.H[x];
 * if (res[x] > (2^16)-1) {
 *   res[x] = (2^16)-1;
 *   OV = 1;
 * }
 * Rd.H[x] = res[x];
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKADD16(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand mapping: %0 = rd (output), %1 = a, %2 = b. */
    __ASM volatile(
        "ukadd16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.150. UKADD16 ===== */

/* ===== Inline Function Start for 3.151. UKADD64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
 * \brief UKADD64 (64-bit Unsigned Saturating Addition)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * UKADD64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Add two 64-bit unsigned integers. The result is saturated to the U64 range.
 *
 * **RV32 Description**:\n
 * This instruction adds the 64-bit unsigned integer of an even/odd pair of registers
 * specified by Rs1(4,1) with the 64-bit unsigned integer of an even/odd pair of registers specified by
 * Rs2(4,1). If the 64-bit result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is saturated to the
 * range and the OV bit is set to 1. The saturated result is written to an even/odd pair of registers
 * specified by Rd(4,1).
 * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction adds the 64-bit unsigned integer in Rs1 with the 64-bit unsigned
 * integer in Rs2. If the 64-bit result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is saturated to
 * the range and the OV bit is set to 1. The saturated result is written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
 * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
 * result = R[a_H].R[a_L] + R[b_H].R[b_L];
 * if (result > (2^64)-1) {
 *   result = (2^64)-1; OV = 1;
 * }
 * R[t_H].R[t_L] = result;
 * * RV64:
 * result = Rs1 + Rs2;
 * if (result > (2^64)-1) {
 *   result = (2^64)-1; OV = 1;
 * }
 * Rd = result;
 * ~~~
 *
 * \param [in]  a    unsigned long long type of value stored in a
 * \param [in]  b    unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_UKADD64(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;

    /* All three operands are 64-bit values; %0 = rd, %1 = a, %2 = b. */
    __ASM volatile(
        "ukadd64 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.151. UKADD64 ===== */

/* ===== Inline Function Start for 3.152. UKADDH ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
 * \brief UKADDH (Unsigned Addition with U16 Saturation)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * UKADDH Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Add the unsigned lower 32-bit content of two registers with U16 saturation.
 *
 * **Description**:\n
 * The unsigned lower 32-bit content of Rs1 is added with the unsigned lower 32-bit
 * content of Rs2. And the result is saturated to the 16-bit unsigned integer range of [0, 2^16-1] and then
 * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
 *
 * **Operations**:\n
 * ~~~
 * tmp = Rs1.W[0] + Rs2.W[0];
 * if (tmp > (2^16)-1) {
 *   tmp = (2^16)-1;
 *   OV = 1;
 * }
 * Rd = SE(tmp[15:0]);
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKADDH(unsigned int a, unsigned int b)
{
    unsigned long rd;

    /* Inputs are unsigned int, the output is unsigned long; %0 = rd, %1 = a, %2 = b. */
    __ASM volatile(
        "ukaddh %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );

    return rd;
}
/* ===== Inline Function End for 3.152. UKADDH ===== */

/* ===== Inline Function Start for 3.153. UKADDW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief UKADDW (Unsigned Addition with U32 Saturation)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * UKADDW Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Add the unsigned lower 32-bit content of two registers with U32 saturation.
 *
 * **Description**:\n
 * The unsigned lower 32-bit content of Rs1 is added with the unsigned lower 32-bit
 * content of Rs2. And the result is saturated to the 32-bit unsigned integer range of [0, 2^32-1] and then
 * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
 *
 * **Operations**:\n
 * ~~~
 * tmp = Rs1.W[0] + Rs2.W[0];
 * if (tmp > (2^32)-1) {
 *   tmp[31:0] = (2^32)-1;
 *   OV = 1;
 * }
 * Rd = tmp[31:0]; // RV32
 * Rd = SE(tmp[31:0]); // RV64
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKADDW(unsigned int a, unsigned int b)
{
    unsigned long rd; /* written only by the asm statement below */

    /* Operand mapping: %0 -> rd (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "ukaddw %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );
    return rd;
}
/* ===== Inline Function End for 3.153. UKADDW ===== */

/* ===== Inline Function Start for 3.154. UKCRAS16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief UKCRAS16 (SIMD 16-bit Unsigned Saturating Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UKCRAS16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do one 16-bit unsigned integer element saturating addition and one 16-bit unsigned
 * integer element saturating subtraction in a 32-bit chunk simultaneously. Operands are from crossed
 * positions in 32-bit chunks.
 *
 * **Description**:\n
 * This instruction adds the 16-bit unsigned integer element in [31:16] of 32-bit chunks in
 * Rs1 with the 16-bit unsigned integer element in [15:0] of 32-bit chunks in Rs2; at the same time, it
 * subtracts the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs2 from the 16-bit
 * unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the 16-bit
 * unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1.
 * The saturated results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit
 * chunks in Rd for subtraction.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[x][31:16] + Rs2.W[x][15:0];
 * res2 = Rs1.W[x][15:0] - Rs2.W[x][31:16];
 * if (res1 > (2^16)-1) {
 *   res1 = (2^16)-1;
 *   OV = 1;
 * }
 * if (res2 < 0) {
 *   res2 = 0;
 *   OV = 1;
 * }
 * Rd.W[x][31:16] = res1;
 * Rd.W[x][15:0] = res2;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKCRAS16(unsigned long a, unsigned long b)
{
    unsigned long packed; /* written only by the asm statement below */

    /* Operand mapping: %0 -> packed (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "ukcras16 %0, %1, %2"
        : "=r"(packed)
        : "r"(a), "r"(b)
    );
    return packed;
}
/* ===== Inline Function End for 3.154. UKCRAS16 ===== */

/* ===== Inline Function Start for 3.155. UKCRSA16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief UKCRSA16 (SIMD 16-bit Unsigned Saturating Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UKCRSA16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do one 16-bit unsigned integer element saturating subtraction and one 16-bit unsigned
 * integer element saturating addition in a 32-bit chunk simultaneously. Operands are from crossed
 * positions in 32-bit chunks.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit unsigned integer element in [15:0] of 32-bit
 * chunks in Rs2 from the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs1; at the
 * same time, it adds the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs2 with the 16-
 * bit unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the
 * 16-bit unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set
 * to 1. The saturated results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of
 * 32-bit chunks in Rd for addition.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[x][31:16] - Rs2.W[x][15:0];
 * res2 = Rs1.W[x][15:0] + Rs2.W[x][31:16];
 * if (res1 < 0) {
 *   res1 = 0;
 *   OV = 1;
 * }
 * if (res2 > (2^16)-1) {
 *   res2 = (2^16)-1;
 *   OV = 1;
 * }
 * Rd.W[x][31:16] = res1;
 * Rd.W[x][15:0] = res2;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKCRSA16(unsigned long a, unsigned long b)
{
    unsigned long packed; /* written only by the asm statement below */

    /* Operand mapping: %0 -> packed (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "ukcrsa16 %0, %1, %2"
        : "=r"(packed)
        : "r"(a), "r"(b)
    );
    return packed;
}
/* ===== Inline Function End for 3.155. UKCRSA16 ===== */

/* ===== Inline Function Start for 3.156. UKMAR64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
 * \brief UKMAR64 (Unsigned Multiply and Saturating Add to 64-Bit Data)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * UKMAR64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit unsigned elements in two registers and add the 64-bit multiplication
 * results to the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64). The result is
 * saturated to the U64 range and written back to the pair of registers (RV32) or the register (RV64).
 *
 * **RV32 Description**:\n
 * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It
 * adds the 64-bit multiplication result to the 64-bit unsigned data of an even/odd pair of registers
 * specified by Rd(4,1) with unlimited precision. If the 64-bit addition result is beyond the U64 number
 * range (0 <= U64 <= 2^64-1), it is saturated to the range and the OV bit is set to 1. The saturated result is
 * written back to the even/odd pair of registers specified by Rd(4,1).
 * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2.
 * It adds the 64-bit multiplication results to the 64-bit unsigned data in Rd with unlimited precision. If
 * the 64-bit addition result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is saturated to the
 * range and the OV bit is set to 1. The saturated result is written back to Rd.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * result = R[t_H].R[t_L] + (Rs1 u* Rs2);
 * if (result > (2^64)-1) {
 *   result = (2^64)-1; OV = 1;
 * }
 * R[t_H].R[t_L] = result;
 * * RV64:
 * // `result` has unlimited precision
 * result = Rd + (Rs1.W[0] u* Rs2.W[0]) + (Rs1.W[1] u* Rs2.W[1]);
 * if (result > (2^64)-1) {
 *   result = (2^64)-1; OV = 1;
 * }
 * Rd = result;
 * ~~~
 *
 * \param [in]  t    unsigned long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_UKMAR64(unsigned long long t, unsigned long a, unsigned long b)
{
    /*
     * "+r"(t): t is both an input and the output of the instruction, so the
     * incoming value is consumed and then overwritten in place.
     * Operand mapping: %0 -> t, %1 -> a, %2 -> b.
     */
    __ASM volatile(
        "ukmar64 %0, %1, %2"
        : "+r"(t)
        : "r"(a), "r"(b)
    );
    return t;
}
/* ===== Inline Function End for 3.156. UKMAR64 ===== */

/* ===== Inline Function Start for 3.157. UKMSR64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
 * \brief UKMSR64 (Unsigned Multiply and Saturating Subtract from 64-Bit Data)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * UKMSR64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit unsigned elements in two registers and subtract the 64-bit
 * multiplication results from the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64).
 * The result is saturated to the U64 range and written back to the pair of registers (RV32) or a register
 * (RV64).
 *
 * **RV32 Description**:\n
 * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It
 * subtracts the 64-bit multiplication result from the 64-bit unsigned data of an even/odd pair of
 * registers specified by Rd(4,1) with unlimited precision. If the 64-bit subtraction result is beyond the
 * U64 number range (0 <= U64 <= 2^64-1), it is saturated to the range and the OV bit is set to 1. The
 * saturated result is written back to the even/odd pair of registers specified by Rd(4,1).
 * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2.
 * It subtracts the 64-bit multiplication results from the 64-bit unsigned data of Rd with unlimited
 * precision. If the 64-bit subtraction result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is
 * saturated to the range and the OV bit is set to 1. The saturated result is written back to Rd.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * result = R[t_H].R[t_L] - (Rs1 u* Rs2);
 * if (result < 0) {
 *   result = 0; OV = 1;
 * }
 * R[t_H].R[t_L] = result;
 * * RV64:
 * // `result` has unlimited precision
 * result = Rd - (Rs1.W[0] u* Rs2.W[0]) - (Rs1.W[1] u* Rs2.W[1]);
 * if (result < 0) {
 *   result = 0; OV = 1;
 * }
 * Rd = result;
 * ~~~
 *
 * \param [in]  t    unsigned long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_UKMSR64(unsigned long long t, unsigned long a, unsigned long b)
{
    /*
     * "+r"(t): t is both an input and the output of the instruction, so the
     * incoming value is consumed and then overwritten in place.
     * Operand mapping: %0 -> t, %1 -> a, %2 -> b.
     */
    __ASM volatile(
        "ukmsr64 %0, %1, %2"
        : "+r"(t)
        : "r"(a), "r"(b)
    );
    return t;
}
/* ===== Inline Function End for 3.157. UKMSR64 ===== */

/* ===== Inline Function Start for 3.158. UKSTAS16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief UKSTAS16 (SIMD 16-bit Unsigned Saturating Straight Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UKSTAS16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do one 16-bit unsigned integer element saturating addition and one 16-bit unsigned
 * integer element saturating subtraction in a 32-bit chunk simultaneously. Operands are from
 * corresponding positions in 32-bit chunks.
 *
 * **Description**:\n
 * This instruction adds the 16-bit unsigned integer element in [31:16] of 32-bit chunks in
 * Rs1 with the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs2; at the same time, it
 * subtracts the 16-bit unsigned integer element in [15:0] of 32-bit chunks in Rs2 from the 16-bit
 * unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the 16-bit
 * unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1.
 * The saturated results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit
 * chunks in Rd for subtraction.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[x][31:16] + Rs2.W[x][31:16];
 * res2 = Rs1.W[x][15:0] - Rs2.W[x][15:0];
 * if (res1 > (2^16)-1) {
 *   res1 = (2^16)-1;
 *   OV = 1;
 * }
 * if (res2 < 0) {
 *   res2 = 0;
 *   OV = 1;
 * }
 * Rd.W[x][31:16] = res1;
 * Rd.W[x][15:0] = res2;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKSTAS16(unsigned long a, unsigned long b)
{
    unsigned long packed; /* written only by the asm statement below */

    /* Operand mapping: %0 -> packed (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "ukstas16 %0, %1, %2"
        : "=r"(packed)
        : "r"(a), "r"(b)
    );
    return packed;
}
/* ===== Inline Function End for 3.158. UKSTAS16 ===== */

/* ===== Inline Function Start for 3.159. UKSTSA16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief UKSTSA16 (SIMD 16-bit Unsigned Saturating Straight Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UKSTSA16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do one 16-bit unsigned integer element saturating subtraction and one 16-bit unsigned
 * integer element saturating addition in a 32-bit chunk simultaneously. Operands are from
 * corresponding positions in 32-bit chunks.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit unsigned integer element in [31:16] of 32-bit
 * chunks in Rs2 from the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs1; at the
 * same time, it adds the 16-bit unsigned integer element in [15:0] of 32-bit chunks in Rs2 with the 16-
 * bit unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the
 * 16-bit unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set
 * to 1. The saturated results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of
 * 32-bit chunks in Rd for addition.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[x][31:16] - Rs2.W[x][31:16];
 * res2 = Rs1.W[x][15:0] + Rs2.W[x][15:0];
 * if (res1 < 0) {
 *   res1 = 0;
 *   OV = 1;
 * }
 * if (res2 > (2^16)-1) {
 *   res2 = (2^16)-1;
 *   OV = 1;
 * }
 * Rd.W[x][31:16] = res1;
 * Rd.W[x][15:0] = res2;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKSTSA16(unsigned long a, unsigned long b)
{
    unsigned long packed; /* written only by the asm statement below */

    /* Operand mapping: %0 -> packed (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "ukstsa16 %0, %1, %2"
        : "=r"(packed)
        : "r"(a), "r"(b)
    );
    return packed;
}
/* ===== Inline Function End for 3.159. UKSTSA16 ===== */

/* ===== Inline Function Start for 3.160. UKSUB8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
 * \brief UKSUB8 (SIMD 8-bit Unsigned Saturating Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UKSUB8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit unsigned integer elements saturating subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 8-bit unsigned integer elements in Rs2 from the 8-bit
 * unsigned integer elements in Rs1. If any of the results are beyond the 8-bit unsigned number range
 * (0 <= RES <= 2^8-1), they are saturated to the range and the OV bit is set to 1. The saturated results are
 * written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.B[x] - Rs2.B[x];
 * if (res[x] < 0) {
 *   res[x] = 0;
 *   OV = 1;
 * }
 * Rd.B[x] = res[x];
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKSUB8(unsigned long a, unsigned long b)
{
    unsigned long diff; /* written only by the asm statement below */

    /* Operand mapping: %0 -> diff (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "uksub8 %0, %1, %2"
        : "=r"(diff)
        : "r"(a), "r"(b)
    );
    return diff;
}
/* ===== Inline Function End for 3.160. UKSUB8 ===== */

/* ===== Inline Function Start for 3.161. UKSUB16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief UKSUB16 (SIMD 16-bit Unsigned Saturating Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UKSUB16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit unsigned integer elements saturating subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit unsigned integer elements in Rs2 from the 16-bit
 * unsigned integer elements in Rs1. If any of the results are beyond the 16-bit unsigned number
 * range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1. The saturated
 * results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.H[x] - Rs2.H[x];
 * if (res[x] < 0) {
 *   res[x] = 0;
 *   OV = 1;
 * }
 * Rd.H[x] = res[x];
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKSUB16(unsigned long a, unsigned long b)
{
    unsigned long diff; /* written only by the asm statement below */

    /* Operand mapping: %0 -> diff (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "uksub16 %0, %1, %2"
        : "=r"(diff)
        : "r"(a), "r"(b)
    );
    return diff;
}
/* ===== Inline Function End for 3.161. UKSUB16 ===== */

/* ===== Inline Function Start for 3.162. UKSUB64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
 * \brief UKSUB64 (64-bit Unsigned Saturating Subtraction)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * UKSUB64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Perform a 64-bit unsigned integer subtraction. The result is saturated to the U64 range.
 *
 * **RV32 Description**:\n
 * This instruction subtracts the 64-bit unsigned integer of an even/odd pair of
 * registers specified by Rs2(4,1) from the 64-bit unsigned integer of an even/odd pair of registers
 * specified by Rs1(4,1). If the 64-bit result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is
 * saturated to the range and the OV bit is set to 1. The saturated result is then written to an even/odd
 * pair of registers specified by Rd(4,1).
 * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
 * register of the pair contains the low 32-bit of the operand.
 *
 * **RV64 Description**:\n
 * This instruction subtracts the 64-bit unsigned integer of Rs2 from the 64-bit
 * unsigned integer of Rs1. If the 64-bit result is beyond the U64 number range (0 <=
 * U64 <= 2^64-1), it is saturated to the range and the OV bit is set to 1. The saturated result is then written
 * to Rd.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
 * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
 * result = R[a_H].R[a_L] - R[b_H].R[b_L];
 * if (result < 0) {
 *   result = 0; OV = 1;
 * }
 * R[t_H].R[t_L] = result;
 * * RV64
 * result = Rs1 - Rs2;
 * if (result < 0) {
 *   result = 0; OV = 1;
 * }
 * Rd = result;
 * ~~~
 *
 * \param [in]  a    unsigned long long type of value stored in a
 * \param [in]  b    unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_UKSUB64(unsigned long long a, unsigned long long b)
{
    unsigned long long diff; /* written only by the asm statement below */

    /* Operand mapping: %0 -> diff (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "uksub64 %0, %1, %2"
        : "=r"(diff)
        : "r"(a), "r"(b)
    );
    return diff;
}
/* ===== Inline Function End for 3.162. UKSUB64 ===== */

/* ===== Inline Function Start for 3.163. UKSUBH ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
 * \brief UKSUBH (Unsigned Subtraction with U16 Saturation)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * UKSUBH Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Subtract the unsigned lower 32-bit content of two registers with U16 saturation.
 *
 * **Description**:\n
 * The unsigned lower 32-bit content of Rs2 is subtracted from the unsigned lower 32-bit
 * content of Rs1. And the result is saturated to the 16-bit unsigned integer range of [0, 2^16-1] and then
 * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
 *
 * **Operations**:\n
 * ~~~
 * tmp = Rs1.W[0] - Rs2.W[0];
 * if (tmp > (2^16)-1) {
 *   tmp = (2^16)-1;
 *   OV = 1;
 * }
 * else if (tmp < 0) {
 *   tmp = 0;
 *   OV = 1;
 * }
 * Rd = SE(tmp[15:0]);
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKSUBH(unsigned int a, unsigned int b)
{
    unsigned long rd; /* written only by the asm statement below */

    /* Operand mapping: %0 -> rd (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "uksubh %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );
    return rd;
}
/* ===== Inline Function End for 3.163. UKSUBH ===== */

/* ===== Inline Function Start for 3.164. UKSUBW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
 * \brief UKSUBW (Unsigned Subtraction with U32 Saturation)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * UKSUBW Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Subtract the unsigned lower 32-bit content of two registers with unsigned 32-bit
 * saturation.
 *
 * **Description**:\n
 * The unsigned lower 32-bit content of Rs2 is subtracted from the unsigned lower 32-bit
 * content of Rs1. And the result is saturated to the 32-bit unsigned integer range of [0, 2^32-1] and then
 * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
 *
 * **Operations**:\n
 * ~~~
 * tmp = Rs1.W[0] - Rs2.W[0];
 * if (tmp < 0) {
 *   tmp[31:0] = 0;
 *   OV = 1;
 * }
 * Rd = tmp[31:0]; // RV32
 * Rd = SE(tmp[31:0]); // RV64
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKSUBW(unsigned int a, unsigned int b)
{
    unsigned long rd; /* written only by the asm statement below */

    /* Operand mapping: %0 -> rd (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "uksubw %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b)
    );
    return rd;
}
/* ===== Inline Function End for 3.164. UKSUBW ===== */

/* ===== Inline Function Start for 3.165. UMAR64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
 * \brief UMAR64 (Unsigned Multiply and Add to 64-Bit Data)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * UMAR64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit unsigned elements in two registers and add the 64-bit multiplication
 * results to the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64). The result is
 * written back to the pair of registers (RV32) or a register (RV64).
 *
 * **RV32 Description**:\n
 * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It
 * adds the 64-bit multiplication result to the 64-bit unsigned data of an even/odd pair of registers
 * specified by Rd(4,1). The addition result is written back to the even/odd pair of registers specified by
 * Rd(4,1).
 * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2.
 * It adds the 64-bit multiplication results to the 64-bit unsigned data of Rd. The addition result is
 * written back to Rd.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * R[t_H].R[t_L] = R[t_H].R[t_L] + (Rs1 u* Rs2);
 * * RV64:
 * Rd = Rd + (Rs1.W[0] u* Rs2.W[0]) + (Rs1.W[1] u* Rs2.W[1]);
 * ~~~
 *
 * \param [in]  t    unsigned long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_UMAR64(unsigned long long t, unsigned long a, unsigned long b)
{
    /*
     * "+r"(t): t is both an input and the output of the instruction, so the
     * incoming value is consumed and then overwritten in place.
     * Operand mapping: %0 -> t, %1 -> a, %2 -> b.
     */
    __ASM volatile(
        "umar64 %0, %1, %2"
        : "+r"(t)
        : "r"(a), "r"(b)
    );
    return t;
}
/* ===== Inline Function End for 3.165. UMAR64 ===== */

/* ===== Inline Function Start for 3.166. UMAQA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD
 * \brief UMAQA (Unsigned Multiply Four Bytes with 32-bit Adds)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * UMAQA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do four unsigned 8-bit multiplications from 32-bit chunks of two registers; and then adds
 * the four 16-bit results and the content of corresponding 32-bit chunks of a third register together.
 *
 * **Description**:\n
 * This instruction multiplies the four unsigned 8-bit elements of 32-bit chunks of Rs1 with the four
 * unsigned 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the
 * unsigned content of the corresponding 32-bit chunks of Rd. The final results are written back to the
 * corresponding 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rd.W[x] + (Rs1.W[x].B[3] u* Rs2.W[x].B[3]) +
 *          (Rs1.W[x].B[2] u* Rs2.W[x].B[2]) + (Rs1.W[x].B[1] u* Rs2.W[x].B[1]) +
 *          (Rs1.W[x].B[0] u* Rs2.W[x].B[0]);
 * Rd.W[x] = res[x];
 * for RV32: x=0,
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  t    unsigned long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UMAQA(unsigned long t, unsigned long a, unsigned long b)
{
    /*
     * "+r"(t): t is both an input and the output of the instruction, so the
     * incoming value is consumed and then overwritten in place.
     * Operand mapping: %0 -> t, %1 -> a, %2 -> b.
     */
    __ASM volatile(
        "umaqa %0, %1, %2"
        : "+r"(t)
        : "r"(a), "r"(b)
    );
    return t;
}
/* ===== Inline Function End for 3.166. UMAQA ===== */

/* ===== Inline Function Start for 3.167. UMAX8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
 * \brief UMAX8 (SIMD 8-bit Unsigned Maximum)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UMAX8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit unsigned integer elements finding maximum operations simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit
 * unsigned integer elements in Rs2 and, for each pair, selects the element that is greater than the
 * other one. The selected results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = (Rs1.B[x] >u Rs2.B[x])? Rs1.B[x] : Rs2.B[x];
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UMAX8(unsigned long a, unsigned long b)
{
    unsigned long maxv; /* written only by the asm statement below */

    /* Operand mapping: %0 -> maxv (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "umax8 %0, %1, %2"
        : "=r"(maxv)
        : "r"(a), "r"(b)
    );
    return maxv;
}
/* ===== Inline Function End for 3.167. UMAX8 ===== */

/* ===== Inline Function Start for 3.168. UMAX16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
 * \brief UMAX16 (SIMD 16-bit Unsigned Maximum)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UMAX16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit unsigned integer elements finding maximum operations simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit
 * unsigned integer elements in Rs2 and, for each pair, selects the element that is greater than the other one. The
 * selected results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] >u Rs2.H[x])? Rs1.H[x] : Rs2.H[x];
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UMAX16(unsigned long a, unsigned long b)
{
    unsigned long maxv; /* written only by the asm statement below */

    /* Operand mapping: %0 -> maxv (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "umax16 %0, %1, %2"
        : "=r"(maxv)
        : "r"(a), "r"(b)
    );
    return maxv;
}
/* ===== Inline Function End for 3.168. UMAX16 ===== */

/* ===== Inline Function Start for 3.169. UMIN8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
 * \brief UMIN8 (SIMD 8-bit Unsigned Minimum)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UMIN8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit unsigned integer elements finding minimum operations simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit
 * unsigned integer elements in Rs2 and, for each pair, selects the element that is less than the other one. The
 * selected results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = (Rs1.B[x] <u Rs2.B[x])? Rs1.B[x] : Rs2.B[x];
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UMIN8(unsigned long a, unsigned long b)
{
    unsigned long minv; /* written only by the asm statement below */

    /* Operand mapping: %0 -> minv (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "umin8 %0, %1, %2"
        : "=r"(minv)
        : "r"(a), "r"(b)
    );
    return minv;
}
/* ===== Inline Function End for 3.169. UMIN8 ===== */

/* ===== Inline Function Start for 3.170. UMIN16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
 * \brief UMIN16 (SIMD 16-bit Unsigned Minimum)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UMIN16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit unsigned integer elements finding minimum operations simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit
 * unsigned integer elements in Rs2 and, for each pair, selects the element that is less than the other one. The
 * selected results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] <u Rs2.H[x])? Rs1.H[x] : Rs2.H[x];
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UMIN16(unsigned long a, unsigned long b)
{
    unsigned long minv; /* written only by the asm statement below */

    /* Operand mapping: %0 -> minv (output), %1 -> a, %2 -> b. */
    __ASM volatile(
        "umin16 %0, %1, %2"
        : "=r"(minv)
        : "r"(a), "r"(b)
    );
    return minv;
}
/* ===== Inline Function End for 3.170. UMIN16 ===== */

/* ===== Inline Function Start for 3.171. UMSR64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
 * \brief UMSR64 (Unsigned Multiply and Subtract from 64-Bit Data)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * UMSR64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the 32-bit unsigned elements in two registers and subtract the 64-bit
 * multiplication results from the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64).
 * The result is written back to the pair of registers (RV32) or a register (RV64).
 *
 * **RV32 Description**:\n
 * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It
 * subtracts the 64-bit multiplication result from the 64-bit unsigned data of an even/odd pair of
 * registers specified by Rd(4,1). The subtraction result is written back to the even/odd pair of registers
 * specified by Rd(4,1).
 * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2.
 * It subtracts the 64-bit multiplication results from the 64-bit unsigned data of Rd. The subtraction
 * result is written back to Rd.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * R[t_H].R[t_L] = R[t_H].R[t_L] - (Rs1 * Rs2);
 * * RV64:
 * Rd = Rd - (Rs1.W[0] u* Rs2.W[0]) - (Rs1.W[1] u* Rs2.W[1]);
 * ~~~
 *
 * \param [in]  t    unsigned long long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_UMSR64(unsigned long long t, unsigned long a, unsigned long b)
{
    /* Read-write ("+r") operand: carries t into `umsr64` and receives the updated value. */
    unsigned long long acc = t;

    __ASM volatile("umsr64 %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 3.171. UMSR64 ===== */

/* ===== Inline Function Start for 3.172.1. UMUL8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
 * \brief UMUL8 (SIMD Unsigned 8-bit Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UMUL8 Rd, Rs1, Rs2
 * UMULX8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do unsigned 8-bit multiplications and generate four 16-bit results simultaneously.
 *
 * **RV32 Description**:\n
 * For the `UMUL8` instruction, multiply the unsigned 8-bit data elements of Rs1
 * with the corresponding unsigned 8-bit data elements of Rs2.
 * For the `UMULX8` instruction, multiply the first and second unsigned 8-bit data elements of Rs1
 * with the second and first unsigned 8-bit data elements of Rs2. At the same time, multiply the third
 * and fourth unsigned 8-bit data elements of Rs1 with the fourth and third unsigned 8-bit data
 * elements of Rs2.
 * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1).
 * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of
 * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom
 * part of Rs1.
 *
 * **RV64 Description**:\n
 * For the `UMUL8` instruction, multiply the unsigned 8-bit data elements of Rs1
 * with the corresponding unsigned 8-bit data elements of Rs2.
 * For the `UMULX8` instruction, multiply the first and second unsigned 8-bit data elements of Rs1
 * with the second and first unsigned 8-bit data elements of Rs2. At the same time, multiply the third
 * and fourth unsigned 8-bit data elements of Rs1 with the fourth and third unsigned 8-bit data
 * elements of Rs2.
 * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results
 * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from
 * the bottom part of Rs1.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * if (is `UMUL8`) {
 *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
 *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
 * } else if (is `UMULX8`) {
 *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
 *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
 * }
 * rest[x/2] = op1t[x/2] u* op2t[x/2];
 * resb[x/2] = op1b[x/2] u* op2b[x/2];
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1];
 * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0];
 * x = 0 and 2
 * * RV64:
 * if (is `UMUL8`) {
 *     op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
 *     op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
 * } else if (is `UMULX8`) {
 *     op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
 *     op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
 * }
 * rest[x/2] = op1t[x/2] u* op2t[x/2];
 * resb[x/2] = op1b[x/2] u* op2b[x/2];
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1];
 * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0];
 * x = 0 and 2
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_UMUL8(unsigned int a, unsigned int b)
{
    /* unsigned long long output operand that receives the packed products from `umul8`. */
    unsigned long long products;

    __ASM volatile("umul8 %0, %1, %2"
                   : "=r"(products)
                   : "r"(a), "r"(b));
    return products;
}
/* ===== Inline Function End for 3.172.1. UMUL8 ===== */

/* ===== Inline Function Start for 3.172.2. UMULX8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
 * \brief UMULX8 (SIMD Unsigned Crossed 8-bit Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UMUL8 Rd, Rs1, Rs2
 * UMULX8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do unsigned 8-bit multiplications and generate four 16-bit results simultaneously.
 *
 * **RV32 Description**:\n
 * For the `UMUL8` instruction, multiply the unsigned 8-bit data elements of Rs1
 * with the corresponding unsigned 8-bit data elements of Rs2.
 * For the `UMULX8` instruction, multiply the first and second unsigned 8-bit data elements of Rs1
 * with the second and first unsigned 8-bit data elements of Rs2. At the same time, multiply the third
 * and fourth unsigned 8-bit data elements of Rs1 with the fourth and third unsigned 8-bit data
 * elements of Rs2.
 * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1).
 * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of
 * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom
 * part of Rs1.
 *
 * **RV64 Description**:\n
 * For the `UMUL8` instruction, multiply the unsigned 8-bit data elements of Rs1
 * with the corresponding unsigned 8-bit data elements of Rs2.
 * For the `UMULX8` instruction, multiply the first and second unsigned 8-bit data elements of Rs1
 * with the second and first unsigned 8-bit data elements of Rs2. At the same time, multiply the third
 * and fourth unsigned 8-bit data elements of Rs1 with the fourth and third unsigned 8-bit data
 * elements of Rs2.
 * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results
 * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from
 * the bottom part of Rs1.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * if (is `UMUL8`) {
 *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
 *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
 * } else if (is `UMULX8`) {
 *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
 *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
 * }
 * rest[x/2] = op1t[x/2] u* op2t[x/2];
 * resb[x/2] = op1b[x/2] u* op2b[x/2];
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1];
 * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0];
 * x = 0 and 2
 * * RV64:
 * if (is `UMUL8`) {
 *     op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
 *     op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
 * } else if (is `UMULX8`) {
 *     op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
 *     op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
 * }
 * rest[x/2] = op1t[x/2] u* op2t[x/2];
 * resb[x/2] = op1b[x/2] u* op2b[x/2];
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1];
 * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0];
 * x = 0 and 2
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_UMULX8(unsigned int a, unsigned int b)
{
    /* unsigned long long output operand that receives the packed products from `umulx8`. */
    unsigned long long products;

    __ASM volatile("umulx8 %0, %1, %2"
                   : "=r"(products)
                   : "r"(a), "r"(b));
    return products;
}
/* ===== Inline Function End for 3.172.2. UMULX8 ===== */

/* ===== Inline Function Start for 3.173.1. UMUL16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
 * \brief UMUL16 (SIMD Unsigned 16-bit Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UMUL16 Rd, Rs1, Rs2
 * UMULX16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do unsigned 16-bit multiplications and generate two 32-bit results simultaneously.
 *
 * **RV32 Description**:\n
 * For the `UMUL16` instruction, multiply the top 16-bit U16 content of Rs1 with
 * the top 16-bit U16 content of Rs2. At the same time, multiply the bottom 16-bit U16 content of Rs1
 * with the bottom 16-bit U16 content of Rs2.
 * For the `UMULX16` instruction, multiply the top 16-bit U16 content of Rs1 with the bottom 16-bit
 * U16 content of Rs2. At the same time, multiply the bottom 16-bit U16 content of Rs1 with the top 16-
 * bit U16 content of Rs2.
 * The two U32 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1),
 * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes
 * register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and
 * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1.
 *
 * **RV64 Description**:\n
 * For the `UMUL16` instruction, multiply the top 16-bit U16 content of the lower
 * 32-bit word in Rs1 with the top 16-bit U16 content of the lower 32-bit word in Rs2. At the same time,
 * multiply the bottom 16-bit U16 content of the lower 32-bit word in Rs1 with the bottom 16-bit U16
 * content of the lower 32-bit word in Rs2.
 * For the `UMULX16` instruction, multiply the top 16-bit U16 content of the lower 32-bit word in Rs1
 * with the bottom 16-bit U16 content of the lower 32-bit word in Rs2. At the same time, multiply the
 * bottom 16-bit U16 content of the lower 32-bit word in Rs1 with the top 16-bit U16 content of the
 * lower 32-bit word in Rs2.
 * The two 32-bit U32 results are then written into Rd. The result calculated from the top 16-bit of the
 * lower 32-bit word in Rs1 is written to Rd.W[1], and the result calculated from the bottom 16-bit of
 * the lower 32-bit word in Rs1 is written to Rd.W[0].
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * if (is `UMUL16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
 * } else if (is `UMULX16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
 * }
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   res = aop u* bop;
 * }
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * R[t_H] = rest;
 * R[t_L] = resb;
 * * RV64:
 * if (is `UMUL16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
 * } else if (is `UMULX16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
 * }
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   res = aop u* bop;
 * }
 * Rd.W[1] = rest;
 * Rd.W[0] = resb;
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_UMUL16(unsigned int a, unsigned int b)
{
    /* unsigned long long output operand that receives both products from `umul16`. */
    unsigned long long products;

    __ASM volatile("umul16 %0, %1, %2"
                   : "=r"(products)
                   : "r"(a), "r"(b));
    return products;
}
/* ===== Inline Function End for 3.173.1. UMUL16 ===== */

/* ===== Inline Function Start for 3.173.2. UMULX16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
 * \brief UMULX16 (SIMD Unsigned Crossed 16-bit Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * UMUL16 Rd, Rs1, Rs2
 * UMULX16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do unsigned 16-bit multiplications and generate two 32-bit results simultaneously.
 *
 * **RV32 Description**:\n
 * For the `UMUL16` instruction, multiply the top 16-bit U16 content of Rs1 with
 * the top 16-bit U16 content of Rs2. At the same time, multiply the bottom 16-bit U16 content of Rs1
 * with the bottom 16-bit U16 content of Rs2.
 * For the `UMULX16` instruction, multiply the top 16-bit U16 content of Rs1 with the bottom 16-bit
 * U16 content of Rs2. At the same time, multiply the bottom 16-bit U16 content of Rs1 with the top 16-
 * bit U16 content of Rs2.
 * The two U32 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1),
 * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes
 * register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and
 * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1.
 *
 * **RV64 Description**:\n
 * For the `UMUL16` instruction, multiply the top 16-bit U16 content of the lower
 * 32-bit word in Rs1 with the top 16-bit U16 content of the lower 32-bit word in Rs2. At the same time,
 * multiply the bottom 16-bit U16 content of the lower 32-bit word in Rs1 with the bottom 16-bit U16
 * content of the lower 32-bit word in Rs2.
 * For the `UMULX16` instruction, multiply the top 16-bit U16 content of the lower 32-bit word in Rs1
 * with the bottom 16-bit U16 content of the lower 32-bit word in Rs2. At the same time, multiply the
 * bottom 16-bit U16 content of the lower 32-bit word in Rs1 with the top 16-bit U16 content of the
 * lower 32-bit word in Rs2.
 * The two 32-bit U32 results are then written into Rd. The result calculated from the top 16-bit of the
 * lower 32-bit word in Rs1 is written to Rd.W[1], and the result calculated from the bottom 16-bit of
 * the lower 32-bit word in Rs1 is written to Rd.W[0].
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * if (is `UMUL16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
 * } else if (is `UMULX16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
 * }
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   res = aop u* bop;
 * }
 * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
 * R[t_H] = rest;
 * R[t_L] = resb;
 * * RV64:
 * if (is `UMUL16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
 * } else if (is `UMULX16`) {
 *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
 *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
 * }
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   res = aop u* bop;
 * }
 * Rd.W[1] = rest;
 * Rd.W[0] = resb;
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_UMULX16(unsigned int a, unsigned int b)
{
    /* unsigned long long output operand that receives both products from `umulx16`. */
    unsigned long long products;

    __ASM volatile("umulx16 %0, %1, %2"
                   : "=r"(products)
                   : "r"(a), "r"(b));
    return products;
}
/* ===== Inline Function End for 3.173.2. UMULX16 ===== */

/* ===== Inline Function Start for 3.174. URADD8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
 * \brief URADD8 (SIMD 8-bit Unsigned Halving Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * URADD8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit unsigned integer element additions simultaneously. The results are halved to
 * avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 8-bit unsigned integer elements in Rs1 with the 8-bit
 * unsigned integer elements in Rs2. The results are first logically right-shifted by 1 bit and then
 * written to Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Ra = 0x7F, Rb = 0x7F, Rt = 0x7F
 * * Ra = 0x80, Rb = 0x80, Rt = 0x80
 * * Ra = 0x40, Rb = 0x80, Rt = 0x60
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = (Rs1.B[x] + Rs2.B[x]) u>> 1;
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URADD8(unsigned long a, unsigned long b)
{
    /* Output operand of `uradd8`; assigned only by the instruction below. */
    unsigned long rd;

    __ASM volatile("uradd8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.174. URADD8 ===== */

/* ===== Inline Function Start for 3.175. URADD16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief URADD16 (SIMD 16-bit Unsigned Halving Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * URADD16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit unsigned integer element additions simultaneously. The results are halved to
 * avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 16-bit unsigned integer elements in Rs1 with the 16-bit
 * unsigned integer elements in Rs2. The results are first logically right-shifted by 1 bit and then
 * written to Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Ra = 0x7FFF, Rb = 0x7FFF Rt = 0x7FFF
 * * Ra = 0x8000, Rb = 0x8000 Rt = 0x8000
 * * Ra = 0x4000, Rb = 0x8000 Rt = 0x6000
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] + Rs2.H[x]) u>> 1;
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URADD16(unsigned long a, unsigned long b)
{
    /* Output operand of `uradd16`; assigned only by the instruction below. */
    unsigned long rd;

    __ASM volatile("uradd16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.175. URADD16 ===== */

/* ===== Inline Function Start for 3.176. URADD64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
 * \brief URADD64 (64-bit Unsigned Halving Addition)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * URADD64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Add two 64-bit unsigned integers. The result is halved to avoid overflow or saturation.
 *
 * **RV32 Description**:\n
 * This instruction adds the 64-bit unsigned integer of an even/odd pair of registers
 * specified by Rs1(4,1) with the 64-bit unsigned integer of an even/odd pair of registers specified by
 * Rs2(4,1). The 64-bit addition result is first logically right-shifted by 1 bit and then written to an
 * even/odd pair of registers specified by Rd(4,1).
 * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction adds the 64-bit unsigned integer in Rs1 with the 64-bit unsigned
 * integer in Rs2. The 64-bit addition result is first logically right-shifted by 1 bit and then written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * t_L = CONCAT(Rt(4,1),1'b0); t_H = CONCAT(Rt(4,1),1'b1);
 * a_L = CONCAT(Ra(4,1),1'b0); a_H = CONCAT(Ra(4,1),1'b1);
 * b_L = CONCAT(Rb(4,1),1'b0); b_H = CONCAT(Rb(4,1),1'b1);
 * R[t_H].R[t_L] = (R[a_H].R[a_L] + R[b_H].R[b_L]) u>> 1;
 * * RV64:
 * Rd = (Rs1 + Rs2) u>> 1;
 * ~~~
 *
 * \param [in]  a    unsigned long long type of value stored in a
 * \param [in]  b    unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_URADD64(unsigned long long a, unsigned long long b)
{
    /* unsigned long long output operand of `uradd64`. */
    unsigned long long rd;

    __ASM volatile("uradd64 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.176. URADD64 ===== */

/* ===== Inline Function Start for 3.177. URADDW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
 * \brief URADDW (32-bit Unsigned Halving Addition)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * URADDW Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Add 32-bit unsigned integers and the results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the first 32-bit unsigned integer in Rs1 with the first 32-bit
 * unsigned integer in Rs2. The result is first logically right-shifted by 1 bit and then sign-extended and
 * written to Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Ra = 0x7FFFFFFF, Rb = 0x7FFFFFFF Rt = 0x7FFFFFFF
 * * Ra = 0x80000000, Rb = 0x80000000 Rt = 0x80000000
 * * Ra = 0x40000000, Rb = 0x80000000 Rt = 0x60000000
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * Rd[31:0] = (Rs1[31:0] + Rs2[31:0]) u>> 1;
 * * RV64:
 * resw[31:0] = (Rs1[31:0] + Rs2[31:0]) u>> 1;
 * Rd[63:0] = SE(resw[31:0]);
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URADDW(unsigned int a, unsigned int b)
{
    /* Output operand of `uraddw`; declared unsigned long although the inputs are unsigned int. */
    unsigned long rd;

    __ASM volatile("uraddw %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.177. URADDW ===== */

/* ===== Inline Function Start for 3.178. URCRAS16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief URCRAS16 (SIMD 16-bit Unsigned Halving Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * URCRAS16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit unsigned integer element addition and 16-bit unsigned integer element
 * subtraction in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
 * The results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1
 * with the 16-bit unsigned integer in [15:0] of 32-bit chunks in Rs2, and subtracts the 16-bit unsigned
 * integer in [31:16] of 32-bit chunks in Rs2 from the 16-bit unsigned integer in [15:0] of 32-bit chunks
 * in Rs1. The element results are first logically right-shifted by 1 bit and then written to [31:16] of 32-
 * bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
 *
 * **Examples**:\n
 * ~~~
 * Please see `URADD16` and `URSUB16` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][15:0]) u>> 1;
 * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][31:16]) u>> 1;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URCRAS16(unsigned long a, unsigned long b)
{
    /* Output operand of `urcras16`; assigned only by the instruction below. */
    unsigned long rd;

    __ASM volatile("urcras16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.178. URCRAS16 ===== */

/* ===== Inline Function Start for 3.179. URCRSA16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief URCRSA16 (SIMD 16-bit Unsigned Halving Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * URCRSA16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit unsigned integer element subtraction and 16-bit unsigned integer element
 * addition in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
 * The results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit unsigned integer in [15:0] of 32-bit chunks in Rs2
 * from the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit unsigned
 * integer in [15:0] of 32-bit chunks in Rs1 with the 16-bit unsigned integer in [31:16] of 32-bit chunks
 * in Rs2. The two results are first logically right-shifted by 1 bit and then written to [31:16] of 32-bit
 * chunks in Rd and [15:0] of 32-bit chunks in Rd.
 *
 * **Examples**:\n
 * ~~~
 * Please see `URADD16` and `URSUB16` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][15:0]) u>> 1;
 * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][31:16]) u>> 1;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URCRSA16(unsigned long a, unsigned long b)
{
    /* Output operand of `urcrsa16`; assigned only by the instruction below. */
    unsigned long rd;

    __ASM volatile("urcrsa16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.179. URCRSA16 ===== */

/* ===== Inline Function Start for 3.180. URSTAS16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief URSTAS16 (SIMD 16-bit Unsigned Halving Straight Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * URSTAS16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit unsigned integer element addition and 16-bit unsigned integer element
 * subtraction in a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit
 * chunks. The results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1
 * with the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs2, and subtracts the 16-bit unsigned
 * integer in [15:0] of 32-bit chunks in Rs2 from the 16-bit unsigned integer in [15:0] of 32-bit chunks
 * in Rs1. The element results are first logically right-shifted by 1 bit and then written to [31:16] of 32-
 * bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
 *
 * **Examples**:\n
 * ~~~
 * Please see `URADD16` and `URSUB16` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][31:16]) u>> 1;
 * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][15:0]) u>> 1;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URSTAS16(unsigned long a, unsigned long b)
{
    /* Output operand of `urstas16`; assigned only by the instruction below. */
    unsigned long rd;

    __ASM volatile("urstas16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.180. URSTAS16 ===== */

/* ===== Inline Function Start for 3.181. URSTSA16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief URSTSA16 (SIMD 16-bit Unsigned Halving Straight Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * URSTSA16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit unsigned integer element subtraction and 16-bit unsigned integer element
 * addition in a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit
 * chunks. The results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs2
 * from the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit unsigned
 * integer in [15:0] of 32-bit chunks in Rs1 with the 16-bit unsigned integer in [15:0] of 32-bit chunks in
 * Rs2. The two results are first logically right-shifted by 1 bit and then written to [31:16] of 32-bit
 * chunks in Rd and [15:0] of 32-bit chunks in Rd.
 *
 * **Examples**:\n
 * ~~~
 * Please see `URADD16` and `URSUB16` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][31:16]) u>> 1;
 * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][15:0]) u>> 1;
 * for RV32, x=0
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URSTSA16(unsigned long a, unsigned long b)
{
    /* Output operand of `urstsa16`; assigned only by the instruction below. */
    unsigned long rd;

    __ASM volatile("urstsa16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.181. URSTSA16 ===== */

/* ===== Inline Function Start for 3.182. URSUB8 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
 * \brief URSUB8 (SIMD 8-bit Unsigned Halving Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * URSUB8 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit unsigned integer element subtractions simultaneously. The results are halved to
 * avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 8-bit unsigned integer elements in Rs2 from the 8-bit
 * unsigned integer elements in Rs1. The results are first logically right-shifted by 1 bit and then
 * written to Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Ra = 0x7F, Rb = 0x80 Rt = 0xFF
 * * Ra = 0x80, Rb = 0x7F Rt = 0x00
 * * Ra = 0x80, Rb = 0x40 Rt = 0x20
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.B[x] = (Rs1.B[x] - Rs2.B[x]) u>> 1;
 * for RV32: x=3...0,
 * for RV64: x=7...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URSUB8(unsigned long a, unsigned long b)
{
    /* Output operand of `ursub8`; assigned only by the instruction below. */
    unsigned long rd;

    __ASM volatile("ursub8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.182. URSUB8 ===== */

/* ===== Inline Function Start for 3.183. URSUB16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
 * \brief URSUB16 (SIMD 16-bit Unsigned Halving Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * URSUB16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit unsigned integer element subtractions simultaneously. The results are halved to
 * avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit unsigned integer elements in Rs2 from the 16-bit
 * unsigned integer elements in Rs1. The results are first logically right-shifted by 1 bit and then
 * written to Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Ra = 0x7FFF, Rb = 0x8000 Rt = 0xFFFF
 * * Ra = 0x8000, Rb = 0x7FFF Rt = 0x0000
 * * Ra = 0x8000, Rb = 0x4000 Rt = 0x2000
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] - Rs2.H[x]) u>> 1;
 * for RV32: x=1...0,
 * for RV64: x=3...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URSUB16(unsigned long a, unsigned long b)
{
    /* Output operand of `ursub16`; assigned only by the instruction below. */
    unsigned long rd;

    __ASM volatile("ursub16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.183. URSUB16 ===== */

/* ===== Inline Function Start for 3.184. URSUB64 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
 * \brief URSUB64 (64-bit Unsigned Halving Subtraction)
 * \details
 * **Type**: DSP (64-bit Profile)
 *
 * **Syntax**:\n
 * ~~~
 * URSUB64 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Perform a 64-bit unsigned integer subtraction. The result is halved to avoid overflow or
 * saturation.
 *
 * **RV32 Description**:\n
 * This instruction subtracts the 64-bit unsigned integer of an even/odd pair of
 * registers specified by Rs2(4,1) from the 64-bit unsigned integer of an even/odd pair of registers
 * specified by Rs1(4,1). The subtraction result is first logically right-shifted by 1 bit and then written
 * to an even/odd pair of registers specified by Rd(4,1).
 * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
 * includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
 * of the pair contains the low 32-bit of the result.
 *
 * **RV64 Description**:\n
 * This instruction subtracts the 64-bit unsigned integer in Rs2 from the 64-bit
 * unsigned integer in Rs1. The subtraction result is first logically right-shifted by 1 bit and then
 * written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * t_L = CONCAT(Rt(4,1),1'b0); t_H = CONCAT(Rt(4,1),1'b1);
 * a_L = CONCAT(Ra(4,1),1'b0); a_H = CONCAT(Ra(4,1),1'b1);
 * b_L = CONCAT(Rb(4,1),1'b0); b_H = CONCAT(Rb(4,1),1'b1);
 * R[t_H].R[t_L] = (R[a_H].R[a_L] - R[b_H].R[b_L]) u>> 1;
 * * RV64:
 * Rd = (Rs1 - Rs2) u>> 1;
 * ~~~
 *
 * \param [in]  a    unsigned long long type of value stored in a
 * \param [in]  b    unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_URSUB64(unsigned long long a, unsigned long long b)
{
    unsigned long long diff_halved;    /* 64-bit destination operand */

    /* All three operands are 64-bit values passed through plain "r" constraints. */
    __ASM volatile("ursub64 %0, %1, %2"
                   : "=r"(diff_halved)
                   : "r"(a), "r"(b));
    return diff_halved;
}
/* ===== Inline Function End for 3.184. URSUB64 ===== */

/* ===== Inline Function Start for 3.185. URSUBW ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
 * \brief URSUBW (32-bit Unsigned Halving Subtraction)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * URSUBW Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Subtract 32-bit unsigned integers and the result is halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the first 32-bit unsigned integer in Rs2 from the first 32-bit
 * unsigned integer in Rs1. The result is first logically right-shifted by 1 bit and then sign-extended and
 * written to Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Ra = 0x7FFFFFFF, Rb = 0x80000000 Rt = 0xFFFFFFFF
 * * Ra = 0x80000000, Rb = 0x7FFFFFFF Rt = 0x00000000
 * * Ra = 0x80000000, Rb = 0x40000000 Rt = 0x20000000
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * Rd[31:0] = (Rs1[31:0] - Rs2[31:0]) u>> 1;
 * * RV64:
 * resw[31:0] = (Rs1[31:0] - Rs2[31:0]) u>> 1;
 * Rd[63:0] = SE(resw[31:0]);
 * ~~~
 *
 * \param [in]  a    unsigned int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URSUBW(unsigned int a, unsigned int b)
{
    unsigned long rd;    /* register-wide destination; inputs are 32-bit values */

    /* Operand mapping: %0 = rd (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("ursubw %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 3.185. URSUBW ===== */

/* ===== Inline Function Start for 3.186. WEXTI ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
 * \brief WEXTI (Extract Word from 64-bit Immediate)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * WEXTI Rd, Rs1, #LSBloc
 * ~~~
 *
 * **Purpose**:\n
 * Extract a 32-bit word from a 64-bit value stored in an even/odd pair of registers (RV32) or
 * a register (RV64) starting from a specified immediate LSB bit position.
 *
 * **RV32 Description**:\n
 * This instruction extracts a 32-bit word from a 64-bit value of an even/odd pair of registers specified
 * by Rs1(4,1) starting from a specified immediate LSB bit position, #LSBloc. The extracted word is
 * written to Rd.
 * Rs1(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register
 * pair includes register 2d and 2d+1.
 * The odd `2d+1` register of the pair contains the high 32-bit of the 64-bit value and the even `2d`
 * register of the pair contains the low 32-bit of the 64-bit value.
 *
 * **RV64 Description**:\n
 * This instruction extracts a 32-bit word from a 64-bit value in Rs1 starting from a specified
 * immediate LSB bit position, #LSBloc. The extracted word is sign-extended and written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * Idx0 = CONCAT(Rs1(4,1),1'b0); Idx1 = CONCAT(Rs1(4,1),1'b1);
 * src[63:0] = Concat(R[Idx1], R[Idx0]);
 * Rd = src[31+LSBloc:LSBloc];
 * * RV64:
 * ExtractW = Rs1[31+LSBloc:LSBloc];
 * Rd = SE(ExtractW)
 * ~~~
 *
 * \param [in]  a    long long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_WEXTI(a, b)    \
    ({    \
        /* Evaluate `a` before declaring any other local, and use */    \
        /* macro-specific names, so that an argument expression that */    \
        /* mentions a caller variable called `result` is not captured */    \
        /* by this macro's own (uninitialized) temporary. */    \
        long long __wexti_src = (long long)(a);    \
        unsigned long __wexti_rd;    \
        /* `b` is passed with the K constraint, so it must be a */    \
        /* compile-time constant LSB position. */    \
        __ASM volatile("wexti %0, %1, %2" : "=r"(__wexti_rd) : "r"(__wexti_src), "K"(b));    \
        __wexti_rd;    \
    })
/* ===== Inline Function End for 3.186. WEXTI ===== */

/* ===== Inline Function Start for 3.187. WEXT ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
 * \brief WEXT (Extract Word from 64-bit)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * WEXT Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Extract a 32-bit word from a 64-bit value stored in an even/odd pair of registers (RV32) or
 * a register (RV64) starting from a specified LSB bit position in a register.
 *
 * **RV32 Description**:\n
 * This instruction extracts a 32-bit word from a 64-bit value of an even/odd pair of registers specified
 * by Rs1(4,1) starting from a specified LSB bit position, specified in Rs2[4:0]. The extracted word is
 * written to Rd.
 * Rs1(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register
 * pair includes register 2d and 2d+1.
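 * The odd `2d+1` register of the pair contains the high 32-bit of the 64-bit value and the even `2d`
 * register of the pair contains the low 32-bit of the 64-bit value.
 *
 * **RV64 Description**:\n
 * This instruction extracts a 32-bit word from a 64-bit value in Rs1 starting from a specified LSB
 * bit position, specified in Rs2[4:0]. The extracted word is sign-extended and written to Rd.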
 *
 * **Operations**:\n
 * ~~~
 * * RV32:
 * Idx0 = CONCAT(Rs1(4,1),1'b0); Idx1 = CONCAT(Rs1(4,1),1'b1);
 * src[63:0] = Concat(R[Idx1], R[Idx0]);
 * LSBloc = Rs2[4:0];
 * Rd = src[31+LSBloc:LSBloc];
 * * RV64:
 * LSBloc = Rs2[4:0];
 * ExtractW = Rs1[31+LSBloc:LSBloc];
 * Rd = SE(ExtractW)
 * ~~~
 *
 * \param [in]  a    long long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_WEXT(long long a, unsigned int b)
{
    unsigned long word;    /* extracted word as returned in the destination register */

    /* a is the 64-bit source value; b is the register operand holding the LSB position. */
    __ASM volatile("wext %0, %1, %2"
                   : "=r"(word)
                   : "r"(a), "r"(b));
    return word;
}
/* ===== Inline Function End for 3.187. WEXT ===== */

/* ===== Inline Function Start for 3.188.1. ZUNPKD810 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
 * \brief ZUNPKD810 (Unsigned Unpacking Bytes 1 & 0)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * ZUNPKD8xy Rd, Rs1
 * xy = {10, 20, 30, 31, 32}
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
 * halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `ZUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into
 * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
 * chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
 * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
 * // ZUNPKD810, x=1,y=0
 * // ZUNPKD820, x=2,y=0
 * // ZUNPKD830, x=3,y=0
 * // ZUNPKD831, x=3,y=1
 * // ZUNPKD832, x=3,y=2
 * for RV32: m=0,
 * for RV64: m=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD810(unsigned long a)
{
    unsigned long unpacked;    /* destination: two zero-extended halfwords per 32-bit chunk */

    /* Single-source form: %0 = unpacked (output), %1 = a (Rs1). */
    __ASM volatile("zunpkd810 %0, %1"
                   : "=r"(unpacked)
                   : "r"(a));
    return unpacked;
}
/* ===== Inline Function End for 3.188.1. ZUNPKD810 ===== */

/* ===== Inline Function Start for 3.188.2. ZUNPKD820 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
 * \brief ZUNPKD820 (Unsigned Unpacking Bytes 2 & 0)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * ZUNPKD8xy Rd, Rs1
 * xy = {10, 20, 30, 31, 32}
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
 * halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `ZUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into
 * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
 * chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
 * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
 * // ZUNPKD810, x=1,y=0
 * // ZUNPKD820, x=2,y=0
 * // ZUNPKD830, x=3,y=0
 * // ZUNPKD831, x=3,y=1
 * // ZUNPKD832, x=3,y=2
 * for RV32: m=0,
 * for RV64: m=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD820(unsigned long a)
{
    unsigned long unpacked;    /* destination: two zero-extended halfwords per 32-bit chunk */

    /* Single-source form: %0 = unpacked (output), %1 = a (Rs1). */
    __ASM volatile("zunpkd820 %0, %1"
                   : "=r"(unpacked)
                   : "r"(a));
    return unpacked;
}
/* ===== Inline Function End for 3.188.2. ZUNPKD820 ===== */

/* ===== Inline Function Start for 3.188.3. ZUNPKD830 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
 * \brief ZUNPKD830 (Unsigned Unpacking Bytes 3 & 0)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * ZUNPKD8xy Rd, Rs1
 * xy = {10, 20, 30, 31, 32}
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
 * halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `ZUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into
 * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
 * chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
 * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
 * // ZUNPKD810, x=1,y=0
 * // ZUNPKD820, x=2,y=0
 * // ZUNPKD830, x=3,y=0
 * // ZUNPKD831, x=3,y=1
 * // ZUNPKD832, x=3,y=2
 * for RV32: m=0,
 * for RV64: m=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD830(unsigned long a)
{
    unsigned long unpacked;    /* destination: two zero-extended halfwords per 32-bit chunk */

    /* Single-source form: %0 = unpacked (output), %1 = a (Rs1). */
    __ASM volatile("zunpkd830 %0, %1"
                   : "=r"(unpacked)
                   : "r"(a));
    return unpacked;
}
/* ===== Inline Function End for 3.188.3. ZUNPKD830 ===== */

/* ===== Inline Function Start for 3.188.4. ZUNPKD831 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
 * \brief ZUNPKD831 (Unsigned Unpacking Bytes 3 & 1)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * ZUNPKD8xy Rd, Rs1
 * xy = {10, 20, 30, 31, 32}
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
 * halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `ZUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into
 * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
 * chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
 * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
 * // ZUNPKD810, x=1,y=0
 * // ZUNPKD820, x=2,y=0
 * // ZUNPKD830, x=3,y=0
 * // ZUNPKD831, x=3,y=1
 * // ZUNPKD832, x=3,y=2
 * for RV32: m=0,
 * for RV64: m=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD831(unsigned long a)
{
    unsigned long unpacked;    /* destination: two zero-extended halfwords per 32-bit chunk */

    /* Single-source form: %0 = unpacked (output), %1 = a (Rs1). */
    __ASM volatile("zunpkd831 %0, %1"
                   : "=r"(unpacked)
                   : "r"(a));
    return unpacked;
}
/* ===== Inline Function End for 3.188.4. ZUNPKD831 ===== */

/* ===== Inline Function Start for 3.188.5. ZUNPKD832 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
 * \brief ZUNPKD832 (Unsigned Unpacking Bytes 3 & 2)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * ZUNPKD8xy Rd, Rs1
 * xy = {10, 20, 30, 31, 32}
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
 * halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `ZUNPKD8(x)(*y*)` instruction, it unpacks byte *x and byte y* of 32-bit chunks in Rs1 into
 * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
 * chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
 * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
 * // ZUNPKD810, x=1,y=0
 * // ZUNPKD820, x=2,y=0
 * // ZUNPKD830, x=3,y=0
 * // ZUNPKD831, x=3,y=1
 * // ZUNPKD832, x=3,y=2
 * for RV32: m=0,
 * for RV64: m=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD832(unsigned long a)
{
    unsigned long unpacked;    /* destination: two zero-extended halfwords per 32-bit chunk */

    /* Single-source form: %0 = unpacked (output), %1 = a (Rs1). */
    __ASM volatile("zunpkd832 %0, %1"
                   : "=r"(unpacked)
                   : "r"(a));
    return unpacked;
}
/* ===== Inline Function End for 3.188.5. ZUNPKD832 ===== */

#if (__RISCV_XLEN == 64) || defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__)

/* ===== Inline Function Start for 4.1. ADD32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief ADD32 (SIMD 32-bit Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * ADD32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit integer element additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 32-bit integer elements in Rs1 with the 32-bit integer
 * elements in Rs2, and then writes the 32-bit element results to Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned addition.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = Rs1.W[x] + Rs2.W[x];
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_ADD32(unsigned long a, unsigned long b)
{
    unsigned long sums;    /* destination: packed 32-bit element sums */

    /* Operand mapping: %0 = sums (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("add32 %0, %1, %2"
                   : "=r"(sums)
                   : "r"(a), "r"(b));
    return sums;
}
/* ===== Inline Function End for 4.1. ADD32 ===== */

/* ===== Inline Function Start for 4.2. CRAS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief CRAS32 (SIMD 32-bit Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * CRAS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit integer element addition and 32-bit integer element subtraction in a 64-bit
 * chunk simultaneously. Operands are from crossed 32-bit elements.
 *
 * **Description**:\n
 * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit
 * integer element in [31:0] of Rs2, and writes the result to [63:32] of Rd; at the same time, it subtracts
 * the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer element in [31:0] of Rs1, and
 * writes the result to [31:0] of Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned operations.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = Rs1.W[1] + Rs2.W[0];
 * Rd.W[0] = Rs1.W[0] - Rs2.W[1];
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CRAS32(unsigned long a, unsigned long b)
{
    unsigned long rd;    /* destination: upper word holds the sum, lower word the difference */

    /* Operand mapping: %0 = rd (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("cras32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 4.2. CRAS32 ===== */

/* ===== Inline Function Start for 4.3. CRSA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief CRSA32 (SIMD 32-bit Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * CRSA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit integer element subtraction and 32-bit integer element addition in a 64-bit
 * chunk simultaneously. Operands are from crossed 32-bit elements.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element
 * in [63:32] of Rs1, and writes the result to [63:32] of Rd; at the same time, it adds the 32-bit integer
 * element in [31:0] of Rs1 with the 32-bit integer element in [63:32] of Rs2, and writes the result to
 * [31:0] of Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned operations.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = Rs1.W[1] - Rs2.W[0];
 * Rd.W[0] = Rs1.W[0] + Rs2.W[1];
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_CRSA32(unsigned long a, unsigned long b)
{
    unsigned long rd;    /* destination: upper word holds the difference, lower word the sum */

    /* Operand mapping: %0 = rd (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("crsa32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 4.3. CRSA32 ===== */

/* ===== Inline Function Start for 4.4. KABS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
 * \brief KABS32 (Scalar 32-bit Absolute Value with Saturation)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Format**:\n
 * ~~~
 * | 24    20 | 19    15 | 14    12 | 11     7 | 6      0 |
 * |  KABS32  |   Rs1    |   000    |    Rd    |  GE80B   |
 * |  10010   |          |          |          | 1111111  |
 * ~~~
 *
 * **Syntax**:\n
 * ~~~
 * KABS32 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Get the absolute value of signed 32-bit integer elements in a general register.
 *
 * **Description**:\n
 * This instruction calculates the absolute value of signed 32-bit integer elements stored
 * in Rs1. The results are written to Rd. This instruction with the minimum negative integer input of
 * 0x80000000 will produce a saturated output of maximum positive integer of 0x7fffffff and the OV
 * flag will be set to 1.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs1.W[x] >= 0) {
 *   res[x] = Rs1.W[x];
 * } else {
 *   If (Rs1.W[x] == 0x80000000) {
 *     res[x] = 0x7fffffff;
 *     OV = 1;
 *   } else {
 *     res[x] = -Rs1.W[x];
 *   }
 * }
 * Rd.W[x] = res[x];
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KABS32(unsigned long a)
{
    unsigned long magnitudes;    /* destination: per-word absolute values */

    /* Single-source form: %0 = magnitudes (output), %1 = a (Rs1). */
    __ASM volatile("kabs32 %0, %1"
                   : "=r"(magnitudes)
                   : "r"(a));
    return magnitudes;
}
/* ===== Inline Function End for 4.4. KABS32 ===== */

/* ===== Inline Function Start for 4.5. KADD32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief KADD32 (SIMD 32-bit Signed Saturating Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KADD32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element saturating additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 32-bit signed integer elements in Rs1 with the 32-bit signed
 * integer elements in Rs2. If any of the results are beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1),
 * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.W[x] + Rs2.W[x];
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KADD32(unsigned long a, unsigned long b)
{
    unsigned long sat_sums;    /* destination: packed saturated 32-bit sums */

    /* Operand mapping: %0 = sat_sums (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("kadd32 %0, %1, %2"
                   : "=r"(sat_sums)
                   : "r"(a), "r"(b));
    return sat_sums;
}
/* ===== Inline Function End for 4.5. KADD32 ===== */

/* ===== Inline Function Start for 4.6. KCRAS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief KCRAS32 (SIMD 32-bit Signed Saturating Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KCRAS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element saturating addition and 32-bit signed integer element
 * saturating subtraction in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements.
 *
 * **Description**:\n
 * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit
 * integer element in [31:0] of Rs2; at the same time, it subtracts the 32-bit integer element in [63:32] of
 * Rs2 from the 32-bit integer element in [31:0] of Rs1. If any of the results are beyond the Q31 number
 * range (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated
 * results are written to [63:32] of Rd for addition and [31:0] of Rd for subtraction.
 *
 * **Operations**:\n
 * ~~~
 * res[1] = Rs1.W[1] + Rs2.W[0];
 * res[0] = Rs1.W[0] - Rs2.W[1];
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[1] = res[1];
 * Rd.W[0] = res[0];
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KCRAS32(unsigned long a, unsigned long b)
{
    unsigned long rd;    /* destination: saturated sum in the upper word, saturated difference in the lower */

    /* Operand mapping: %0 = rd (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("kcras32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 4.6. KCRAS32 ===== */

/* ===== Inline Function Start for 4.7. KCRSA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief KCRSA32 (SIMD 32-bit Signed Saturating Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KCRSA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element saturating subtraction and 32-bit signed integer element
 * saturating addition in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element
 * in [63:32] of Rs1; at the same time, it adds the 32-bit integer element in [31:0] of Rs1 with the 32-bit
 * integer element in [63:32] of Rs2. If any of the results are beyond the Q31 number range (-2^31 <= Q31
 * <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to
 * [63:32] of Rd for subtraction and [31:0] of Rd for addition.
 *
 * **Operations**:\n
 * ~~~
 * res[1] = Rs1.W[1] - Rs2.W[0];
 * res[0] = Rs1.W[0] + Rs2.W[1];
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[1] = res[1];
 * Rd.W[0] = res[0];
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KCRSA32(unsigned long a, unsigned long b)
{
    unsigned long rd;    /* destination: saturated difference in the upper word, saturated sum in the lower */

    /* Operand mapping: %0 = rd (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("kcrsa32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for 4.7. KCRSA32 ===== */

/* ===== Inline Function Start for 4.8.1. KDMBB16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
 * \brief KDMBB16 (SIMD Signed Saturating Double Multiply B16 x B16)
 * \details
 * **Type**: SIMD (RV64 only)
 *
 * **Syntax**:\n
 * ~~~
 * KDMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the 32-bit chunks in registers and then double and saturate the Q31 results into the 32-bit chunks
 * in the destination register. If saturation happens, an overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
 * or bottom 16-bit Q15 content of the 32-bit portions in Rs2. The Q30 results are then doubled and
 * saturated into Q31 values. The Q31 values are then written into the 32-bit chunks in Rd. When both
 * the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated to 0x7FFFFFFF
 * and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * // KDMBB16: (x,y,z)=(0,0,0),(2,2,1)
 * // KDMBT16: (x,y,z)=(0,1,0),(2,3,1)
 * // KDMTT16: (x,y,z)=(1,1,0),(3,3,1)
 * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
 * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
 *   Mresult[z] = aop[z] * bop[z];
 *   resQ31[z] = Mresult[z] << 1;
 * } else {
 *   resQ31[z] = 0x7FFFFFFF;
 *   OV = 1;
 * }
 * Rd.W[z] = resQ31[z];
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KDMBB16(unsigned long a, unsigned long b)
{
    unsigned long q31_pair;    /* destination: one doubled, saturated product per 32-bit chunk */

    /* Operand mapping: %0 = q31_pair (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("kdmbb16 %0, %1, %2"
                   : "=r"(q31_pair)
                   : "r"(a), "r"(b));
    return q31_pair;
}
/* ===== Inline Function End for 4.8.1. KDMBB16 ===== */

/* ===== Inline Function Start for 4.8.2. KDMBT16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
 * \brief KDMBT16 (SIMD Signed Saturating Double Multiply B16 x T16)
 * \details
 * **Type**: SIMD (RV64 only)
 *
 * **Syntax**:\n
 * ~~~
 * KDMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the 32-bit chunks in registers and then double and saturate the Q31 results into the 32-bit chunks
 * in the destination register. If saturation happens, an overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
 * or bottom 16-bit Q15 content of the 32-bit portions in Rs2. The Q30 results are then doubled and
 * saturated into Q31 values. The Q31 values are then written into the 32-bit chunks in Rd. When both
 * the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated to 0x7FFFFFFF
 * and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * // KDMBB16: (x,y,z)=(0,0,0),(2,2,1)
 * // KDMBT16: (x,y,z)=(0,1,0),(2,3,1)
 * // KDMTT16: (x,y,z)=(1,1,0),(3,3,1)
 * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
 * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
 *   Mresult[z] = aop[z] * bop[z];
 *   resQ31[z] = Mresult[z] << 1;
 * } else {
 *   resQ31[z] = 0x7FFFFFFF;
 *   OV = 1;
 * }
 * Rd.W[z] = resQ31[z];
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KDMBT16(unsigned long a, unsigned long b)
{
    unsigned long q31_pair;    /* destination: one doubled, saturated product per 32-bit chunk */

    /* Operand mapping: %0 = q31_pair (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("kdmbt16 %0, %1, %2"
                   : "=r"(q31_pair)
                   : "r"(a), "r"(b));
    return q31_pair;
}
/* ===== Inline Function End for 4.8.2. KDMBT16 ===== */

/* ===== Inline Function Start for 4.8.3. KDMTT16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
 * \brief KDMTT16 (SIMD Signed Saturating Double Multiply T16 x T16)
 * \details
 * **Type**: SIMD (RV64 only)
 *
 * **Syntax**:\n
 * ~~~
 * KDMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the 32-bit chunks in registers and then double and saturate the Q31 results into the 32-bit chunks
 * in the destination register. If saturation happens, an overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
 * or bottom 16-bit Q15 content of the 32-bit portions in Rs2. The Q30 results are then doubled and
 * saturated into Q31 values. The Q31 values are then written into the 32-bit chunks in Rd. When both
 * the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated to 0x7FFFFFFF
 * and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * // KDMBB16: (x,y,z)=(0,0,0),(2,2,1)
 * // KDMBT16: (x,y,z)=(0,1,0),(2,3,1)
 * // KDMTT16: (x,y,z)=(1,1,0),(3,3,1)
 * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
 * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
 *   Mresult[z] = aop[z] * bop[z];
 *   resQ31[z] = Mresult[z] << 1;
 * } else {
 *   resQ31[z] = 0x7FFFFFFF;
 *   OV = 1;
 * }
 * Rd.W[z] = resQ31[z];
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KDMTT16(unsigned long a, unsigned long b)
{
    unsigned long q31_pair;    /* destination: one doubled, saturated product per 32-bit chunk */

    /* Operand mapping: %0 = q31_pair (output), %1 = a (Rs1), %2 = b (Rs2). */
    __ASM volatile("kdmtt16 %0, %1, %2"
                   : "=r"(q31_pair)
                   : "r"(a), "r"(b));
    return q31_pair;
}
/* ===== Inline Function End for 4.8.3. KDMTT16 ===== */

/* ===== Inline Function Start for 4.9.1. KDMABB16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
 * \brief KDMABB16 (SIMD Signed Saturating Double Multiply Addition B16 x B16)
 * \details
 * **Type**: SIMD (RV64 only)
 *
 * **Syntax**:\n
 * ~~~
 * KDMAxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the 32-bit chunks in registers and then double and saturate the Q31 results, add the results with
 * the values of the corresponding 32-bit chunks from the destination register and write the saturated
 * addition results back into the corresponding 32-bit chunks of the destination register. If saturation
 * happens, an overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
 * or bottom 16-bit Q15 content of the corresponding 32-bit portions in Rs2. The Q30 results are then
 * doubled and saturated into Q31 values. The Q31 values are then added with the content of the
 * corresponding 32-bit portions of Rd. If the addition results are beyond the Q31 number range (-2^31 <=
 * Q31 <= 2^31-1), they are saturated to the range and the OV flag is set to 1. The results after saturation
 * are written back to Rd.
 * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
 * set.
 *
 * **Operations**:\n
 * ~~~
 * // KDMABB16: (x,y,z)=(0,0,0),(2,2,1)
 * // KDMABT16: (x,y,z)=(0,1,0),(2,3,1)
 * // KDMATT16: (x,y,z)=(1,1,0),(3,3,1)
 * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
 * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
 *   Mresult[z] = aop[z] * bop[z];
 *   resQ31[z] = Mresult[z] << 1;
 * } else {
 *   resQ31[z] = 0x7FFFFFFF;
 *   OV = 1;
 * }
 * resadd[z] = Rd.W[z] + resQ31[z];
 * if (resadd[z] > (2^31)-1) {
 *   resadd[z] = (2^31)-1;
 *   OV = 1;
 * } else if (resadd[z] < -2^31) {
 *   resadd[z] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[z] = resadd[z];
 * ~~~
 *
 * \param [in]  t    unsigned long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KDMABB16(unsigned long t, unsigned long a, unsigned long b)
{
    /* The destination register is read as well as written, so the incoming
     * value t is bound with a read-write ("+r") operand. */
    unsigned long acc = t;

    __ASM volatile("kdmabb16 %0, %1, %2"
                   : "+r"(acc)
                   : "r"(a), "r"(b));
    return acc;
}
/* ===== Inline Function End for 4.9.1. KDMABB16 ===== */

/* ===== Inline Function Start for 4.9.2. KDMABT16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
 * \brief KDMABT16 (SIMD Signed Saturating Double Multiply Addition B16 x T16)
 * \details
 * **Type**: SIMD (RV64 only)
 *
 * **Syntax**:\n
 * ~~~
 * KDMAxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the 32-bit chunks in registers and then double and saturate the Q31 results, add the results with
 * the values of the corresponding 32-bit chunks from the destination register and write the saturated
 * addition results back into the corresponding 32-bit chunks of the destination register. If saturation
 * happens, an overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
 * or bottom 16-bit Q15 content of the corresponding 32-bit portions in Rs2. The Q30 results are then
 * doubled and saturated into Q31 values. The Q31 values are then added with the content of the
 * corresponding 32-bit portions of Rd. If the addition results are beyond the Q31 number range (-2^31 <=
 * Q31 <= 2^31-1), they are saturated to the range and the OV flag is set to 1. The results after saturation
 * are written back to Rd.
 * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
 * set.
 *
 * **Operations**:\n
 * ~~~
 * // KDMABB16: (x,y,z)=(0,0,0),(2,2,1)
 * // KDMABT16: (x,y,z)=(0,1,0),(2,3,1)
 * // KDMATT16: (x,y,z)=(1,1,0),(3,3,1)
 * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
 * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
 *   Mresult[z] = aop[z] * bop[z];
 *   resQ31[z] = Mresult[z] << 1;
 * } else {
 *   resQ31[z] = 0x7FFFFFFF;
 *   OV = 1;
 * }
 * resadd[z] = Rd.W[z] + resQ31[z];
 * if (resadd[z] > (2^31)-1) {
 *   resadd[z] = (2^31)-1;
 *   OV = 1;
 * } else if (resadd[z] < -2^31) {
 *   resadd[z] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[z] = resadd[z];
 * ~~~
 *
 * \param [in]  t    unsigned long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KDMABT16(unsigned long t, unsigned long a, unsigned long b)
{
    /*
     * Operand map: %0 = t (Rd, read-write via "+r"), %1 = a (Rs1), %2 = b (Rs2).
     * t carries the accumulator words in and receives the saturated sums.
     * The asm is volatile so it is not discarded when the return value is
     * unused; the instruction may also set OV, which is not a declared output.
     */
    __ASM volatile("kdmabt16 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for 4.9.2. KDMABT16 ===== */

/* ===== Inline Function Start for 4.9.3. KDMATT16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
 * \brief KDMATT16 (SIMD Signed Saturating Double Multiply Addition T16 x T16)
 * \details
 * **Type**: SIMD (RV64 only)
 *
 * **Syntax**:\n
 * ~~~
 * KDMAxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the 32-bit chunks in registers and then double and saturate the Q31 results, add the results with
 * the values of the corresponding 32-bit chunks from the destination register and write the saturated
 * addition results back into the corresponding 32-bit chunks of the destination register. If saturation
 * happens, an overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
 * or bottom 16-bit Q15 content of the corresponding 32-bit portions in Rs2. The Q30 results are then
 * doubled and saturated into Q31 values. The Q31 values are then added with the content of the
 * corresponding 32-bit portions of Rd. If the addition results are beyond the Q31 number range (-2^31 <=
 * Q31 <= 2^31-1), they are saturated to the range and the OV flag is set to 1. The results after saturation
 * are written back to Rd.
 * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
 * set.
 *
 * **Operations**:\n
 * ~~~
 * // KDMABB16: (x,y,z)=(0,0,0),(2,2,1)
 * // KDMABT16: (x,y,z)=(0,1,0),(2,3,1)
 * // KDMATT16: (x,y,z)=(1,1,0),(3,3,1)
 * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
 * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
 *   Mresult[z] = aop[z] * bop[z];
 *   resQ31[z] = Mresult[z] << 1;
 * } else {
 *   resQ31[z] = 0x7FFFFFFF;
 *   OV = 1;
 * }
 * resadd[z] = Rd.W[z] + resQ31[z];
 * if (resadd[z] > (2^31)-1) {
 *   resadd[z] = (2^31)-1;
 *   OV = 1;
 * } else if (resadd[z] < -2^31) {
 *   resadd[z] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[z] = resadd[z];
 * ~~~
 *
 * \param [in]  t    unsigned long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KDMATT16(unsigned long t, unsigned long a, unsigned long b)
{
    /*
     * Operand map: %0 = t (Rd, read-write via "+r"), %1 = a (Rs1), %2 = b (Rs2).
     * t carries the accumulator words in and receives the saturated sums.
     * The asm is volatile so it is not discarded when the return value is
     * unused; the instruction may also set OV, which is not a declared output.
     */
    __ASM volatile("kdmatt16 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for 4.9.3. KDMATT16 ===== */

/* ===== Inline Function Start for 4.10.1. KHMBB16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
 * \brief KHMBB16 (SIMD Signed Saturating Half Multiply B16 x B16)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KHMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the 32-bit chunks in registers and then right-shift 15 bits to turn the Q30 results into Q15
 * numbers again and saturate the Q15 results into the destination register. If saturation happens, an
 * overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
 * or bottom 16-bit Q15 content of the 32-bit portion in Rs2. The Q30 results are then right-shifted
 * 15 bits and saturated into Q15 values. The 32-bit Q15 values are then written into the 32-bit chunks in
 * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
 * to 0x7FFF and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * // KHMBB16: (x,y,z)=(0,0,0),(2,2,1)
 * // KHMBT16: (x,y,z)=(0,1,0),(2,3,1)
 * // KHMTT16: (x,y,z)=(1,1,0),(3,3,1)
 * aop = Rs1.H[x]; bop = Rs2.H[y];
 * If (0x8000 != aop | 0x8000 != bop) {
 *   Mresult[31:0] = aop * bop;
 *   res[15:0] = Mresult[30:15];
 * } else {
 *   res[15:0] = 0x7FFF;
 *   OV = 1;
 * }
 * Rd.W[z] = SE32(res[15:0]);
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KHMBB16(unsigned long a, unsigned long b)
{
    /* Write-only destination ("=r"): fully produced by the instruction as %0 (Rd). */
    unsigned long result;
    /*
     * %1 = a (Rs1), %2 = b (Rs2). The asm is volatile so it is not discarded
     * when the result is unused; the instruction may set OV on saturation,
     * which is not a declared output.
     */
    __ASM volatile("khmbb16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
    return result;
}
/* ===== Inline Function End for 4.10.1. KHMBB16 ===== */

/* ===== Inline Function Start for 4.10.2. KHMBT16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
 * \brief KHMBT16 (SIMD Signed Saturating Half Multiply B16 x T16)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KHMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the 32-bit chunks in registers and then right-shift 15 bits to turn the Q30 results into Q15
 * numbers again and saturate the Q15 results into the destination register. If saturation happens, an
 * overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
 * or bottom 16-bit Q15 content of the 32-bit portion in Rs2. The Q30 results are then right-shifted
 * 15 bits and saturated into Q15 values. The 32-bit Q15 values are then written into the 32-bit chunks in
 * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
 * to 0x7FFF and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * // KHMBB16: (x,y,z)=(0,0,0),(2,2,1)
 * // KHMBT16: (x,y,z)=(0,1,0),(2,3,1)
 * // KHMTT16: (x,y,z)=(1,1,0),(3,3,1)
 * aop = Rs1.H[x]; bop = Rs2.H[y];
 * If (0x8000 != aop | 0x8000 != bop) {
 *   Mresult[31:0] = aop * bop;
 *   res[15:0] = Mresult[30:15];
 * } else {
 *   res[15:0] = 0x7FFF;
 *   OV = 1;
 * }
 * Rd.W[z] = SE32(res[15:0]);
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KHMBT16(unsigned long a, unsigned long b)
{
    /* Write-only destination ("=r"): fully produced by the instruction as %0 (Rd). */
    unsigned long result;
    /*
     * %1 = a (Rs1), %2 = b (Rs2). The asm is volatile so it is not discarded
     * when the result is unused; the instruction may set OV on saturation,
     * which is not a declared output.
     */
    __ASM volatile("khmbt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
    return result;
}
/* ===== Inline Function End for 4.10.2. KHMBT16 ===== */

/* ===== Inline Function Start for 4.10.3. KHMTT16 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
 * \brief KHMTT16 (SIMD Signed Saturating Half Multiply T16 x T16)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KHMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
 * of the 32-bit chunks in registers and then right-shift 15 bits to turn the Q30 results into Q15
 * numbers again and saturate the Q15 results into the destination register. If saturation happens, an
 * overflow flag OV will be set.
 *
 * **Description**:\n
 * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
 * or bottom 16-bit Q15 content of the 32-bit portion in Rs2. The Q30 results are then right-shifted
 * 15 bits and saturated into Q15 values. The 32-bit Q15 values are then written into the 32-bit chunks in
 * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
 * to 0x7FFF and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * // KHMBB16: (x,y,z)=(0,0,0),(2,2,1)
 * // KHMBT16: (x,y,z)=(0,1,0),(2,3,1)
 * // KHMTT16: (x,y,z)=(1,1,0),(3,3,1)
 * aop = Rs1.H[x]; bop = Rs2.H[y];
 * If (0x8000 != aop | 0x8000 != bop) {
 *   Mresult[31:0] = aop * bop;
 *   res[15:0] = Mresult[30:15];
 * } else {
 *   res[15:0] = 0x7FFF;
 *   OV = 1;
 * }
 * Rd.W[z] = SE32(res[15:0]);
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KHMTT16(unsigned long a, unsigned long b)
{
    /* Write-only destination ("=r"): fully produced by the instruction as %0 (Rd). */
    unsigned long result;
    /*
     * %1 = a (Rs1), %2 = b (Rs2). The asm is volatile so it is not discarded
     * when the result is unused; the instruction may set OV on saturation,
     * which is not a declared output.
     */
    __ASM volatile("khmtt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
    return result;
}
/* ===== Inline Function End for 4.10.3. KHMTT16 ===== */

/* ===== Inline Function Start for 4.11.1. KMABB32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD
 * \brief KMABB32 (Saturating Signed Multiply Bottom Words & Add)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KMABB32 Rd, Rs1, Rs2
 * KMABT32 Rd, Rs1, Rs2
 * KMATT32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element in a register with the 32-bit element in another register
 * and add the result to the content of 64-bit data in the third register. The addition result may be
 * saturated and is written to the third register.
 * * KMABB32: rd + bottom*bottom
 * * KMABT32: rd + bottom*top
 * * KMATT32: rd + top*top
 *
 * **Description**:\n
 * For the `KMABB32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
 * element in Rs2.
 * For the `KMABT32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
 * element in Rs2.
 * For the `KMATT32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
 * element in Rs2.
 * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond
 * the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The
 * result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rd + (Rs1.W[0] * Rs2.W[0]); // KMABB32
 * res = Rd + (Rs1.W[0] * Rs2.W[1]); // KMABT32
 * res = Rd + (Rs1.W[1] * Rs2.W[1]); // KMATT32
 * if (res > (2^63)-1) {
 *   res = (2^63)-1;
 *   OV = 1;
 * } else if (res < -2^63) {
 *   res = -2^63;
 *   OV = 1;
 * }
 * Rd = res;
 * ~~~
 *
 * **Exceptions**: None
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMABB32(long t, unsigned long a, unsigned long b)
{
    /*
     * Operand map: %0 = t (Rd, read-write via "+r"), %1 = a (Rs1), %2 = b (Rs2).
     * t supplies the 64-bit accumulator and receives the saturated result.
     * The asm is volatile so it is not discarded when the return value is
     * unused; the instruction may also set OV, which is not a declared output.
     */
    __ASM volatile("kmabb32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for 4.11.1. KMABB32 ===== */

/* ===== Inline Function Start for 4.11.2. KMABT32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD
 * \brief KMABT32 (Saturating Signed Multiply Bottom & Top Words & Add)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KMABB32 Rd, Rs1, Rs2
 * KMABT32 Rd, Rs1, Rs2
 * KMATT32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element in a register with the 32-bit element in another register
 * and add the result to the content of 64-bit data in the third register. The addition result may be
 * saturated and is written to the third register.
 * * KMABB32: rd + bottom*bottom
 * * KMABT32: rd + bottom*top
 * * KMATT32: rd + top*top
 *
 * **Description**:\n
 * For the `KMABB32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
 * element in Rs2.
 * For the `KMABT32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
 * element in Rs2.
 * For the `KMATT32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
 * element in Rs2.
 * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond
 * the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The
 * result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rd + (Rs1.W[0] * Rs2.W[0]); // KMABB32
 * res = Rd + (Rs1.W[0] * Rs2.W[1]); // KMABT32
 * res = Rd + (Rs1.W[1] * Rs2.W[1]); // KMATT32
 * if (res > (2^63)-1) {
 *   res = (2^63)-1;
 *   OV = 1;
 * } else if (res < -2^63) {
 *   res = -2^63;
 *   OV = 1;
 * }
 * Rd = res;
 * ~~~
 *
 * **Exceptions**: None
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMABT32(long t, unsigned long a, unsigned long b)
{
    /*
     * Operand map: %0 = t (Rd, read-write via "+r"), %1 = a (Rs1), %2 = b (Rs2).
     * t supplies the 64-bit accumulator and receives the saturated result.
     * The asm is volatile so it is not discarded when the return value is
     * unused; the instruction may also set OV, which is not a declared output.
     */
    __ASM volatile("kmabt32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for 4.11.2. KMABT32 ===== */

/* ===== Inline Function Start for 4.11.3. KMATT32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD
 * \brief KMATT32 (Saturating Signed Multiply Top Words & Add)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KMABB32 Rd, Rs1, Rs2
 * KMABT32 Rd, Rs1, Rs2
 * KMATT32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element in a register with the 32-bit element in another register
 * and add the result to the content of 64-bit data in the third register. The addition result may be
 * saturated and is written to the third register.
 * * KMABB32: rd + bottom*bottom
 * * KMABT32: rd + bottom*top
 * * KMATT32: rd + top*top
 *
 * **Description**:\n
 * For the `KMABB32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
 * element in Rs2.
 * For the `KMABT32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
 * element in Rs2.
 * For the `KMATT32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
 * element in Rs2.
 * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond
 * the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The
 * result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rd + (Rs1.W[0] * Rs2.W[0]); // KMABB32
 * res = Rd + (Rs1.W[0] * Rs2.W[1]); // KMABT32
 * res = Rd + (Rs1.W[1] * Rs2.W[1]); // KMATT32
 * if (res > (2^63)-1) {
 *   res = (2^63)-1;
 *   OV = 1;
 * } else if (res < -2^63) {
 *   res = -2^63;
 *   OV = 1;
 * }
 * Rd = res;
 * ~~~
 *
 * **Exceptions**: None
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMATT32(long t, unsigned long a, unsigned long b)
{
    /*
     * Operand map: %0 = t (Rd, read-write via "+r"), %1 = a (Rs1), %2 = b (Rs2).
     * t supplies the 64-bit accumulator and receives the saturated result.
     * The asm is volatile so it is not discarded when the return value is
     * unused; the instruction may also set OV, which is not a declared output.
     */
    __ASM volatile("kmatt32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for 4.11.3. KMATT32 ===== */

/* ===== Inline Function Start for 4.12.1. KMADA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
 * \brief KMADA32 (Saturating Signed Multiply Two Words and Two Adds)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KMADA32 Rd, Rs1, Rs2
 * KMAXDA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit multiplications from 32-bit data in two registers; and then adds the
 * two 64-bit results and 64-bit data in a third register together. The addition result may be saturated.
 * * KMADA32: rd + top*top + bottom*bottom
 * * KMAXDA32: rd + top*bottom + bottom*top
 *
 * **Description**:\n
 * For the `KMADA32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom
 * 32-bit element in Rs2 and then adds the result to the result of multiplying the top 32-bit element in Rs1
 * with the top 32-bit element in Rs2. It is actually an alias of the `KMAR64` instruction.
 * For the `KMAXDA32` instruction, it multiplies the top 32-bit element in Rs1 with the bottom 32-bit
 * element in Rs2 and then adds the result to the result of multiplying the bottom 32-bit element in Rs1
 * with the top 32-bit element in Rs2.
 * The result is added to the content of 64-bit data in Rd. If the addition result is beyond the Q63
 * number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The 64-bit
 * result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rd + (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMADA32
 * res = Rd + (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMAXDA32
 * if (res > (2^63)-1) {
 *   res = (2^63)-1;
 *   OV = 1;
 * } else if (res < -2^63) {
 *   res = -2^63;
 *   OV = 1;
 * }
 * Rd = res;
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMADA32(long t, unsigned long a, unsigned long b)
{
    /*
     * Operand map: %0 = t (Rd, read-write via "+r"), %1 = a (Rs1), %2 = b (Rs2).
     * t supplies the 64-bit accumulator and receives the saturated result.
     * The asm is volatile so it is not discarded when the return value is
     * unused; the instruction may also set OV, which is not a declared output.
     */
    __ASM volatile("kmada32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for 4.12.1. KMADA32 ===== */

/* ===== Inline Function Start for 4.12.2. KMAXDA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
 * \brief KMAXDA32 (Saturating Signed Crossed Multiply Two Words and Two Adds)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KMADA32 Rd, Rs1, Rs2
 * KMAXDA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit multiplications from 32-bit data in two registers; and then adds the
 * two 64-bit results and 64-bit data in a third register together. The addition result may be saturated.
 * * KMADA32: rd + top*top + bottom*bottom
 * * KMAXDA32: rd + top*bottom + bottom*top
 *
 * **Description**:\n
 * For the `KMADA32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom
 * 32-bit element in Rs2 and then adds the result to the result of multiplying the top 32-bit element in Rs1
 * with the top 32-bit element in Rs2. It is actually an alias of the `KMAR64` instruction.
 * For the `KMAXDA32` instruction, it multiplies the top 32-bit element in Rs1 with the bottom 32-bit
 * element in Rs2 and then adds the result to the result of multiplying the bottom 32-bit element in Rs1
 * with the top 32-bit element in Rs2.
 * The result is added to the content of 64-bit data in Rd. If the addition result is beyond the Q63
 * number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The 64-bit
 * result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rd + (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMADA32
 * res = Rd + (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMAXDA32
 * if (res > (2^63)-1) {
 *   res = (2^63)-1;
 *   OV = 1;
 * } else if (res < -2^63) {
 *   res = -2^63;
 *   OV = 1;
 * }
 * Rd = res;
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMAXDA32(long t, unsigned long a, unsigned long b)
{
    /*
     * Operand map: %0 = t (Rd, read-write via "+r"), %1 = a (Rs1), %2 = b (Rs2).
     * t supplies the 64-bit accumulator and receives the saturated result.
     * The asm is volatile so it is not discarded when the return value is
     * unused; the instruction may also set OV, which is not a declared output.
     */
    __ASM volatile("kmaxda32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for 4.12.2. KMAXDA32 ===== */

/* ===== Inline Function Start for 4.13.1. KMDA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
 * \brief KMDA32 (Signed Multiply Two Words and Add)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KMDA32 Rd, Rs1, Rs2
 * KMXDA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then
 * adds the two 64-bit results together. The addition result may be saturated.
 * * KMDA32: top*top + bottom*bottom
 * * KMXDA32: top*bottom + bottom*top
 *
 * **Description**:\n
 * For the `KMDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
 * with the top 32-bit element of Rs2.
 * For the `KMXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
 * with the bottom 32-bit element of Rs2.
 * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^63-1.
 * The final result is written to Rd. The 32-bit contents are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * if ((Rs1 != 0x8000000080000000) or (Rs2 != 0x8000000080000000)) {
 *   Rd = (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMDA32
 *   Rd = (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMXDA32
 * } else {
 *   Rd = 0x7fffffffffffffff;
 *   OV = 1;
 * }
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMDA32(unsigned long a, unsigned long b)
{
    /* Write-only destination ("=r"): fully produced by the instruction as %0 (Rd). */
    long result;
    /*
     * %1 = a (Rs1), %2 = b (Rs2). The asm is volatile so it is not discarded
     * when the result is unused; the instruction may set OV on saturation,
     * which is not a declared output.
     */
    __ASM volatile("kmda32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
    return result;
}
/* ===== Inline Function End for 4.13.1. KMDA32 ===== */

/* ===== Inline Function Start for 4.13.2. KMXDA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
 * \brief KMXDA32 (Signed Crossed Multiply Two Words and Add)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KMDA32 Rd, Rs1, Rs2
 * KMXDA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then
 * adds the two 64-bit results together. The addition result may be saturated.
 * * KMDA32: top*top + bottom*bottom
 * * KMXDA32: top*bottom + bottom*top
 *
 * **Description**:\n
 * For the `KMDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
 * with the top 32-bit element of Rs2.
 * For the `KMXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
 * with the bottom 32-bit element of Rs2.
 * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^63-1.
 * The final result is written to Rd. The 32-bit contents are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * if ((Rs1 != 0x8000000080000000) or (Rs2 != 0x8000000080000000)) {
 *   Rd = (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMDA32
 *   Rd = (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMXDA32
 * } else {
 *   Rd = 0x7fffffffffffffff;
 *   OV = 1;
 * }
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMXDA32(unsigned long a, unsigned long b)
{
    /* Write-only destination ("=r"): fully produced by the instruction as %0 (Rd). */
    long result;
    /*
     * %1 = a (Rs1), %2 = b (Rs2). The asm is volatile so it is not discarded
     * when the result is unused; the instruction may set OV on saturation,
     * which is not a declared output.
     */
    __ASM volatile("kmxda32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
    return result;
}
/* ===== Inline Function End for 4.13.2. KMXDA32 ===== */

/* ===== Inline Function Start for 4.14.1. KMADS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
 * \brief KMADS32 (Saturating Signed Multiply Two Words & Subtract & Add)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KMADS32 Rd, Rs1, Rs2
 * KMADRS32 Rd, Rs1, Rs2
 * KMAXDS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit multiplications from 32-bit elements in two registers; and then
 * perform a subtraction operation between the two 64-bit results. Then add the subtraction result to
 * 64-bit data in a third register. The addition result may be saturated.
 * * KMADS32: rd + (top*top - bottom*bottom)
 * * KMADRS32: rd + (bottom*bottom - top*top)
 * * KMAXDS32: rd + (top*bottom - bottom*top)
 *
 * **Description**:\n
 * For the `KMADS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
 * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
 * Rs1 with the top 32-bit element in Rs2.
 * For the `KMADRS32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
 * element in Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
 * element in Rs1 with the bottom 32-bit element in Rs2.
 * For the `KMAXDS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
 * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
 * Rs1 with the bottom 32-bit element in Rs2.
 * The subtraction result is then added to the content of 64-bit data in Rd. If the addition result is
 * beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to
 * 1. The 64-bit result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated
 * as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rd + (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMADS32
 * res = Rd + (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // KMADRS32
 * res = Rd + (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMAXDS32
 * if (res > (2^63)-1) {
 *   res = (2^63)-1;
 *   OV = 1;
 * } else if (res < -2^63) {
 *   res = -2^63;
 *   OV = 1;
 * }
 * Rd = res;
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMADS32(long t, unsigned long a, unsigned long b)
{
    /*
     * Operand map: %0 = t (Rd, read-write via "+r"), %1 = a (Rs1), %2 = b (Rs2).
     * t supplies the 64-bit accumulator and receives the saturated result.
     * The asm is volatile so it is not discarded when the return value is
     * unused; the instruction may also set OV, which is not a declared output.
     */
    __ASM volatile("kmads32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for 4.14.1. KMADS32 ===== */

/* ===== Inline Function Start for 4.14.2. KMADRS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
 * \brief KMADRS32 (Saturating Signed Multiply Two Words & Reverse Subtract & Add)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KMADS32 Rd, Rs1, Rs2
 * KMADRS32 Rd, Rs1, Rs2
 * KMAXDS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit multiplications from 32-bit elements in two registers; and then
 * perform a subtraction operation between the two 64-bit results. Then add the subtraction result to
 * 64-bit data in a third register. The addition result may be saturated.
 * * KMADS32: rd + (top*top - bottom*bottom)
 * * KMADRS32: rd + (bottom*bottom - top*top)
 * * KMAXDS32: rd + (top*bottom - bottom*top)
 *
 * **Description**:\n
 * For the `KMADS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
 * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
 * Rs1 with the top 32-bit element in Rs2.
 * For the `KMADRS32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
 * element in Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
 * element in Rs1 with the bottom 32-bit element in Rs2.
 * For the `KMAXDS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
 * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
 * Rs1 with the bottom 32-bit element in Rs2.
 * The subtraction result is then added to the content of 64-bit data in Rd. If the addition result is
 * beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to
 * 1. The 64-bit result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated
 * as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rd + (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMADS32
 * res = Rd + (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // KMADRS32
 * res = Rd + (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMAXDS32
 * if (res > (2^63)-1) {
 *   res = (2^63)-1;
 *   OV = 1;
 * } else if (res < -2^63) {
 *   res = -2^63;
 *   OV = 1;
 * }
 * Rd = res;
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMADRS32(long t, unsigned long a, unsigned long b)
{
    /*
     * Operand map: %0 = t (Rd, read-write via "+r"), %1 = a (Rs1), %2 = b (Rs2).
     * t supplies the 64-bit accumulator and receives the saturated result.
     * The asm is volatile so it is not discarded when the return value is
     * unused; the instruction may also set OV, which is not a declared output.
     */
    __ASM volatile("kmadrs32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for 4.14.2. KMADRS32 ===== */

/* ===== Inline Function Start for 4.14.3. KMAXDS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
 * \brief KMAXDS32 (Saturating Signed Crossed Multiply Two Words & Subtract & Add)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KMADS32 Rd, Rs1, Rs2
 * KMADRS32 Rd, Rs1, Rs2
 * KMAXDS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit multiplications from 32-bit elements in two registers; and then
 * perform a subtraction operation between the two 64-bit results. Then add the subtraction result to
 * 64-bit data in a third register. The addition result may be saturated.
 * * KMADS32: rd + (top*top - bottom*bottom)
 * * KMADRS32: rd + (bottom*bottom - top*top)
 * * KMAXDS32: rd + (top*bottom - bottom*top)
 *
 * **Description**:\n
 * For the `KMADS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
 * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
 * Rs1 with the top 32-bit element in Rs2.
 * For the `KMADRS32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
 * element in Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
 * element in Rs1 with the bottom 32-bit element in Rs2.
 * For the `KMAXDS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
 * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
 * Rs1 with the bottom 32-bit element in Rs2.
 * The subtraction result is then added to the content of 64-bit data in Rd. If the addition result is
 * beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to
 * 1. The 64-bit result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated
 * as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rd + (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMADS32
 * res = Rd + (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // KMADRS32
 * res = Rd + (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMAXDS32
 * if (res > (2^63)-1) {
 *   res = (2^63)-1;
 *   OV = 1;
 * } else if (res < -2^63) {
 *   res = -2^63;
 *   OV = 1;
 * }
 * Rd = res;
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMAXDS32(long t, unsigned long a, unsigned long b)
{
    /*
     * Read-modify-write on t ("+r"): it supplies the 64-bit accumulator and
     * receives the result. a feeds the first source operand, b the second.
     */
    __ASM volatile("kmaxds32 %[acc], %[rs1], %[rs2]"
                   : [acc] "+r"(t)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return t;
}
/* ===== Inline Function End for 4.14.3. KMAXDS32 ===== */

/* ===== Inline Function Start for 4.15.1. KMSDA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
 * \brief KMSDA32 (Saturating Signed Multiply Two Words & Add & Subtract)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KMSDA32 Rd, Rs1, Rs2
 * KMSXDA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then
 * subtracts the two 64-bit results from a third register. The subtraction result may be saturated.
 * * KMSDA32: rd - top*top - bottom*bottom
 * * KMSXDA32: rd - top*bottom - bottom*top
 *
 * **Description**:\n
 * For the `KMSDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2 and multiplies the top 32-bit element of Rs1 with the top 32-bit element of Rs2.
 * For the `KMSXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and multiplies the top 32-bit element of Rs1 with the bottom 32-bit element of Rs2.
 * The two 64-bit multiplication results are then subtracted from the content of Rd. If the subtraction
 * result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit
 * is set to 1. The result after saturation is written to Rd. The 32-bit contents are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rd - (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMSDA32
 * res = Rd - (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMSXDA32
 * if (res > (2^63)-1) {
 *   res = (2^63)-1;
 *   OV = 1;
 * } else if (res < -2^63) {
 *   res = -2^63;
 *   OV = 1;
 * }
 * Rd = res;
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMSDA32(long t, unsigned long a, unsigned long b)
{
    /*
     * The destination operand doubles as an input: "+r" makes the compiler
     * load t into the register before the instruction and read it back after.
     */
    __ASM volatile("kmsda32 %[acc], %[rs1], %[rs2]"
                   : [acc] "+r"(t)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return t;
}
/* ===== Inline Function End for 4.15.1. KMSDA32 ===== */

/* ===== Inline Function Start for 4.15.2. KMSXDA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
 * \brief KMSXDA32 (Saturating Signed Crossed Multiply Two Words & Add & Subtract)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KMSDA32 Rd, Rs1, Rs2
 * KMSXDA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then
 * subtracts the two 64-bit results from a third register. The subtraction result may be saturated.
 * * KMSDA32: rd - top*top - bottom*bottom
 * * KMSXDA32: rd - top*bottom - bottom*top
 *
 * **Description**:\n
 * For the `KMSDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2 and multiplies the top 32-bit element of Rs1 with the top 32-bit element of Rs2.
 * For the `KMSXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and multiplies the top 32-bit element of Rs1 with the bottom 32-bit element of Rs2.
 * The two 64-bit multiplication results are then subtracted from the content of Rd. If the subtraction
 * result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit
 * is set to 1. The result after saturation is written to Rd. The 32-bit contents are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rd - (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMSDA32
 * res = Rd - (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMSXDA32
 * if (res > (2^63)-1) {
 *   res = (2^63)-1;
 *   OV = 1;
 * } else if (res < -2^63) {
 *   res = -2^63;
 *   OV = 1;
 * }
 * Rd = res;
 * ~~~
 *
 * \param [in]  t    long type of value stored in t
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_KMSXDA32(long t, unsigned long a, unsigned long b)
{
    /*
     * t is both consumed and produced by the instruction, hence the "+r"
     * constraint; a and b are plain register inputs.
     */
    __ASM volatile("kmsxda32 %[acc], %[rs1], %[rs2]"
                   : [acc] "+r"(t)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return t;
}
/* ===== Inline Function End for 4.15.2. KMSXDA32 ===== */

/* ===== Inline Function Start for 4.16. KSLL32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief KSLL32 (SIMD 32-bit Saturating Shift Left Logical)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KSLL32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit elements logical left shift operations with saturation simultaneously. The shift
 * amount is a variable from a GPR.
 *
 * **Description**:\n
 * The 32-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
 * with zero and the shift amount is specified by the low-order 5-bits of the value in the Rs2 register.
 * Any shifted value greater than 2^31-1 is saturated to 2^31-1. Any shifted value smaller than -2^31 is
 * saturated to -2^31. And the saturated results are written to Rd. If any saturation is performed, set OV
 * bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[4:0];
 * if (sa != 0) {
 *   res[(31+sa):0] = Rs1.W[x] << sa;
 *   if (res > (2^31)-1) {
 *     res = 0x7fffffff; OV = 1;
 *   } else if (res < -2^31) {
 *     res = 0x80000000; OV = 1;
 *   }
 *   Rd.W[x] = res[31:0];
 * } else {
 *   Rd = Rs1;
 * }
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSLL32(unsigned long a, unsigned int b)
{
    unsigned long shifted;

    /* a is the first source operand (data), b the second (shift amount register). */
    __ASM volatile("ksll32 %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(shifted)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return shifted;
}
/* ===== Inline Function End for 4.16. KSLL32 ===== */

/* ===== Inline Function Start for 4.17. KSLLI32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief KSLLI32 (SIMD 32-bit Saturating Shift Left Logical Immediate)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KSLLI32 Rd, Rs1, imm5u
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit elements logical left shift operations with saturation simultaneously. The shift
 * amount is an immediate value.
 *
 * **Description**:\n
 * The 32-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
 * with zero and the shift amount is specified by the imm5u constant. Any shifted value greater than
 * 2^31-1 is saturated to 2^31-1. Any shifted value smaller than -2^31 is saturated to -2^31. And the saturated
 * results are written to Rd. If any saturation is performed, set OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm5u[4:0];
 * if (sa != 0) {
 *   res[(31+sa):0] = Rs1.W[x] << sa;
 *   if (res > (2^31)-1) {
 *     res = 0x7fffffff; OV = 1;
 *   } else if (res < -2^31) {
 *     res = 0x80000000; OV = 1;
 *   }
 *   Rd.W[x] = res[31:0];
 * } else {
 *   Rd = Rs1;
 * }
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    shift amount; must be a constant expression in the range 0..31 (encoded as imm5u)
 * \return value stored in unsigned long type
 */
/*
 * Statement-expression wrapper around `kslli32 Rd, Rs1, imm5u`.
 *
 *   a  value for the first source operand; evaluated exactly once and
 *      converted to unsigned long.
 *   b  shift amount; handed to the assembler through the "K" constraint, so
 *      it has to be a compile-time constant that fits the 5-bit unsigned
 *      immediate field.
 *
 * The macro evaluates to the unsigned long written to the destination
 * register.
 *
 * Hygiene: the argument is captured BEFORE the output temporary is declared,
 * and both temporaries carry macro-specific names. With a temporary simply
 * called `result` declared first, a call such as
 * `result = __RV_KSLLI32(result, 1)` would read the macro's own
 * uninitialized temporary instead of the caller's variable.
 */
#define __RV_KSLLI32(a, b)    \
    ({    \
        unsigned long __kslli32_rs1 = (unsigned long)(a);    \
        unsigned long __kslli32_rd;    \
        __ASM volatile("kslli32 %0, %1, %2" : "=r"(__kslli32_rd) : "r"(__kslli32_rs1), "K"(b));    \
        __kslli32_rd;    \
    })
/* ===== Inline Function End for 4.17. KSLLI32 ===== */

/* ===== Inline Function Start for 4.18.1. KSLRA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief KSLRA32 (SIMD 32-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KSLRA32 Rd, Rs1, Rs2
 * KSLRA32.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit elements logical left (positive) or arithmetic right (negative) shift operation with
 * Q31 saturation for the left shift. The `.u` form performs additional rounding up operations for the
 * right shift.
 *
 * **Description**:\n
 * The 32-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
 * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-2^5, 2^5-1]. A positive Rs2[5:0] means
 * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the
 * absolute value of Rs2[5:0]. However, the behavior of `Rs2[5:0]==-2^5 (0x20)` is defined to be
 * equivalent to the behavior of `Rs2[5:0]==-(2^5-1) (0x21)`.
 * The left-shifted results are saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. For the `.u`
 * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
 * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
 * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:6] will not affect
 * this instruction.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs2[5:0] < 0) {
 *   sa = -Rs2[5:0];
 *   sa = (sa == 32)? 31 : sa;
 *   if (`.u` form) {
 *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
 *     Rd.W[x] = res[31:0];
 *   } else {
 *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
 *   }
 * } else {
 *   sa = Rs2[4:0];
 *   res[(31+sa):0] = Rs1.W[x] <<(logic) sa;
 *   if (res > (2^31)-1) {
 *     res[31:0] = 0x7fffffff; OV = 1;
 *   } else if (res < -2^31) {
 *     res[31:0] = 0x80000000; OV = 1;
 *   }
 *   Rd.W[x] = res[31:0];
 * }
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSLRA32(unsigned long a, int b)
{
    unsigned long shifted;

    /* a: data operand; b: signed shift control, passed in a register. */
    __ASM volatile("kslra32 %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(shifted)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return shifted;
}
/* ===== Inline Function End for 4.18.1. KSLRA32 ===== */

/* ===== Inline Function Start for 4.18.2. KSLRA32.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief KSLRA32.u (SIMD 32-bit Shift Left Logical with Saturation or Rounding Shift Right Arithmetic)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KSLRA32 Rd, Rs1, Rs2
 * KSLRA32.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit elements logical left (positive) or arithmetic right (negative) shift operation with
 * Q31 saturation for the left shift. The `.u` form performs additional rounding up operations for the
 * right shift.
 *
 * **Description**:\n
 * The 32-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
 * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-2^5, 2^5-1]. A positive Rs2[5:0] means
 * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the
 * absolute value of Rs2[5:0]. However, the behavior of `Rs2[5:0]==-2^5 (0x20)` is defined to be
 * equivalent to the behavior of `Rs2[5:0]==-(2^5-1) (0x21)`.
 * The left-shifted results are saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. For the `.u`
 * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
 * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
 * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:6] will not affect
 * this instruction.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs2[5:0] < 0) {
 *   sa = -Rs2[5:0];
 *   sa = (sa == 32)? 31 : sa;
 *   if (`.u` form) {
 *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
 *     Rd.W[x] = res[31:0];
 *   } else {
 *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
 *   }
 * } else {
 *   sa = Rs2[4:0];
 *   res[(31+sa):0] = Rs1.W[x] <<(logic) sa;
 *   if (res > (2^31)-1) {
 *     res[31:0] = 0x7fffffff; OV = 1;
 *   } else if (res < -2^31) {
 *     res[31:0] = 0x80000000; OV = 1;
 *   }
 *   Rd.W[x] = res[31:0];
 * }
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSLRA32_U(unsigned long a, int b)
{
    unsigned long shifted;

    /* Same operand layout as the non-rounding form; only the mnemonic carries the `.u` suffix. */
    __ASM volatile("kslra32.u %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(shifted)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return shifted;
}
/* ===== Inline Function End for 4.18.2. KSLRA32.u ===== */

/* ===== Inline Function Start for 4.19. KSTAS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief KSTAS32 (SIMD 32-bit Signed Saturating Straight Addition & Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KSTAS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element saturating addition and 32-bit signed integer element
 * saturating subtraction in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit
 * elements.
 *
 * **Description**:\n
 * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit
 * integer element in [63:32] of Rs2; at the same time, it subtracts the 32-bit integer element in [31:0] of
 * Rs2 from the 32-bit integer element in [31:0] of Rs1. If any of the results are beyond the Q31 number
 * range (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated
 * results are written to [63:32] of Rd for addition and [31:0] of Rd for subtraction.
 *
 * **Operations**:\n
 * ~~~
 * res[1] = Rs1.W[1] + Rs2.W[1];
 * res[0] = Rs1.W[0] - Rs2.W[0];
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[1] = res[1];
 * Rd.W[0] = res[0];
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSTAS32(unsigned long a, unsigned long b)
{
    unsigned long lanes;

    /* a and b are the first and second source operands; the result register is write-only. */
    __ASM volatile("kstas32 %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(lanes)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return lanes;
}
/* ===== Inline Function End for 4.19. KSTAS32 ===== */

/* ===== Inline Function Start for 4.20. KSTSA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief KSTSA32 (SIMD 32-bit Signed Saturating Straight Subtraction & Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KSTSA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element saturating subtraction and 32-bit signed integer element
 * saturating addition in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit
 * elements.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer
 * element in [63:32] of Rs1; at the same time, it adds the 32-bit integer element in [31:0] of Rs1 with
 * the 32-bit integer element in [31:0] of Rs2. If any of the results are beyond the Q31 number range (
 * -2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated results are
 * written to [63:32] of Rd for subtraction and [31:0] of Rd for addition.
 *
 * **Operations**:\n
 * ~~~
 * res[1] = Rs1.W[1] - Rs2.W[1];
 * res[0] = Rs1.W[0] + Rs2.W[0];
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[1] = res[1];
 * Rd.W[0] = res[0];
 * for RV64, x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSTSA32(unsigned long a, unsigned long b)
{
    unsigned long lanes;

    /* Operand order matters for the subtracting lane: a is the first source, b the second. */
    __ASM volatile("kstsa32 %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(lanes)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return lanes;
}
/* ===== Inline Function End for 4.20. KSTSA32 ===== */

/* ===== Inline Function Start for 4.21. KSUB32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief KSUB32 (SIMD 32-bit Signed Saturating Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * KSUB32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer elements saturating subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit signed integer elements in Rs2 from the 32-bit
 * signed integer elements in Rs1. If any of the results are beyond the Q31 number range (-2^31 <= Q31 <=
 * 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.W[x] - Rs2.W[x];
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_KSUB32(unsigned long a, unsigned long b)
{
    unsigned long diff;

    /* a is the first source operand (minuend lanes), b the second (subtrahend lanes). */
    __ASM volatile("ksub32 %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(diff)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return diff;
}
/* ===== Inline Function End for 4.21. KSUB32 ===== */

/* ===== Inline Function Start for 4.22.1. PKBB32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK
 * \brief PKBB32 (Pack Two 32-bit Data from Both Bottom Half)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * PKBB32 Rd, Rs1, Rs2
 * PKBT32 Rd, Rs1, Rs2
 * PKTT32 Rd, Rs1, Rs2
 * PKTB32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Pack 32-bit data from 64-bit chunks in two registers.
 * * PKBB32: bottom.bottom
 * * PKBT32: bottom.top
 * * PKTT32: top.top
 * * PKTB32: top.bottom
 *
 * **Description**:\n
 * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
 * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
 * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
 * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
 *
 * **Operations**:\n
 * ~~~
 * Rd = CONCAT(Rs1.W[0], Rs2.W[0]); // PKBB32
 * Rd = CONCAT(Rs1.W[0], Rs2.W[1]); // PKBT32
 * Rd = CONCAT(Rs1.W[1], Rs2.W[1]); // PKTT32
 * Rd = CONCAT(Rs1.W[1], Rs2.W[0]); // PKTB32
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_PKBB32(unsigned long a, unsigned long b)
{
    unsigned long packed;

    /* a is the first source operand, b the second; the packed value is returned. */
    __ASM volatile("pkbb32 %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(packed)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return packed;
}
/* ===== Inline Function End for 4.22.1. PKBB32 ===== */

/* ===== Inline Function Start for 4.22.2. PKBT32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK
 * \brief PKBT32 (Pack Two 32-bit Data from Bottom and Top Half)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * PKBB32 Rd, Rs1, Rs2
 * PKBT32 Rd, Rs1, Rs2
 * PKTT32 Rd, Rs1, Rs2
 * PKTB32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Pack 32-bit data from 64-bit chunks in two registers.
 * * PKBB32: bottom.bottom
 * * PKBT32: bottom.top
 * * PKTT32: top.top
 * * PKTB32: top.bottom
 *
 * **Description**:\n
 * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
 * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
 * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
 * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
 *
 * **Operations**:\n
 * ~~~
 * Rd = CONCAT(Rs1.W[0], Rs2.W[0]); // PKBB32
 * Rd = CONCAT(Rs1.W[0], Rs2.W[1]); // PKBT32
 * Rd = CONCAT(Rs1.W[1], Rs2.W[1]); // PKTT32
 * Rd = CONCAT(Rs1.W[1], Rs2.W[0]); // PKTB32
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_PKBT32(unsigned long a, unsigned long b)
{
    unsigned long packed;

    /* a is the first source operand, b the second; the packed value is returned. */
    __ASM volatile("pkbt32 %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(packed)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return packed;
}
/* ===== Inline Function End for 4.22.2. PKBT32 ===== */

/* ===== Inline Function Start for 4.22.3. PKTT32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK
 * \brief PKTT32 (Pack Two 32-bit Data from Both Top Half)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * PKBB32 Rd, Rs1, Rs2
 * PKBT32 Rd, Rs1, Rs2
 * PKTT32 Rd, Rs1, Rs2
 * PKTB32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Pack 32-bit data from 64-bit chunks in two registers.
 * * PKBB32: bottom.bottom
 * * PKBT32: bottom.top
 * * PKTT32: top.top
 * * PKTB32: top.bottom
 *
 * **Description**:\n
 * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
 * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
 * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
 * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
 *
 * **Operations**:\n
 * ~~~
 * Rd = CONCAT(Rs1.W[0], Rs2.W[0]); // PKBB32
 * Rd = CONCAT(Rs1.W[0], Rs2.W[1]); // PKBT32
 * Rd = CONCAT(Rs1.W[1], Rs2.W[1]); // PKTT32
 * Rd = CONCAT(Rs1.W[1], Rs2.W[0]); // PKTB32
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_PKTT32(unsigned long a, unsigned long b)
{
    unsigned long packed;

    /* a is the first source operand, b the second; the packed value is returned. */
    __ASM volatile("pktt32 %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(packed)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return packed;
}
/* ===== Inline Function End for 4.22.3. PKTT32 ===== */

/* ===== Inline Function Start for 4.22.4. PKTB32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK
 * \brief PKTB32 (Pack Two 32-bit Data from Top and Bottom Half)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * PKBB32 Rd, Rs1, Rs2
 * PKBT32 Rd, Rs1, Rs2
 * PKTT32 Rd, Rs1, Rs2
 * PKTB32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Pack 32-bit data from 64-bit chunks in two registers.
 * * PKBB32: bottom.bottom
 * * PKBT32: bottom.top
 * * PKTT32: top.top
 * * PKTB32: top.bottom
 *
 * **Description**:\n
 * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
 * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
 * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
 * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
 *
 * **Operations**:\n
 * ~~~
 * Rd = CONCAT(Rs1.W[0], Rs2.W[0]); // PKBB32
 * Rd = CONCAT(Rs1.W[0], Rs2.W[1]); // PKBT32
 * Rd = CONCAT(Rs1.W[1], Rs2.W[1]); // PKTT32
 * Rd = CONCAT(Rs1.W[1], Rs2.W[0]); // PKTB32
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_PKTB32(unsigned long a, unsigned long b)
{
    unsigned long packed;

    /* a is the first source operand, b the second; the packed value is returned. */
    __ASM volatile("pktb32 %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(packed)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return packed;
}
/* ===== Inline Function End for 4.22.4. PKTB32 ===== */

/* ===== Inline Function Start for 4.23. RADD32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief RADD32 (SIMD 32-bit Signed Halving Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * RADD32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element additions simultaneously. The results are halved to avoid
 * overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 32-bit signed integer elements in Rs1 with the 32-bit signed
 * integer elements in Rs2. The results are first arithmetically right-shifted by 1 bit and then written to
 * Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Rs1 = 0x7FFFFFFF, Rs2 = 0x7FFFFFFF, Rd = 0x7FFFFFFF
 * * Rs1 = 0x80000000, Rs2 = 0x80000000, Rd = 0x80000000
 * * Rs1 = 0x40000000, Rs2 = 0x80000000, Rd = 0xE0000000
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = (Rs1.W[x] + Rs2.W[x]) s>> 1;
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RADD32(unsigned long a, unsigned long b)
{
    unsigned long halved;

    /* Both inputs are plain register operands; the destination is output-only. */
    __ASM volatile("radd32 %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(halved)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return halved;
}
/* ===== Inline Function End for 4.23. RADD32 ===== */

/* ===== Inline Function Start for 4.24. RCRAS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief RCRAS32 (SIMD 32-bit Signed Halving Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * RCRAS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element addition and 32-bit signed integer element subtraction in
 * a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The results are halved to
 * avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 32-bit signed integer element in [63:32] of Rs1 with the 32-bit
 * signed integer element in [31:0] of Rs2, and subtracts the 32-bit signed integer element in [63:32] of
 * Rs2 from the 32-bit signed integer element in [31:0] of Rs1. The element results are first
 * arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd
 * for subtraction.
 *
 * **Examples**:\n
 * ~~~
 * Please see `RADD32` and `RSUB32` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = (Rs1.W[1] + Rs2.W[0]) s>> 1;
 * Rd.W[0] = (Rs1.W[0] - Rs2.W[1]) s>> 1;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RCRAS32(unsigned long a, unsigned long b)
{
    unsigned long halved;

    /* a is the first source operand, b the second; swapping them changes the result. */
    __ASM volatile("rcras32 %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(halved)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return halved;
}
/* ===== Inline Function End for 4.24. RCRAS32 ===== */

/* ===== Inline Function Start for 4.25. RCRSA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief RCRSA32 (SIMD 32-bit Signed Halving Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * RCRSA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element subtraction and 32-bit signed integer element addition in
 * a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The results are halved to
 * avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit signed integer element in [31:0] of Rs2 from the
 * 32-bit signed integer element in [63:32] of Rs1, and adds the 32-bit signed element integer in [31:0]
 * of Rs1 with the 32-bit signed integer element in [63:32] of Rs2. The two results are first
 * arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of
 * Rd for addition.
 *
 * **Examples**:\n
 * ~~~
 * Please see `RADD32` and `RSUB32` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = (Rs1.W[1] - Rs2.W[0]) s>> 1;
 * Rd.W[0] = (Rs1.W[0] + Rs2.W[1]) s>> 1;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RCRSA32(unsigned long a, unsigned long b)
{
    unsigned long halved;

    /* a is the first source operand, b the second; swapping them changes the result. */
    __ASM volatile("rcrsa32 %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(halved)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return halved;
}
/* ===== Inline Function End for 4.25. RCRSA32 ===== */

/* ===== Inline Function Start for 4.26. RSTAS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief RSTAS32 (SIMD 32-bit Signed Halving Straight Addition & Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * RSTAS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element addition and 32-bit signed integer element subtraction in
 * a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements. The results are
 * halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 32-bit signed integer element in [63:32] of Rs1 with the 32-bit
 * signed integer element in [63:32] of Rs2, and subtracts the 32-bit signed integer element in [31:0] of
 * Rs2 from the 32-bit signed integer element in [31:0] of Rs1. The element results are first
 * arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd
 * for subtraction.
 *
 * **Examples**:\n
 * ~~~
 * Please see `RADD32` and `RSUB32` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = (Rs1.W[1] + Rs2.W[1]) s>> 1;
 * Rd.W[0] = (Rs1.W[0] - Rs2.W[0]) s>> 1;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RSTAS32(unsigned long a, unsigned long b)
{
    unsigned long halved;

    /* a is the first source operand, b the second; the destination is output-only. */
    __ASM volatile("rstas32 %[rd], %[rs1], %[rs2]"
                   : [rd] "=r"(halved)
                   : [rs1] "r"(a), [rs2] "r"(b));
    return halved;
}
/* ===== Inline Function End for 4.26. RSTAS32 ===== */

/* ===== Inline Function Start for 4.27. RSTSA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief RSTSA32 (SIMD 32-bit Signed Halving Straight Subtraction & Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * RSTSA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element subtraction and 32-bit signed integer element addition in
 * a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements. The results are
 * halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit signed integer element in [63:32] of Rs2 from the
 * 32-bit signed integer element in [63:32] of Rs1, and adds the 32-bit signed integer element in [31:0]
 * of Rs1 with the 32-bit signed integer element in [31:0] of Rs2. The two results are first arithmetically
 * right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of Rd for addition.
 *
 * **Examples**:\n
 * ~~~
 * Please see `RADD32` and `RSUB32` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = (Rs1.W[1] - Rs2.W[1]) s>> 1;
 * Rd.W[0] = (Rs1.W[0] + Rs2.W[0]) s>> 1;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RSTSA32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* One `rstsa32` instruction: %1 <- a, %2 <- b, %0 -> returned value. */
    __ASM volatile("rstsa32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.27. RSTSA32 ===== */

/* ===== Inline Function Start for 4.28. RSUB32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief RSUB32 (SIMD 32-bit Signed Halving Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * RSUB32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element subtractions simultaneously. The results are halved to
 * avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit signed integer elements in Rs2 from the 32-bit
 * signed integer elements in Rs1. The results are first arithmetically right-shifted by 1 bit and then
 * written to Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Ra = 0x7FFFFFFF, Rb = 0x80000000 Rt = 0x7FFFFFFF
 * * Ra = 0x80000000, Rb = 0x7FFFFFFF Rt = 0x80000000
 * * Ra = 0x80000000, Rb = 0x40000000 Rt = 0xA0000000
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = (Rs1.W[x] - Rs2.W[x]) s>> 1;
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_RSUB32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* One `rsub32` instruction: %1 <- a, %2 <- b, %0 -> returned value. */
    __ASM volatile("rsub32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.28. RSUB32 ===== */

/* ===== Inline Function Start for 4.29. SLL32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief SLL32 (SIMD 32-bit Shift Left Logical)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SLL32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit elements logical left shift operations simultaneously. The shift amount is a
 * variable from a GPR.
 *
 * **Description**:\n
 * The 32-bit elements in Rs1 are left-shifted logically. And the results are written to Rd.
 * The shifted out bits are filled with zero and the shift amount is specified by the low-order 5-bits of
 * the value in the Rs2 register.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[4:0];
 * Rd.W[x] = Rs1.W[x] << sa;
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SLL32(unsigned long a, unsigned int b)
{
    unsigned long rd;

    /* One `sll32` instruction: %1 <- a (data), %2 <- b (shift amount register), %0 -> returned value. */
    __ASM volatile("sll32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.29. SLL32 ===== */

/* ===== Inline Function Start for 4.30. SLLI32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief SLLI32 (SIMD 32-bit Shift Left Logical Immediate)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SLLI32 Rd, Rs1, imm5u[4:0]
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit element logical left shift operations simultaneously. The shift amount is an
 * immediate value.
 *
 * **Description**:\n
 * The 32-bit elements in Rs1 are left-shifted logically. The shifted out bits are filled with
 * zero and the shift amount is specified by the imm5u[4:0] constant. And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm5u[4:0];
 * Rd.W[x] = Rs1.W[x] << sa;
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SLLI32(a, b)    \
    ({    \
        /* Evaluate `a` before declaring the output temporary, and use macro-specific names, */    \
        /* so that an argument such as a caller variable called `result` is not shadowed.    */    \
        unsigned long __rv_slli32_rs1 = (unsigned long)(a);    \
        unsigned long __rv_slli32_rd;    \
        /* `b` is bound with the "K" (immediate) constraint, so it must be a constant expression. */    \
        __ASM volatile("slli32 %0, %1, %2"    \
                       : "=r"(__rv_slli32_rd)    \
                       : "r"(__rv_slli32_rs1), "K"(b));    \
        __rv_slli32_rd;    \
    })
/* ===== Inline Function End for 4.30. SLLI32 ===== */

/* ===== Inline Function Start for 4.31. SMAX32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
 * \brief SMAX32 (SIMD 32-bit Signed Maximum)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SMAX32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer elements finding maximum operations simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 32-bit signed integer elements in Rs1 with the 32-bit
 * signed integer elements in Rs2 and selects the greater number of each element pair. The
 * selected results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = (Rs1.W[x] > Rs2.W[x])? Rs1.W[x] : Rs2.W[x];
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SMAX32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* One `smax32` instruction: %1 <- a, %2 <- b, %0 -> returned value. */
    __ASM volatile("smax32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.31. SMAX32 ===== */

/* ===== Inline Function Start for 4.32.1. SMBB32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT
 * \brief SMBB32 (Signed Multiply Bottom Word & Bottom Word)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SMBB32 Rd, Rs1, Rs2
 * SMBT32 Rd, Rs1, Rs2
 * SMTT32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element of a register with the signed 32-bit element of another
 * register and write the 64-bit result to a third register.
 * * SMBB32: bottom*bottom
 * * SMBT32: bottom*top
 * * SMTT32: top*top
 *
 * **Description**:\n
 * For the `SMBB32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2. It is actually an alias of `MULSR64` instruction.
 * For the `SMBT32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
 * element of Rs2.
 * For the `SMTT32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit element
 * of Rs2.
 * The 64-bit multiplication result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as
 * signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rs1.W[0] * Rs2.W[0]; // SMBB32
 * res = Rs1.W[0] * Rs2.W[1]; // SMBT32
 * res = Rs1.W[1] * Rs2.W[1]; // SMTT32
 * Rd = res;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMBB32(unsigned long a, unsigned long b)
{
    long rd;

    /* One `smbb32` instruction: %1 <- a, %2 <- b, %0 -> returned as a signed long. */
    __ASM volatile("smbb32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.32.1. SMBB32 ===== */

/* ===== Inline Function Start for 4.32.2. SMBT32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT
 * \brief SMBT32 (Signed Multiply Bottom Word & Top Word)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SMBB32 Rd, Rs1, Rs2
 * SMBT32 Rd, Rs1, Rs2
 * SMTT32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element of a register with the signed 32-bit element of another
 * register and write the 64-bit result to a third register.
 * * SMBB32: bottom*bottom
 * * SMBT32: bottom*top
 * * SMTT32: top*top
 *
 * **Description**:\n
 * For the `SMBB32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2. It is actually an alias of `MULSR64` instruction.
 * For the `SMBT32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
 * element of Rs2.
 * For the `SMTT32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit element
 * of Rs2.
 * The 64-bit multiplication result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as
 * signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rs1.W[0] * Rs2.W[0]; // SMBB32
 * res = Rs1.W[0] * Rs2.W[1]; // SMBT32
 * res = Rs1.W[1] * Rs2.W[1]; // SMTT32
 * Rd = res;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMBT32(unsigned long a, unsigned long b)
{
    long rd;

    /* One `smbt32` instruction: %1 <- a, %2 <- b, %0 -> returned as a signed long. */
    __ASM volatile("smbt32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.32.2. SMBT32 ===== */

/* ===== Inline Function Start for 4.32.3. SMTT32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT
 * \brief SMTT32 (Signed Multiply Top Word & Top Word)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SMBB32 Rd, Rs1, Rs2
 * SMBT32 Rd, Rs1, Rs2
 * SMTT32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element of a register with the signed 32-bit element of another
 * register and write the 64-bit result to a third register.
 * * SMBB32: bottom*bottom
 * * SMBT32: bottom*top
 * * SMTT32: top*top
 *
 * **Description**:\n
 * For the `SMBB32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2. It is actually an alias of `MULSR64` instruction.
 * For the `SMBT32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
 * element of Rs2.
 * For the `SMTT32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit element
 * of Rs2.
 * The 64-bit multiplication result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as
 * signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rs1.W[0] * Rs2.W[0]; // SMBB32
 * res = Rs1.W[0] * Rs2.W[1]; // SMBT32
 * res = Rs1.W[1] * Rs2.W[1]; // SMTT32
 * Rd = res;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMTT32(unsigned long a, unsigned long b)
{
    long rd;

    /* One `smtt32` instruction: %1 <- a, %2 <- b, %0 -> returned as a signed long. */
    __ASM volatile("smtt32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.32.3. SMTT32 ===== */

/* ===== Inline Function Start for 4.33.1. SMDS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
 * \brief SMDS32 (Signed Multiply Two Words and Subtract)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SMDS32 Rd, Rs1, Rs2
 * SMDRS32 Rd, Rs1, Rs2
 * SMXDS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit multiplications from the 32-bit elements of two registers; and then
 * perform a subtraction operation between the two 64-bit results.
 * * SMDS32: top*top - bottom*bottom
 * * SMDRS32: bottom*bottom - top*top
 * * SMXDS32: top*bottom - bottom*top
 *
 * **Description**:\n
 * For the `SMDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
 * Rs1 with the top 32-bit element of Rs2.
 * For the `SMDRS32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
 * element of Rs1 with the bottom 32-bit element of Rs2.
 * For the `SMXDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
 * Rs1 with the bottom 32-bit element of Rs2.
 * The subtraction result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * Rt = (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // SMDS32
 * Rt = (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // SMDRS32
 * Rt = (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // SMXDS32
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMDS32(unsigned long a, unsigned long b)
{
    long rd;

    /* One `smds32` instruction: %1 <- a, %2 <- b, %0 -> returned as a signed long. */
    __ASM volatile("smds32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.33.1. SMDS32 ===== */

/* ===== Inline Function Start for 4.33.2. SMDRS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
 * \brief SMDRS32 (Signed Multiply Two Words and Reverse Subtract)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SMDS32 Rd, Rs1, Rs2
 * SMDRS32 Rd, Rs1, Rs2
 * SMXDS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit multiplications from the 32-bit elements of two registers; and then
 * perform a subtraction operation between the two 64-bit results.
 * * SMDS32: top*top - bottom*bottom
 * * SMDRS32: bottom*bottom - top*top
 * * SMXDS32: top*bottom - bottom*top
 *
 * **Description**:\n
 * For the `SMDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
 * Rs1 with the top 32-bit element of Rs2.
 * For the `SMDRS32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
 * element of Rs1 with the bottom 32-bit element of Rs2.
 * For the `SMXDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
 * Rs1 with the bottom 32-bit element of Rs2.
 * The subtraction result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * Rt = (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // SMDS32
 * Rt = (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // SMDRS32
 * Rt = (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // SMXDS32
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMDRS32(unsigned long a, unsigned long b)
{
    long rd;

    /* One `smdrs32` instruction: %1 <- a, %2 <- b, %0 -> returned as a signed long. */
    __ASM volatile("smdrs32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.33.2. SMDRS32 ===== */

/* ===== Inline Function Start for 4.33.3. SMXDS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
 * \brief SMXDS32 (Signed Crossed Multiply Two Words and Subtract)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SMDS32 Rd, Rs1, Rs2
 * SMDRS32 Rd, Rs1, Rs2
 * SMXDS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit multiplications from the 32-bit elements of two registers; and then
 * perform a subtraction operation between the two 64-bit results.
 * * SMDS32: top*top - bottom*bottom
 * * SMDRS32: bottom*bottom - top*top
 * * SMXDS32: top*bottom - bottom*top
 *
 * **Description**:\n
 * For the `SMDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
 * Rs1 with the top 32-bit element of Rs2.
 * For the `SMDRS32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
 * element of Rs1 with the bottom 32-bit element of Rs2.
 * For the `SMXDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
 * Rs1 with the bottom 32-bit element of Rs2.
 * The subtraction result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
 * integers.
 *
 * **Operations**:\n
 * ~~~
 * Rt = (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // SMDS32
 * Rt = (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // SMDRS32
 * Rt = (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // SMXDS32
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_SMXDS32(unsigned long a, unsigned long b)
{
    long rd;

    /* One `smxds32` instruction: %1 <- a, %2 <- b, %0 -> returned as a signed long. */
    __ASM volatile("smxds32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.33.3. SMXDS32 ===== */

/* ===== Inline Function Start for 4.34. SMIN32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
 * \brief SMIN32 (SIMD 32-bit Signed Minimum)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SMIN32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer elements finding minimum operations simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 32-bit signed integer elements in Rs1 with the 32-bit
 * signed integer elements in Rs2 and selects the lesser number of each element pair. The selected
 * results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = (Rs1.W[x] < Rs2.W[x])? Rs1.W[x] : Rs2.W[x];
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SMIN32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* One `smin32` instruction: %1 <- a, %2 <- b, %0 -> returned value. */
    __ASM volatile("smin32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.34. SMIN32 ===== */

/* ===== Inline Function Start for 4.35.1. SRA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief SRA32 (SIMD 32-bit Shift Right Arithmetic)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SRA32 Rd, Rs1, Rs2
 * SRA32.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit element arithmetic right shift operations simultaneously. The shift amount is a
 * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
 * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
 * 5-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
 * added to the most significant discarded bit of each 32-bit data element to calculate the final results.
 * And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[4:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRA32.u
 *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
 *     Rd.W[x] = res[31:0];
 *   } else { // SRA32
 *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SRA32(unsigned long a, unsigned int b)
{
    unsigned long rd;

    /* One `sra32` instruction: %1 <- a (data), %2 <- b (shift amount register), %0 -> returned value. */
    __ASM volatile("sra32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.35.1. SRA32 ===== */

/* ===== Inline Function Start for 4.35.2. SRA32.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief SRA32.u (SIMD 32-bit Rounding Shift Right Arithmetic)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SRA32 Rd, Rs1, Rs2
 * SRA32.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit element arithmetic right shift operations simultaneously. The shift amount is a
 * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
 * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
 * 5-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
 * added to the most significant discarded bit of each 32-bit data element to calculate the final results.
 * And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[4:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRA32.u
 *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
 *     Rd.W[x] = res[31:0];
 *   } else { // SRA32
 *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SRA32_U(unsigned long a, unsigned int b)
{
    unsigned long rd;

    /* One `sra32.u` instruction: %1 <- a (data), %2 <- b (shift amount register), %0 -> returned value. */
    __ASM volatile("sra32.u %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.35.2. SRA32.u ===== */

/* ===== Inline Function Start for 4.36.1. SRAI32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief SRAI32 (SIMD 32-bit Shift Right Arithmetic Immediate)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SRAI32 Rd, Rs1, imm5u
 * SRAI32.u Rd, Rs1, imm5u
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit elements arithmetic right shift operations simultaneously. The shift amount is
 * an immediate value. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
 * bits are filled with the sign-bit of the 32-bit data elements. The shift amount is specified by the
 * imm5u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most
 * significant discarded bit of each 32-bit data to calculate the final results. And the results are written
 * to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm5u[4:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRAI32.u
 *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
 *     Rd.W[x] = res[31:0];
 *   } else { // SRAI32
 *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SRAI32(a, b)    \
    ({    \
        /* Evaluate `a` before declaring the output temporary, and use macro-specific names, */    \
        /* so that an argument such as a caller variable called `result` is not shadowed.    */    \
        unsigned long __rv_srai32_rs1 = (unsigned long)(a);    \
        unsigned long __rv_srai32_rd;    \
        /* `b` is bound with the "K" (immediate) constraint, so it must be a constant expression. */    \
        __ASM volatile("srai32 %0, %1, %2"    \
                       : "=r"(__rv_srai32_rd)    \
                       : "r"(__rv_srai32_rs1), "K"(b));    \
        __rv_srai32_rd;    \
    })
/* ===== Inline Function End for 4.36.1. SRAI32 ===== */

/* ===== Inline Function Start for 4.36.2. SRAI32.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief SRAI32.u (SIMD 32-bit Rounding Shift Right Arithmetic Immediate)
 * \details
 * **Type**: DSP (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SRAI32 Rd, Rs1, imm5u
 * SRAI32.u Rd, Rs1, imm5u
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit elements arithmetic right shift operations simultaneously. The shift amount is
 * an immediate value. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
 * bits are filled with the sign-bit of the 32-bit data elements. The shift amount is specified by the
 * imm5u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most
 * significant discarded bit of each 32-bit data to calculate the final results. And the results are written
 * to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm5u[4:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRAI32.u
 *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
 *     Rd.W[x] = res[31:0];
 *   } else { // SRAI32
 *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SRAI32_U(a, b)    \
    ({    \
        /* Evaluate `a` before declaring the output temporary, and use macro-specific names, */    \
        /* so that an argument such as a caller variable called `result` is not shadowed.    */    \
        unsigned long __rv_srai32_u_rs1 = (unsigned long)(a);    \
        unsigned long __rv_srai32_u_rd;    \
        /* `b` is bound with the "K" (immediate) constraint, so it must be a constant expression. */    \
        __ASM volatile("srai32.u %0, %1, %2"    \
                       : "=r"(__rv_srai32_u_rd)    \
                       : "r"(__rv_srai32_u_rs1), "K"(b));    \
        __rv_srai32_u_rd;    \
    })
/* ===== Inline Function End for 4.36.2. SRAI32.u ===== */

/* ===== Inline Function Start for 4.37. SRAIW.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_NON_SIMD_32B_SHIFT
 * \brief SRAIW.u (Rounding Shift Right Arithmetic Immediate Word)
 * \details
 * **Type**: DSP (RV64 only)
 *
 * **Syntax**:\n
 * ~~~
 * SRAIW.u Rd, Rs1, imm5u
 * ~~~
 *
 * **Purpose**:\n
 * Perform a 32-bit arithmetic right shift operation with rounding. The shift amount is an
 * immediate value.
 *
 * **Description**:\n
 * This instruction right-shifts the lower 32-bit content of Rs1 arithmetically. The shifted
 * out bits are filled with the sign-bit Rs1(31) and the shift amount is specified by the imm5u constant.
 * For the rounding operation, a value of 1 is added to the most significant discarded bit of the data to
 * calculate the final result. And the result is sign-extended and written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm5u;
 * if (sa != 0) {
 *   res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1;
 *   Rd = SE32(res[31:0]);
 * } else {
 *   Rd = SE32(Rs1.W[0]);
 * }
 * ~~~
 *
 * \param [in]  a    int type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in long type
 */
#define __RV_SRAIW_U(a, b)    \
    ({    \
        /* Evaluate `a` before declaring the output temporary, and use macro-specific names, */    \
        /* so that an argument such as a caller variable called `result` is not shadowed.    */    \
        int __rv_sraiw_u_rs1 = (int)(a);    \
        long __rv_sraiw_u_rd;    \
        /* `b` is bound with the "K" (immediate) constraint, so it must be a constant expression. */    \
        __ASM volatile("sraiw.u %0, %1, %2"    \
                       : "=r"(__rv_sraiw_u_rd)    \
                       : "r"(__rv_sraiw_u_rs1), "K"(b));    \
        __rv_sraiw_u_rd;    \
    })
/* ===== Inline Function End for 4.37. SRAIW.u ===== */

/* ===== Inline Function Start for 4.38.1. SRL32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief SRL32 (SIMD 32-bit Shift Right Logical)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SRL32 Rd, Rs1, Rs2
 * SRL32.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit element logical right shift operations simultaneously. The shift amount is a
 * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
 * are filled with zero. The shift amount is specified by the low-order 5-bits of the value in the Rs2
 * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
 * discarded bit of each 32-bit data element to calculate the final results. And the results are written to
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[4:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRL32.u
 *     res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1;
 *     Rd.W[x] = res[31:0];
 *   } else { // SRL32
 *     Rd.W[x] = ZE32(Rs1.W[x][31:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SRL32(unsigned long a, unsigned int b)
{
    unsigned long rd;

    /* One `srl32` instruction: %1 <- a (data), %2 <- b (shift amount register), %0 -> returned value. */
    __ASM volatile("srl32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.38.1. SRL32 ===== */

/* ===== Inline Function Start for 4.38.2. SRL32.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief SRL32.u (SIMD 32-bit Rounding Shift Right Logical)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SRL32 Rd, Rs1, Rs2
 * SRL32.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit element logical right shift operations simultaneously. The shift amount is a
 * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
 * results.
 *
 * **Description**:\n
 * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
 * are filled with zero. The shift amount is specified by the low-order 5-bits of the value in the Rs2
 * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
 * discarded bit of each 32-bit data element to calculate the final results. And the results are written to
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[4:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRL32.u
 *     res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1;
 *     Rd.W[x] = res[31:0];
 *   } else { // SRL32
 *     Rd.W[x] = ZE32(Rs1.W[x][31:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SRL32_U(unsigned long a, unsigned int b)
{
    unsigned long rd;

    /* One `srl32.u` instruction: %1 <- a (data), %2 <- b (shift amount register), %0 -> returned value. */
    __ASM volatile("srl32.u %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.38.2. SRL32.u ===== */

/* ===== Inline Function Start for 4.39.1. SRLI32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief SRLI32 (SIMD 32-bit Shift Right Logical Immediate)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SRLI32 Rd, Rs1, imm5u
 * SRLI32.u Rd, Rs1, imm5u
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit elements logical right shift operations simultaneously. The shift amount is an
 * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
 *
 * **Description**:\n
 * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
 * are filled with zero. The shift amount is specified by the imm5u constant. For the rounding
 * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 32-bit
 * data to calculate the final results. And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm5u[4:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRLI32.u
 *     res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1;
 *     Rd.W[x] = res[31:0];
 *   } else { // SRLI32
 *     Rd.W[x] = ZE32(Rs1.W[x][31:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b
 * \return value stored in unsigned long type
 */
#define __RV_SRLI32(a, b)    \
    ({    \
        /* Evaluate (a) before the output local is declared, so that an    */    \
        /* argument expression naming a caller variable such as `result`   */    \
        /* cannot be captured by a local of this macro.                    */    \
        unsigned long __srli32_rs1 = (unsigned long)(a);    \
        unsigned long __srli32_rd;    \
        /* %0 = output register, %1 = input register, %2 = b passed with   */    \
        /* the "K" constraint, i.e. as an immediate operand.               */    \
        __ASM volatile("srli32 %0, %1, %2" : "=r"(__srli32_rd) : "r"(__srli32_rs1), "K"(b));    \
        /* Value of the statement expression. */    \
        __srli32_rd;    \
    })
/* ===== Inline Function End for 4.39.1. SRLI32 ===== */

/* ===== Inline Function Start for 4.39.2. SRLI32.u ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
 * \brief SRLI32.u (SIMD 32-bit Rounding Shift Right Logical Immediate)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SRLI32 Rd, Rs1, imm5u
 * SRLI32.u Rd, Rs1, imm5u
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit elements logical right shift operations simultaneously. The shift amount is an
 * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
 *
 * **Description**:\n
 * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
 * are filled with zero. The shift amount is specified by the imm5u constant. For the rounding
 * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 32-bit
 * data to calculate the final results. And the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = imm5u[4:0];
 * if (sa > 0) {
 *   if (`.u` form) { // SRLI32.u
 *     res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1;
 *     Rd.W[x] = res[31:0];
 *   } else { // SRLI32
 *     Rd.W[x] = ZE32(Rs1.W[x][31:sa]);
 *   }
 * } else {
 *   Rd = Rs1;
 * }
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned int type of value stored in b; it is encoded as the imm5u immediate, so it must be a constant in the range 0 to 31
 * \return value stored in unsigned long type
 */
#define __RV_SRLI32_U(a, b)    \
    ({    \
        /* Evaluate (a) before the output local is declared, so that an    */    \
        /* argument expression naming a caller variable such as `result`   */    \
        /* cannot be captured by a local of this macro.                    */    \
        unsigned long __srli32u_rs1 = (unsigned long)(a);    \
        unsigned long __srli32u_rd;    \
        /* %0 = output register, %1 = input register, %2 = b passed with   */    \
        /* the "K" constraint, i.e. as an immediate operand.               */    \
        __ASM volatile("srli32.u %0, %1, %2" : "=r"(__srli32u_rd) : "r"(__srli32u_rs1), "K"(b));    \
        /* Value of the statement expression. */    \
        __srli32u_rd;    \
    })
/* ===== Inline Function End for 4.39.2. SRLI32.u ===== */

/* ===== Inline Function Start for 4.40. STAS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief STAS32 (SIMD 32-bit Straight Addition & Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * STAS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit integer element addition and 32-bit integer element subtraction in a 64-bit
 * chunk simultaneously. Operands are from corresponding 32-bit elements.
 *
 * **Description**:\n
 * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit
 * integer element in [63:32] of Rs2, and writes the result to [63:32] of Rd; at the same time, it subtracts
 * the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element in [31:0] of Rs1, and
 * writes the result to [31:0] of Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned operations.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = Rs1.W[1] + Rs2.W[1];
 * Rd.W[0] = Rs1.W[0] - Rs2.W[0];
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_STAS32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("stas32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.40. STAS32 ===== */

/* ===== Inline Function Start for 4.41. STSA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief STSA32 (SIMD 32-bit Straight Subtraction & Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * STSA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit integer element subtraction and 32-bit integer element addition in a 64-bit
 * chunk simultaneously. Operands are from corresponding 32-bit elements.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer
 * element in [63:32] of Rs1, and writes the result to [63:32] of Rd; at the same time, it adds the 32-bit
 * integer element in [31:0] of Rs1 with the 32-bit integer element in [31:0] of Rs2, and writes the result
 * to [31:0] of Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned operations.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = Rs1.W[1] - Rs2.W[1];
 * Rd.W[0] = Rs1.W[0] + Rs2.W[0];
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_STSA32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("stsa32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.41. STSA32 ===== */

/* ===== Inline Function Start for 4.42. SUB32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief SUB32 (SIMD 32-bit Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * SUB32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit integer element subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit integer elements in Rs2 from the 32-bit integer
 * elements in Rs1, and then writes the results to Rd.
 *
 * **Note**:\n
 * This instruction can be used for either signed or unsigned subtraction.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = Rs1.W[x] - Rs2.W[x];
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_SUB32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("sub32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.42. SUB32 ===== */

/* ===== Inline Function Start for 4.43. UKADD32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief UKADD32 (SIMD 32-bit Unsigned Saturating Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * UKADD32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit unsigned integer element saturating additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 32-bit unsigned integer elements in Rs1 with the 32-bit
 * unsigned integer elements in Rs2. If any of the results are beyond the 32-bit unsigned number
 * range (0 <= RES <= 2^32-1), they are saturated to the range and the OV bit is set to 1. The saturated
 * results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.W[x] + Rs2.W[x];
 * if (res[x] > (2^32)-1) {
 *   res[x] = (2^32)-1;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKADD32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("ukadd32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.43. UKADD32 ===== */

/* ===== Inline Function Start for 4.44. UKCRAS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief UKCRAS32 (SIMD 32-bit Unsigned Saturating Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * UKCRAS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do one 32-bit unsigned integer element saturating addition and one 32-bit unsigned
 * integer element saturating subtraction in a 64-bit chunk simultaneously. Operands are from crossed
 * 32-bit elements.
 *
 * **Description**:\n
 * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32-
 * bit unsigned integer element in [31:0] of Rs2; at the same time, it subtracts the 32-bit unsigned
 * integer element in [63:32] of Rs2 from the 32-bit unsigned integer element in [31:0] Rs1. If any of the
 * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the
 * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for addition and
 * [31:0] of Rd for subtraction.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[1] + Rs2.W[0];
 * res2 = Rs1.W[0] - Rs2.W[1];
 * if (res1 > (2^32)-1) {
 *   res1 = (2^32)-1;
 *   OV = 1;
 * }
 * if (res2 < 0) {
 *   res2 = 0;
 *   OV = 1;
 * }
 * Rd.W[1] = res1;
 * Rd.W[0] = res2;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKCRAS32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("ukcras32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.44. UKCRAS32 ===== */

/* ===== Inline Function Start for 4.45. UKCRSA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief UKCRSA32 (SIMD 32-bit Unsigned Saturating Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * UKCRSA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do one 32-bit unsigned integer element saturating subtraction and one 32-bit unsigned
 * integer element saturating addition in a 64-bit chunk simultaneously. Operands are from crossed
 * 32-bit elements.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit unsigned integer element in [31:0] of Rs2 from the
 * 32-bit unsigned integer element in [63:32] of Rs1; at the same time, it adds the 32-bit unsigned
 * integer element in [63:32] of Rs2 with the 32-bit unsigned integer element in [31:0] Rs1. If any of the
 * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the
 * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for subtraction and
 * [31:0] of Rd for addition.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[1] - Rs2.W[0];
 * res2 = Rs1.W[0] + Rs2.W[1];
 * if (res1 < 0) {
 *   res1 = 0;
 *   OV = 1;
 * }
 * if (res2 > (2^32)-1) {
 *   res2 = (2^32)-1;
 *   OV = 1;
 * }
 * Rd.W[1] = res1;
 * Rd.W[0] = res2;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKCRSA32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("ukcrsa32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.45. UKCRSA32 ===== */

/* ===== Inline Function Start for 4.46. UKSTAS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief UKSTAS32 (SIMD 32-bit Unsigned Saturating Straight Addition & Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * UKSTAS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do one 32-bit unsigned integer element saturating addition and one 32-bit unsigned
 * integer element saturating subtraction in a 64-bit chunk simultaneously. Operands are from
 * corresponding 32-bit elements.
 *
 * **Description**:\n
 * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32-
 * bit unsigned integer element in [63:32] of Rs2; at the same time, it subtracts the 32-bit unsigned
 * integer element in [31:0] of Rs2 from the 32-bit unsigned integer element in [31:0] Rs1. If any of the
 * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the
 * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for addition and
 * [31:0] of Rd for subtraction.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[1] + Rs2.W[1];
 * res2 = Rs1.W[0] - Rs2.W[0];
 * if (res1 > (2^32)-1) {
 *   res1 = (2^32)-1;
 *   OV = 1;
 * }
 * if (res2 < 0) {
 *   res2 = 0;
 *   OV = 1;
 * }
 * Rd.W[1] = res1;
 * Rd.W[0] = res2;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKSTAS32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("ukstas32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.46. UKSTAS32 ===== */

/* ===== Inline Function Start for 4.47. UKSTSA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief UKSTSA32 (SIMD 32-bit Unsigned Saturating Straight Subtraction & Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * UKSTSA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do one 32-bit unsigned integer element saturating subtraction and one 32-bit unsigned
 * integer element saturating addition in a 64-bit chunk simultaneously. Operands are from
 * corresponding 32-bit elements.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit unsigned integer element in [63:32] of Rs2 from
 * the 32-bit unsigned integer element in [63:32] of Rs1; at the same time, it adds the 32-bit unsigned
 * integer element in [31:0] of Rs2 with the 32-bit unsigned integer element in [31:0] Rs1. If any of the
 * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the
 * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for subtraction and
 * [31:0] of Rd for addition.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[1] - Rs2.W[1];
 * res2 = Rs1.W[0] + Rs2.W[0];
 * if (res1 < 0) {
 *   res1 = 0;
 *   OV = 1;
 * }
 * if (res2 > (2^32)-1) {
 *   res2 = (2^32)-1;
 *   OV = 1;
 * }
 * Rd.W[1] = res1;
 * Rd.W[0] = res2;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKSTSA32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("ukstsa32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.47. UKSTSA32 ===== */

/* ===== Inline Function Start for 4.48. UKSUB32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief UKSUB32 (SIMD 32-bit Unsigned Saturating Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * UKSUB32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit unsigned integer elements saturating subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit unsigned integer elements in Rs2 from the 32-bit
 * unsigned integer elements in Rs1. If any of the results are beyond the 32-bit unsigned number
 * range (0 <= RES <= 2^32-1), they are saturated to the range and the OV bit is set to 1. The saturated
 * results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.W[x] - Rs2.W[x];
 * if (res[x] < 0) {
 *   res[x] = 0;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UKSUB32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("uksub32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.48. UKSUB32 ===== */

/* ===== Inline Function Start for 4.49. UMAX32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
 * \brief UMAX32 (SIMD 32-bit Unsigned Maximum)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * UMAX32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit unsigned integer elements finding maximum operations simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 32-bit unsigned integer elements in Rs1 with the 32-bit
 * unsigned integer elements in Rs2 and, for each pair, selects the element that is greater
 * than the other one. The selected results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = (Rs1.W[x] u> Rs2.W[x])? Rs1.W[x] : Rs2.W[x];
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UMAX32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("umax32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.49. UMAX32 ===== */

/* ===== Inline Function Start for 4.50. UMIN32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
 * \brief UMIN32 (SIMD 32-bit Unsigned Minimum)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * UMIN32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit unsigned integer elements finding minimum operations simultaneously.
 *
 * **Description**:\n
 * This instruction compares the 32-bit unsigned integer elements in Rs1 with the 32-bit
 * unsigned integer elements in Rs2 and, for each pair, selects the element that is less
 * than the other one. The selected results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = (Rs1.W[x] <u Rs2.W[x])? Rs1.W[x] : Rs2.W[x];
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_UMIN32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("umin32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.50. UMIN32 ===== */

/* ===== Inline Function Start for 4.51. URADD32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief URADD32 (SIMD 32-bit Unsigned Halving Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * URADD32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit unsigned integer element additions simultaneously. The results are halved to
 * avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 32-bit unsigned integer elements in Rs1 with the 32-bit
 * unsigned integer elements in Rs2. The results are first logically right-shifted by 1 bit and then
 * written to Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Ra = 0x7FFFFFFF, Rb = 0x7FFFFFFF, Rt = 0x7FFFFFFF
 * * Ra = 0x80000000, Rb = 0x80000000, Rt = 0x80000000
 * * Ra = 0x40000000, Rb = 0x80000000, Rt = 0x60000000
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = (Rs1.W[x] + Rs2.W[x]) u>> 1;
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URADD32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("uradd32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.51. URADD32 ===== */

/* ===== Inline Function Start for 4.52. URCRAS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief URCRAS32 (SIMD 32-bit Unsigned Halving Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * URCRAS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit unsigned integer element addition and 32-bit unsigned integer element
 * subtraction in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The
 * results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32-
 * bit unsigned integer element in [31:0] of Rs2, and subtracts the 32-bit unsigned integer element in
 * [63:32] of Rs2 from the 32-bit unsigned integer element in [31:0] of Rs1. The element results are first
 * logically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd for
 * subtraction.
 *
 * **Examples**:\n
 * ~~~
 * Please see `URADD32` and `URSUB32` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = (Rs1.W[1] + Rs2.W[0]) u>> 1;
 * Rd.W[0] = (Rs1.W[0] - Rs2.W[1]) u>> 1;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URCRAS32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("urcras32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.52. URCRAS32 ===== */

/* ===== Inline Function Start for 4.53. URCRSA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief URCRSA32 (SIMD 32-bit Unsigned Halving Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * URCRSA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit unsigned integer element subtraction and 32-bit unsigned integer element
 * addition in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The results
 * are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit unsigned integer element in [31:0] of Rs2 from the
 * 32-bit unsigned integer element in [63:32] of Rs1, and adds the 32-bit unsigned element integer in
 * [31:0] of Rs1 with the 32-bit unsigned integer element in [63:32] of Rs2. The two results are first
 * logically right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of Rd for
 * addition.
 *
 * **Examples**:\n
 * ~~~
 * Please see `URADD32` and `URSUB32` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = (Rs1.W[1] - Rs2.W[0]) u>> 1;
 * Rd.W[0] = (Rs1.W[0] + Rs2.W[1]) u>> 1;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URCRSA32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("urcrsa32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.53. URCRSA32 ===== */

/* ===== Inline Function Start for 4.54. URSTAS32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief URSTAS32 (SIMD 32-bit Unsigned Halving Straight Addition & Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * URSTAS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit unsigned integer element addition and 32-bit unsigned integer element
 * subtraction in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements.
 * The results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32-
 * bit unsigned integer element in [63:32] of Rs2, and subtracts the 32-bit unsigned integer element in
 * [31:0] of Rs2 from the 32-bit unsigned integer element in [31:0] of Rs1. The element results are first
 * logically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd for
 * subtraction.
 *
 * **Examples**:\n
 * ~~~
 * Please see `URADD32` and `URSUB32` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = (Rs1.W[1] + Rs2.W[1]) u>> 1;
 * Rd.W[0] = (Rs1.W[0] - Rs2.W[0]) u>> 1;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URSTAS32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("urstas32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.54. URSTAS32 ===== */

/* ===== Inline Function Start for 4.55. URSTSA32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief URSTSA32 (SIMD 32-bit Unsigned Halving Straight Subtraction & Addition)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * URSTSA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit unsigned integer element subtraction and 32-bit unsigned integer element
 * addition in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements. The
 * results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit unsigned integer element in [63:32] of Rs2 from
 * the 32-bit unsigned integer element in [63:32] of Rs1, and adds the 32-bit unsigned element integer
 * in [31:0] of Rs1 with the 32-bit unsigned integer element in [31:0] of Rs2. The two results are first
 * logically right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of Rd for
 * addition.
 *
 * **Examples**:\n
 * ~~~
 * Please see `URADD32` and `URSUB32` instructions.
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = (Rs1.W[1] - Rs2.W[1]) u>> 1;
 * Rd.W[0] = (Rs1.W[0] + Rs2.W[0]) u>> 1;
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URSTSA32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("urstsa32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.55. URSTSA32 ===== */

/* ===== Inline Function Start for 4.56. URSUB32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
 * \brief URSUB32 (SIMD 32-bit Unsigned Halving Subtraction)
 * \details
 * **Type**: SIMD (RV64 Only)
 *
 * **Syntax**:\n
 * ~~~
 * URSUB32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit unsigned integer element subtractions simultaneously. The results are halved to
 * avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit unsigned integer elements in Rs2 from the 32-bit
 * unsigned integer elements in Rs1. The results are first logically right-shifted by 1 bit and then
 * written to Rd.
 *
 * **Examples**:\n
 * ~~~
 * * Ra = 0x7FFFFFFF, Rb = 0x80000000, Rt = 0xFFFFFFFF
 * * Ra = 0x80000000, Rb = 0x7FFFFFFF, Rt = 0x00000000
 * * Ra = 0x80000000, Rb = 0x40000000, Rt = 0x20000000
 * ~~~
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = (Rs1.W[x] - Rs2.W[x]) u>> 1;
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long type of value stored in a
 * \param [in]  b    unsigned long type of value stored in b
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_URSUB32(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a, %2 = b (register inputs). */
    __ASM volatile("ursub32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for 4.56. URSUB32 ===== */

#endif /* __RISCV_XLEN == 64 */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_NUCLEI_Default      Nuclei Default SIMD DSP Additional Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic
 * \brief    (RV32 & RV64)Nuclei Customized DSP Instructions
 * \details  This is Nuclei customized DSP instructions for both RV32 and RV64
 */

/* ===== Inline Function Start for EXPD80 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_Default
 * \brief EXPD80 (Expand and Copy Byte 0 to 32bit(when rv32) or 64bit(when rv64))
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * EXPD80 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * When rv32, Copy 8-bit data from 32-bit chunks into 4 bytes in a register.
 * When rv64, Copy 8-bit data from 64-bit chunks into 8 bytes in a register.
 *
 * **Description**:\n
 * Moves Rs1.B[0][7:0] to Rd.[0][7:0], Rd.[1][7:0], Rd.[2][7:0], Rd.[3][7:0]
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.B[0][7:0], Rs1.B[0][7:0], Rs1.B[0][7:0], Rs1.B[0][7:0]);
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_EXPD80(unsigned long a)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a (register input). */
    __ASM volatile("expd80 %0, %1"
                   : "=r"(rd)
                   : "r"(a));

    return rd;
}
/* ===== Inline Function End for EXPD80 ===== */

/* ===== Inline Function Start for EXPD81 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_Default
 * \brief EXPD81 (Expand and Copy Byte 1 to 32bit(rv32) or 64bit(when rv64))
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * EXPD81 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Copy 8-bit data from 32-bit chunks into 4 bytes in a register.
 *
 * **Description**:\n
 * Moves Rs1.B[1][7:0] to Rd.[0][7:0], Rd.[1][7:0], Rd.[2][7:0], Rd.[3][7:0]
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.B[1][7:0], Rs1.B[1][7:0], Rs1.B[1][7:0], Rs1.B[1][7:0]);
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_EXPD81(unsigned long a)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a (register input). */
    __ASM volatile("expd81 %0, %1"
                   : "=r"(rd)
                   : "r"(a));

    return rd;
}
/* ===== Inline Function End for EXPD81 ===== */

/* ===== Inline Function Start for EXPD82 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_Default
 * \brief EXPD82 (Expand and Copy Byte 2 to 32bit(rv32) or 64bit(when rv64))
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * EXPD82 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Copy 8-bit data from 32-bit chunks into 4 bytes in a register.
 *
 * **Description**:\n
 * Moves Rs1.B[2][7:0] to Rd.[0][7:0], Rd.[1][7:0], Rd.[2][7:0], Rd.[3][7:0]
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.B[2][7:0], Rs1.B[2][7:0], Rs1.B[2][7:0], Rs1.B[2][7:0]);
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_EXPD82(unsigned long a)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a (register input). */
    __ASM volatile("expd82 %0, %1"
                   : "=r"(rd)
                   : "r"(a));

    return rd;
}
/* ===== Inline Function End for EXPD82 ===== */

/* ===== Inline Function Start for EXPD83 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_Default
 * \brief EXPD83 (Expand and Copy Byte 3 to 32bit(rv32) or 64bit(when rv64))
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * EXPD83 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Copy 8-bit data from 32-bit chunks into 4 bytes in a register.
 *
 * **Description**:\n
 * Moves Rs1.B[3][7:0] to Rd.[0][7:0], Rd.[1][7:0], Rd.[2][7:0], Rd.[3][7:0]
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.B[3][7:0], Rs1.B[3][7:0], Rs1.B[3][7:0], Rs1.B[3][7:0]);
 * for RV32: x=0
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_EXPD83(unsigned long a)
{
    unsigned long rd;

    /* Operand map: %0 = rd (register output), %1 = a (register input). */
    __ASM volatile("expd83 %0, %1"
                   : "=r"(rd)
                   : "r"(a));

    return rd;
}
/* ===== Inline Function End for EXPD83 ===== */

#if (__RISCV_XLEN == 64)
/* ===== Inline Function Start for EXPD84 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_Default
 * \brief EXPD84 (Expand and Copy Byte 4 to 64bit)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * EXPD84 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Only RV64, copy 8-bit data from 64-bit chunks into 8 bytes in a register.
 *
 * **Description**:\n
 * Moves Rs1.B[4][7:0] to every byte of Rd: Rd.B[0][7:0], Rd.B[1][7:0], ..., Rd.B[7][7:0]
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.B[4][7:0], Rs1.B[4][7:0], Rs1.B[4][7:0], Rs1.B[4][7:0]);
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_EXPD84(unsigned long a)
{
    unsigned long rd;   /* written by the expd84 instruction below */

    /* %0 -> rd (write-only register operand), %1 -> a (register input) */
    __ASM volatile("expd84 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for EXPD84 ===== */

/* ===== Inline Function Start for EXPD85 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_Default
 * \brief EXPD85 (Expand and Copy Byte 5 to 64bit)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * EXPD85 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Only RV64, copy 8-bit data from 64-bit chunks into 8 bytes in a register.
 *
 * **Description**:\n
 * Moves Rs1.B[5][7:0] to every byte of Rd: Rd.B[0][7:0], Rd.B[1][7:0], ..., Rd.B[7][7:0]
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.B[5][7:0], Rs1.B[5][7:0], Rs1.B[5][7:0], Rs1.B[5][7:0]);
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_EXPD85(unsigned long a)
{
    unsigned long rd;   /* written by the expd85 instruction below */

    /* %0 -> rd (write-only register operand), %1 -> a (register input) */
    __ASM volatile("expd85 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for EXPD85 ===== */

/* ===== Inline Function Start for EXPD86 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_Default
 * \brief EXPD86 (Expand and Copy Byte 6 to 64bit)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * EXPD86 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Only RV64, copy 8-bit data from 64-bit chunks into 8 bytes in a register.
 *
 * **Description**:\n
 * Moves Rs1.B[6][7:0] to every byte of Rd: Rd.B[0][7:0], Rd.B[1][7:0], ..., Rd.B[7][7:0]
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.B[6][7:0], Rs1.B[6][7:0], Rs1.B[6][7:0], Rs1.B[6][7:0]);
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_EXPD86(unsigned long a)
{
    unsigned long rd;   /* written by the expd86 instruction below */

    /* %0 -> rd (write-only register operand), %1 -> a (register input) */
    __ASM volatile("expd86 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for EXPD86 ===== */

/* ===== Inline Function Start for EXPD87 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_Default
 * \brief EXPD87 (Expand and Copy Byte 7 to 64bit)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * EXPD87 Rd, Rs1
 * ~~~
 *
 * **Purpose**:\n
 * Only RV64, copy 8-bit data from 64-bit chunks into 8 bytes in a register.
 *
 * **Description**:\n
 * Moves Rs1.B[7][7:0] to every byte of Rd: Rd.B[0][7:0], Rd.B[1][7:0], ..., Rd.B[7][7:0]
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.B[7][7:0], Rs1.B[7][7:0], Rs1.B[7][7:0], Rs1.B[7][7:0]);
 * for RV64: x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_EXPD87(unsigned long a)
{
    unsigned long rd;   /* written by the expd87 instruction below */

    /* %0 -> rd (write-only register operand), %1 -> a (register input) */
    __ASM volatile("expd87 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for EXPD87 ===== */
#endif /* __RISCV_XLEN == 64 */

#if (__RISCV_XLEN == 32) || defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__)
/* XXXXX Nuclei Extended DSP Instructions for RV32 XXXXX */

/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_NUCLEI_N1      Nuclei N1 SIMD DSP Additional Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic
 * \brief    (RV32 only)Nuclei Customized N1 DSP Instructions
 * \details  This is Nuclei customized DSP N1 instructions only for RV32
 */
/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_NUCLEI_N2      Nuclei N2 SIMD DSP Additional Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic
 * \brief    (RV32 only)Nuclei Customized N2 DSP Instructions
 * \details  This is Nuclei customized DSP N2 instructions only for RV32
 */
/**
 * \defgroup NMSIS_Core_DSP_Intrinsic_NUCLEI_N3      Nuclei N3 SIMD DSP Additional Instructions
 * \ingroup  NMSIS_Core_DSP_Intrinsic
 * \brief    (RV32 only)Nuclei Customized N3 DSP Instructions
 * \details  This is Nuclei customized DSP N3 instructions only for RV32
 */

/* ===== Inline Function Start for DKHM8 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N1
 * \brief DKHM8 (64-bit SIMD Signed Saturating Q7 Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKHM8 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do Q7xQ7 element multiplications simultaneously. The Q14 results are then reduced to Q7
 * numbers again.
 *
 * **Description**:\n
 * For the `DKHM8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1
 * with the top 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
 * content of 16-bit chunks in Rs1 with the bottom 8-bit Q7 content of 16-bit chunks in Rs2.
 *
 * The Q14 results are then right-shifted 7-bits and saturated into Q7 values. The Q7 results are then
 * written into Rd. When both the two Q7 inputs of a multiplication are 0x80, saturation will happen.
 * The result will be saturated to 0x7F and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.B[x+1]; op2t = Rs2.B[x+1]; // top
 * op1b = Rs1.B[x]; op2b = Rs2.B[x]; // bottom
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   if (0x80 != aop | 0x80 != bop) {
 *     res = (aop s* bop) >> 7;
 *   } else {
 *     res= 0x7F;
 *     OV = 1;
 *   }
 * }
 * Rd.H[x/2] = concat(rest, resb);
 * for RV32, x=0,2,4,6
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKHM8(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;   /* 64-bit destination written by dkhm8 */

    /* %0 -> rd (write-only), %1 -> a, %2 -> b; all bound as "r" operands */
    __ASM volatile("dkhm8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKHM8 ===== */

/* ===== Inline Function Start for DKHM16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N1
 * \brief DKHM16 (64-bit SIMD Signed Saturating Q15 Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKHM16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do Q15xQ15 element multiplications simultaneously. The Q30 results are then reduced to
 * Q15 numbers again.
 *
 * **Description**:\n
 * For the `DKHM16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in
 * Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom
 * 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit chunks in
 * Rs2.
 *
 * The Q30 results are then right-shifted 15-bits and saturated into Q15 values. The Q15 results are
 * then written into Rd. When both the two Q15 inputs of a multiplication are 0x8000, saturation will
 * happen. The result will be saturated to 0x7FFF and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.H[x+1]; op2t = Rs2.H[x+1]; // top
 * op1b = Rs1.H[x]; op2b = Rs2.H[x]; // bottom
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   if (0x8000 != aop | 0x8000 != bop) {
 *     res = (aop s* bop) >> 15;
 *   } else {
 *     res= 0x7FFF;
 *     OV = 1;
 *   }
 * }
 * Rd.W[x/2] = concat(rest, resb);
 * for RV32: x=0, 2
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKHM16(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;   /* 64-bit destination written by dkhm16 */

    /* %0 -> rd (write-only), %1 -> a, %2 -> b; all bound as "r" operands */
    __ASM volatile("dkhm16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKHM16 ===== */

/* ===== Inline Function Start for DKABS8 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N1
 * \brief DKABS8 (64-bit SIMD 8-bit Saturating Absolute)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKABS8 Rd, Rs1
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Get the absolute value of 8-bit signed integer elements simultaneously.
 *
 * **Description**:\n
 * This instruction calculates the absolute value of 8-bit signed integer elements stored
 * in Rs1 and writes the element results to Rd. If the input number is 0x80, this instruction generates
 * 0x7f as the output and sets the OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.B[x];
 * if (src == 0x80) {
 *   src = 0x7f;
 *   OV = 1;
 * } else if (src[7] == 1) {
 *   src = -src;
 * }
 * Rd.B[x] = src;
 * for RV32: x=7...0,
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKABS8(unsigned long long a)
{
    unsigned long long rd;   /* 64-bit destination written by dkabs8 */

    /* %0 -> rd (write-only register operand), %1 -> a (register input) */
    __ASM volatile("dkabs8 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for DKABS8 ===== */

/* ===== Inline Function Start for DKABS16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N1
 * \brief DKABS16 (64-bit SIMD 16-bit Saturating Absolute)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKABS16 Rd, Rs1
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Get the absolute value of 16-bit signed integer elements simultaneously.
 *
 * **Description**:\n
 * This instruction calculates the absolute value of 16-bit signed integer elements stored
 * in Rs1 and writes the element results to Rd. If the input number is 0x8000, this instruction
 * generates 0x7fff as the output and sets the OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.H[x];
 * if (src == 0x8000) {
 *   src = 0x7fff;
 *   OV = 1;
 * } else if (src[15] == 1) {
 *   src = -src;
 * }
 * Rd.H[x] = src;
 * for RV32: x=3...0,
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKABS16(unsigned long long a)
{
    unsigned long long rd;   /* 64-bit destination written by dkabs16 */

    /* %0 -> rd (write-only register operand), %1 -> a (register input) */
    __ASM volatile("dkabs16 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for DKABS16 ===== */

/* ===== Inline Function Start for DKSLRA8 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N1
 * \brief DKSLRA8 (64-bit SIMD 8-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKSLRA8 Rd, Rs1, Rs2
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit elements logical left (positive) or arithmetic right (negative) shift operation with
 * Q7 saturation for the left shift.
 *
 * **Description**:\n
 * The 8-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
 * based on the value of Rs2[3:0]. Rs2[3:0] is in the signed range of [-2^3, 2^3-1]. A positive Rs2[3:0] means
 * logical left shift and a negative Rs2[3:0] means arithmetic right shift. The shift amount is the
 * absolute value of Rs2[3:0]. However, the behavior of `Rs2[3:0]==-2^3 (0x8)` is defined to be
 * equivalent to the behavior of `Rs2[3:0]==-(2^3-1) (0x9)`.
 * The left-shifted results are saturated to the 8-bit signed integer range of [-2^7, 2^7-1].
 * If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:4] will not affect
 * this instruction.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs2[3:0] < 0) {
 *   sa = -Rs2[3:0];
 *   sa = (sa == 8)? 7 : sa;
 *   Rd.B[x] = SE8(Rs1.B[x][7:sa]);
 * } else {
 *   sa = Rs2[2:0];
 *   res[(7+sa):0] = Rs1.B[x] <<(logic) sa;
 *   if (res > (2^7)-1) {
 *     res[7:0] = 0x7f; OV = 1;
 *   } else if (res < -2^7) {
 *     res[7:0] = 0x80; OV = 1;
 *   }
 *   Rd.B[x] = res[7:0];
 * }
 * for RV32: x=7...0,
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b int type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKSLRA8(unsigned long long a, int b)
{
    unsigned long long rd;   /* 64-bit destination written by dkslra8 */

    /* %0 -> rd (write-only), %1 -> a (64-bit data), %2 -> b (int shift control) */
    __ASM volatile("dkslra8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKSLRA8 ===== */

/* ===== Inline Function Start for DKSLRA16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N1
 * \brief DKSLRA16 (64-bit SIMD 16-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKSLRA16 Rd, Rs1, Rs2
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit elements logical left (positive) or arithmetic right (negative) shift operation with
 * Q15 saturation for the left shift.
 *
 * **Description**:\n
 * The 16-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
 * based on the value of Rs2[4:0]. Rs2[4:0] is in the signed range of [-2^4, 2^4-1]. A positive Rs2[4:0] means
 * logical left shift and a negative Rs2[4:0] means arithmetic right shift. The shift amount is the
 * absolute value of Rs2[4:0]. However, the behavior of `Rs2[4:0]==-2^4 (0x10)` is defined to be
 * equivalent to the behavior of `Rs2[4:0]==-(2^4-1) (0x11)`.
 * The left-shifted results are saturated to the 16-bit signed integer range of [-2^15, 2^15-1].
 * After the shift, saturation, or rounding, the final results are written to
 * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:5] will not affect
 * this instruction.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs2[4:0] < 0) {
 *   sa = -Rs2[4:0];
 *   sa = (sa == 16)? 15 : sa;
 *   Rd.H[x] = SE16(Rs1.H[x][15:sa]);
 * } else {
 *   sa = Rs2[3:0];
 *   res[(15+sa):0] = Rs1.H[x] <<(logic) sa;
 *   if (res > (2^15)-1) {
 *     res[15:0] = 0x7fff; OV = 1;
 *   } else if (res < -2^15) {
 *     res[15:0] = 0x8000; OV = 1;
 *   }
 *   Rd.H[x] = res[15:0];
 * }
 * for RV32: x=3...0,
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b int type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKSLRA16(unsigned long long a, int b)
{
    unsigned long long rd;   /* 64-bit destination written by dkslra16 */

    /* %0 -> rd (write-only), %1 -> a (64-bit data), %2 -> b (int shift control) */
    __ASM volatile("dkslra16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKSLRA16 ===== */

/* ===== Inline Function Start for DKADD8 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N1
 * \brief DKADD8 (64-bit SIMD 8-bit Signed Saturating Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKADD8 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit signed integer element saturating additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 8-bit signed integer elements in Rs1 with the 8-bit signed
 * integer elements in Rs2. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <= 2^7-1), they
 * are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.B[x] + Rs2.B[x];
 * if (res[x] > 127) {
 *   res[x] = 127;
 *   OV = 1;
 * } else if (res[x] < -128) {
 *   res[x] = -128;
 *   OV = 1;
 * }
 * Rd.B[x] = res[x];
 * for RV32: x=7...0,
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKADD8(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;   /* 64-bit destination written by dkadd8 */

    /* %0 -> rd (write-only), %1 -> a, %2 -> b; all bound as "r" operands */
    __ASM volatile("dkadd8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKADD8 ===== */

/* ===== Inline Function Start for DKADD16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N1
 * \brief DKADD16 (64-bit SIMD 16-bit Signed Saturating Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKADD16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element saturating additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 16-bit signed integer elements in Rs1 with the 16-bit signed
 * integer elements in Rs2. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= 2^15-1),
 * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.H[x] + Rs2.H[x];
 * if (res[x] > 32767) {
 *   res[x] = 32767;
 *   OV = 1;
 * } else if (res[x] < -32768) {
 *   res[x] = -32768;
 *   OV = 1;
 * }
 * Rd.H[x] = res[x];
 * for RV32: x=3...0,
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKADD16(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;   /* 64-bit destination written by dkadd16 */

    /* %0 -> rd (write-only), %1 -> a, %2 -> b; all bound as "r" operands */
    __ASM volatile("dkadd16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKADD16 ===== */

/* ===== Inline Function Start for DKSUB8 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N1
 * \brief DKSUB8 (64-bit SIMD 8-bit Signed Saturating Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKSUB8 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 8-bit signed elements saturating subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 8-bit signed integer elements in Rs2 from the 8-bit
 * signed integer elements in Rs1. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <= 2^7-1),
 * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.B[x] - Rs2.B[x];
 * if (res[x] > (2^7)-1) {
 *   res[x] = (2^7)-1;
 *   OV = 1;
 * } else if (res[x] < -2^7) {
 *   res[x] = -2^7;
 *   OV = 1;
 * }
 * Rd.B[x] = res[x];
 * for RV32: x=7...0,
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKSUB8(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;   /* 64-bit destination written by dksub8 */

    /* %0 -> rd (write-only), %1 -> a, %2 -> b; all bound as "r" operands */
    __ASM volatile("dksub8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKSUB8 ===== */

/* ===== Inline Function Start for DKSUB16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N1
 * \brief DKSUB16 (64-bit SIMD 16-bit Signed Saturating Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKSUB16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer elements saturating subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit signed integer elements in Rs2 from the 16-bit
 * signed integer elements in Rs1. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <=
 * 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to
 * Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.H[x] - Rs2.H[x];
 * if (res[x] > (2^15)-1) {
 *   res[x] = (2^15)-1;
 *   OV = 1;
 * } else if (res[x] < -2^15) {
 *   res[x] = -2^15;
 *   OV = 1;
 * }
 * Rd.H[x] = res[x];
 * for RV32: x=3...0,
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKSUB16(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;   /* 64-bit destination written by dksub16 */

    /* %0 -> rd (write-only), %1 -> a, %2 -> b; all bound as "r" operands */
    __ASM volatile("dksub16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKSUB16 ===== */

/* ===== Inline Function Start for DKHMX8 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief DKHMX8 (64-bit SIMD Signed Crossed Saturating Q7 Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKHMX8 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do Q7xQ7 element crossed multiplications simultaneously. The Q14 results are then reduced to Q7 numbers again.
 *
 * **Description**:\n
 * For the `DKHMX8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1 with the
 * bottom 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
 * content of 16-bit chunks in Rs1 with the top 8-bit Q7 content of 16-bit chunks in Rs2.
 *
 * The Q14 results are then right-shifted 7-bits and saturated into Q7 values. The Q7 results are then
 * written into Rd. When both the two Q7 inputs of a multiplication are 0x80, saturation will happen.
 * The result will be saturated to 0x7F and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.B[x+1]; op2t = Rs2.B[x]; // top
 * op1b = Rs1.B[x]; op2b = Rs2.B[x+1]; // bottom
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   if (0x80 != aop | 0x80 != bop) {
 *     res = (aop s* bop) >> 7;
 *   } else {
 *     res= 0x7F;
 *     OV = 1;
 *   }
 * }
 * Rd.H[x/2] = concat(rest, resb);
 * for RV32, x=0,2,4,6
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKHMX8(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;   /* 64-bit destination written by dkhmx8 */

    /* %0 -> rd (write-only), %1 -> a, %2 -> b; all bound as "r" operands */
    __ASM volatile("dkhmx8 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKHMX8 ===== */

/* ===== Inline Function Start for DKHMX16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief DKHMX16 (64-bit SIMD Signed Crossed Saturating Q15 Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKHMX16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do Q15xQ15 element crossed multiplications simultaneously. The Q30 results are then reduced to Q15 numbers again.
 *
 * **Description**:\n
 * For the `DKHMX16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in Rs1 with the
 * bottom 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom 16-bit Q15
 * content of 32-bit chunks in Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2.
 *
 * The Q30 results are then right-shifted 15-bits and saturated into Q15 values. The Q15 results are
 * then written into Rd. When both the two Q15 inputs of a multiplication are 0x8000, saturation will
 * happen. The result will be saturated to 0x7FFF and the overflow flag OV will be set.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.H[x+1]; op2t = Rs2.H[x]; // top
 * op1b = Rs1.H[x]; op2b = Rs2.H[x+1]; // bottom
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   if (0x8000 != aop | 0x8000 != bop) {
 *     res = (aop s* bop) >> 15;
 *   } else {
 *     res= 0x7FFF;
 *     OV = 1;
 *   }
 * }
 * Rd.W[x/2] = concat(rest, resb);
 * for RV32, x=0,2
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKHMX16(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;   /* 64-bit destination written by dkhmx16 */

    /* %0 -> rd (write-only), %1 -> a, %2 -> b; all bound as "r" operands */
    __ASM volatile("dkhmx16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKHMX16 ===== */

/* ===== Inline Function Start for DSMMUL ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief DSMMUL (64-bit MSW 32x32 Signed Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMMUL Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do MSW 32x32 element signed multiplications simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2 and writes the
 * most significant 32-bit multiplication results to the corresponding 32-bit elements of Rd. The 32-bit
 * elements of Rs1 and Rs2 are treated as signed integers. The .u form of the instruction rounds up
 * the most significant 32-bit of the 64-bit multiplication results by adding a 1 to bit 31 of the results.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   res = (aop s* bop)[63:32];
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSMMUL(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;   /* 64-bit destination written by dsmmul */

    /* %0 -> rd (write-only), %1 -> a, %2 -> b; all bound as "r" operands */
    __ASM volatile("dsmmul %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DSMMUL ===== */

/* ===== Inline Function Start for DSMMUL.u ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief  DSMMUL.u (64-bit MSW 32x32 Unsigned Multiply)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMMUL.u Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do MSW 32x32 element unsigned multiplications simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2 and writes the
 * most significant 32-bit multiplication results to the corresponding 32-bit elements of Rd. The 32-bit
 * elements of Rs1 and Rs2 are treated as unsigned integers. The .u form of the instruction rounds up
 * the most significant 32-bit of the 64-bit multiplication results by adding a 1 to bit 31 of the results.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   res = RUND(aop u* bop)[63:32];
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSMMUL_U(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;   /* 64-bit destination written by dsmmul.u */

    /* %0 -> rd (write-only), %1 -> a, %2 -> b; all bound as "r" operands */
    __ASM volatile("dsmmul.u %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DSMMUL.u ===== */

/* ===== Inline Function Start for DKWMMUL ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DKWMMUL (64-bit MSW 32x32 Signed Multiply & Double)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKWMMUL Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do MSW 32x32 element signed multiplications simultaneously and double. The results are written into Rd.
 *
 * **Description**:\n
 * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2. It then shifts
 * the multiplication results one bit to the left and takes the most significant 32-bit results. If the
 * shifted result is greater than 2^31-1, it is saturated to 2^31-1 and the OV flag is set to 1. The final element
 * result is written to Rd. The 32-bit elements of Rs1 and Rs2 are treated as signed integers. The .u
 * form of the instruction additionally rounds up the 64-bit multiplication results by adding a 1 to bit
 * 30 before the shift and saturation operations.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *     res = sat.q31((aop s* bop) << 1)[63:32];
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKWMMUL(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;   /* 64-bit destination written by dkwmmul */

    /* %0 -> rd (write-only), %1 -> a, %2 -> b; all bound as "r" operands */
    __ASM volatile("dkwmmul %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKWMMUL ===== */

/* ===== Inline Function Start for DKWMMUL.u ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DKWMMUL.u (64-bit MSW 32x32 Unsigned Multiply & Double)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKWMMUL.u Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do MSW 32x32 element unsigned multiplications simultaneously and double. The results are written into Rd.
 *
 * **Description**:\n
 * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2. It then shifts
 * the multiplication results one bit to the left and takes the most significant 32-bit results. If the
 * shifted result is greater than 2^31-1, it is saturated to 2^31-1 and the OV flag is set to 1. The final element
 * result is written to Rd. The 32-bit elements of Rs1 and Rs2 are treated as signed integers. The .u
 * form of the instruction additionally rounds up the 64-bit multiplication results by adding a 1 to bit
 * 30 before the shift and saturation operations.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
 *   res = sat.q31(RUND(aop u* bop) << 1)[63:32];
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKWMMUL_U(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;   /* 64-bit destination written by dkwmmul.u */

    /* %0 -> rd (write-only), %1 -> a, %2 -> b; all bound as "r" operands */
    __ASM volatile("dkwmmul.u %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKWMMUL.u ===== */

/* ===== Inline Function Start for DKABS32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DKABS32 (64-bit SIMD 32-bit Saturating Absolute)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKABS32 Rd, Rs1
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Get the absolute value of 32-bit signed integer elements simultaneously.
 *
 * **Description**:\n
 * This instruction calculates the absolute value of 32-bit signed integer elements stored in Rs1 and writes the element
 * results to Rd. If the input number is 0x8000_0000, this instruction generates 0x7fff_ffff as the output and sets the OV
 * bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.W[x];
 * if (src == 0x8000_0000) {
 *   src = 0x7fff_ffff;
 *   OV = 1;
 * } else if (src[31] == 1) {
 *   src = -src;
 * }
 * Rd.W[x] = src;
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKABS32(unsigned long long a)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dkabs32 instruction: a is the only source operand (%1). */
    __ASM volatile("dkabs32 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for DKABS32 ===== */

/* ===== Inline Function Start for DKSLRA32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DKSLRA32 (64-bit SIMD 32-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKSLRA32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit elements logical left (positive) or arithmetic right (negative) shift operation with Q31 saturation for the left shift.
 *
 * **Description**:\n
 * The 32-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically based on the value of Rs2[5:0].
 * Rs2[5:0] is in the signed range of [-2^5, 2^5-1]. A positive Rs2[5:0] means logical left shift and a negative Rs2[5:0]
 * means arithmetic right shift. The shift amount is the absolute value of Rs2[5:0]. However, the behavior of
 * Rs2[5:0]==-2^5 (0x20) is defined to be equivalent to the behavior of Rs2[5:0]==-(2^5-1) (0x21).
 *
 * **Operations**:\n
 * ~~~
 * if (Rs2[5:0] < 0) {
 *   sa = -Rs2[5:0];
 *   sa = (sa == 32)? 31 : sa;
 *   Rd.W[x] = SE32(Rs1.W[x][31:sa]);
 * } else {
 *   sa = Rs2[4:0];
 *   res[(31+sa):0] = Rs1.W[x] <<(logic) sa;
 *   if (res > (2^31)-1) {
 *     res[31:0] = 0x7fff_ffff; OV = 1;
 *   } else if (res < -2^31) {
 *     res[31:0] = 0x8000_0000; OV = 1;
 *   }
 *   Rd.W[x] = res[31:0];
 * }
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b int type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKSLRA32(unsigned long long a, int b)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dkslra32 instruction: a is operand %1, the int b is operand %2. */
    __ASM volatile("dkslra32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKSLRA32 ===== */

/* ===== Inline Function Start for DKADD32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DKADD32 (64-bit SIMD 32-bit Signed Saturating Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKADD32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element saturating additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 32-bit signed integer elements in Rs1 with the 32-bit signed integer elements in Rs2. If any
 * of the results are beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV
 * bit is set to 1. The saturated results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.W[x] + Rs2.W[x];
 * if (res[x] > (2^31)-1) {
 *   res[x] = 0x7fff_ffff;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = 0x8000_0000;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKADD32(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dkadd32 instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dkadd32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKADD32 ===== */

/* ===== Inline Function Start for DKSUB32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DKSUB32 (64-bit SIMD 32-bit Signed Saturating Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKSUB32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element saturating subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit signed integer elements in Rs2 from the 32-bit signed integer elements in Rs1. If
 * any of the results are beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the
 * OV bit is set to 1. The saturated results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * res[x] = Rs1.W[x] - Rs2.W[x];
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[x] = res[x];
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKSUB32(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dksub32 instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dksub32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKSUB32 ===== */

/* ===== Inline Function Start for DRADD16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DRADD16 (64-bit SIMD 16-bit Halving Signed Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DRADD16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element additions simultaneously. The results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 16-bit signed integer elements in Rs1 with the 16-bit signed integer elements in Rs2. The results
 * are first arithmetically right-shifted by 1 bit and then written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = [(Rs1.H[x]) + (Rs2.H[x])] s>> 1;
 * x=3...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DRADD16(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dradd16 instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dradd16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DRADD16 ===== */

/* ===== Inline Function Start for DSUB16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DSUB16 (64-bit SIMD 16-bit Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSUB16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit integer element subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit integer elements in Rs2 from the 16-bit integer elements in Rs1. The results
 * are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = [(Rs1.H[x]) - (Rs2.H[x])] ;
 * x=3...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSUB16(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dsub16 instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dsub16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DSUB16 ===== */

/* ===== Inline Function Start for DRADD32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DRADD32 (64-bit SIMD 32-bit Halving Signed Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DRADD32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element additions simultaneously. The results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 32-bit signed integer elements in Rs1 with the 32-bit signed integer elements in Rs2. The results
 * are first arithmetically right-shifted by 1 bit and then written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = [(Rs1.W[x]) + (Rs2.W[x])] s>> 1;
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DRADD32(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dradd32 instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dradd32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DRADD32 ===== */

/* ===== Inline Function Start for DSUB32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSUB32 (64-bit SIMD 32-bit Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSUB32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit integer element subtractions simultaneously.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit signed integer elements in Rs2 from the 32-bit signed integer elements in Rs1. The
 * results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = [(Rs1.W[x]) - (Rs2.W[x])] ;
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSUB32(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dsub32 instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dsub32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DSUB32 ===== */

/* ===== Inline Function Start for DMSR16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DMSR16 (Signed Multiply Halfs with Right Shift 16-bit and Cross Multiply Halfs with Right Shift 16-bit)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DMSR16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications and cross multiplications from the 16-bit elements of two registers; and each multiplications performs a right shift operation.
 *
 * **Description**:\n
 * For the `DMSR16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in Rs1 with the top 16-bit Q15 content
 * of 32-bit chunks in Rs2, multiply the bottom 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content
 * of 32-bit chunks in Rs2.
 * At the same time, multiply the top 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit
 * chunks in Rs2 and multiply the bottom 16-bit Q15 content of 32-bit chunks in Rs1 with the top 16-bit Q15 content of 32-bit
 * chunks in Rs2. The Q31 results are then right-shifted 16-bits and clipped to Q15 values. The Q15 results are then written
 * into Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[0] = (Rs1.H[0] s* Rs2.H[0]) s>> 16
 * Rd.H[1] = (Rs1.H[1] s* Rs2.H[1]) s>> 16
 * Rd.H[2] = (Rs1.H[1] s* Rs2.H[0]) s>> 16
 * Rd.H[3] = (Rs1.H[0] s* Rs2.H[1]) s>> 16
 * ~~~
 *
 * \param [in]  a unsigned long type of value stored in a
 * \param [in]  b unsigned long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DMSR16(unsigned long a, unsigned long b)
{
    /* The sources are unsigned long, while the destination is unsigned long long. */
    unsigned long long rd;

    /* Single dmsr16 instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dmsr16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DMSR16 ===== */

/* ===== Inline Function Start for DMSR17 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DMSR17 (Signed Multiply Halfs with Right Shift 17-bit and Cross Multiply Halfs with Right Shift 17-bit)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DMSR17 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications and cross multiplications from the 16-bit elements of two registers;
 * and each multiplications performs a right shift operation.
 *
 * **Description**:\n
 * For the `DMSR17` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in Rs1 with the top 16-bit Q15 content
 * of 32-bit chunks in Rs2, multiply the bottom 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content
 * of 32-bit chunks in Rs2.
 * At the same time, multiply the top 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit
 * chunks in Rs2 and multiply the bottom 16-bit Q15 content of 32-bit chunks in Rs1 with the top 16-bit Q15 content of 32-bit
 * chunks in Rs2. The Q31 results are then right-shifted 17-bits and clipped to Q15 values. The Q15 results are then written
 * into Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[0] = (Rs1.H[0] s* Rs2.H[0]) s>> 17
 * Rd.H[1] = (Rs1.H[1] s* Rs2.H[1]) s>> 17
 * Rd.H[2] = (Rs1.H[1] s* Rs2.H[0]) s>> 17
 * Rd.H[3] = (Rs1.H[0] s* Rs2.H[1]) s>> 17
 * ~~~
 *
 * \param [in]  a unsigned long type of value stored in a
 * \param [in]  b unsigned long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DMSR17(unsigned long a, unsigned long b)
{
    /* The sources are unsigned long, while the destination is unsigned long long. */
    unsigned long long rd;

    /* Single dmsr17 instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dmsr17 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DMSR17 ===== */

/* ===== Inline Function Start for DMSR33 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DMSR33 (Signed Multiply with Right Shift 33-bit)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DMSR33 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit multiplications from the 32-bit elements of two registers, and each multiplications performs a right
 * shift operation.
 *
 * **Description**:\n
 * For the `DMSR33` instruction, multiply the top 32-bit Q31 content of 64-bit chunks in Rs1 with the top 32-bit Q31 content
 * of 64-bit chunks in Rs2. At the same time, multiply the bottom 32-bit Q31 content of 64-bit chunks in Rs1 with the bottom
 * 32-bit Q31 content of 64-bit chunks in Rs2.
 * The 64-bit products are then right-shifted 33-bits and clipped to Q31 values. The Q31 results are then written into Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[0] = (Rs1.W[0] s* Rs2.W[0]) s>> 33
 * Rd.W[1] = (Rs1.W[1] s* Rs2.W[1]) s>> 33
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DMSR33(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dmsr33 instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dmsr33 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DMSR33 ===== */

/* ===== Inline Function Start for DMXSR33 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DMXSR33 (Signed Cross Multiply with Right Shift 33-bit)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DMXSR33 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32-bit cross multiplications from the 32-bit elements of two registers, and each multiplications performs a
 * right shift operation.
 *
 * **Description**:\n
 * For the `DMXSR33` instruction, multiply the top 32-bit Q31 content of 64-bit chunks in Rs1 with the bottom 32-bit Q31
 * content of 64-bit chunks in Rs2. At the same time, multiply the bottom 32-bit Q31 content of 64-bit chunks in Rs1 with
 * the top 32-bit Q31 content of 64-bit chunks in Rs2.
 * The Q63 results are then right-shifted 33-bits and clipped to Q31 values. The Q31 results are then written into Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[0] = (Rs1.W[0] s* Rs2.W[1]) s>> 33
 * Rd.W[1] = (Rs1.W[1] s* Rs2.W[0]) s>> 33
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DMXSR33(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dmxsr33 instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dmxsr33 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DMXSR33 ===== */

/* ===== Inline Function Start for DREDAS16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DREDAS16 (Reduced Addition and Reduced Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DREDAS16 Rd, Rs1
 * # Rs1 is an even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do halfs reduced subtraction and halfs reduced addition from a register. The result is written to Rd.
 *
 * **Description**:\n
 * For the `DREDAS16` instruction, subtract the top 16-bit Q15 element from the bottom 16-bit Q15 element of the bottom
 * 32-bit Q31 content of 64-bit chunks in Rs1. At the same time, add the top 16-bit Q15 element with the bottom 16-bit
 * Q15 element of the top 32-bit Q31 content of 64-bit chunks in Rs1. The two Q15 results are then written into Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[0] = Rs1.H[0] - Rs1.H[1]
 * Rd.H[1] = Rs1.H[2] + Rs1.H[3]
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_DREDAS16(unsigned long long a)
{
    /* The source is unsigned long long, while the destination is unsigned long. */
    unsigned long rd;

    /* Single dredas16 instruction: a is the only source operand (%1). */
    __ASM volatile("dredas16 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for DREDAS16 ===== */

/* ===== Inline Function Start for DREDSA16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DREDSA16 (Reduced Subtraction and Reduced Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DREDSA16 Rd, Rs1
 * # Rs1 is an even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do halfs reduced subtraction and halfs reduced addition from a register. The result is written to Rd.
 *
 * **Description**:\n
 * For the `DREDSA16` instruction, add the top 16-bit Q15 element to the bottom 16-bit Q15 element of the bottom
 * 32-bit Q31 content of 64-bit chunks in Rs1. At the same time, subtract the top 16-bit Q15 element from the bottom 16-bit
 * Q15 element of the top 32-bit Q31 content of 64-bit chunks in Rs1. The two Q15 results are then written into Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[0] = Rs1.H[0] + Rs1.H[1]
 * Rd.H[1] = Rs1.H[2] - Rs1.H[3]
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long type
 */
__STATIC_FORCEINLINE unsigned long __RV_DREDSA16(unsigned long long a)
{
    /* The source is unsigned long long, while the destination is unsigned long. */
    unsigned long rd;

    /* Single dredsa16 instruction: a is the only source operand (%1). */
    __ASM volatile("dredsa16 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for DREDSA16 ===== */

/* ===== Inline Function Start for DKCLIP64 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DKCLIP64 (64-bit Clipped to 16-bit Saturation Value)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKCLIP64 Rd, Rs1
 * # Rs1 is an even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Arithmetically right-shift the 64-bit input by 15 bits, convert the result to a 32-bit int, then saturate it and
 * clip the result to 16-bit Q15.
 *
 * **Description**:\n
 * For the `DKCLIP64` instruction, shift the input 15 bits to the right and convert the result to 32-bit int type, after
 * which the value is saturated to limit the data to between 2^15-1 and -2^15. The result is converted to 16-bit Q15 type. The
 * final results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * const int32_t max = (int32_t)((1U << 15U) - 1U);
 * const int32_t min = -1 - max ;
 * int32_t val = (int32_t)(Rs1 s>> 15);
 * if (val > max) {
 *   Rd = max;
 * } else if (val < min) {
 *   Rd = min;
 * } else {
 *   Rd = (int16_t)val;
 * }
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in int16_t type
 */
__STATIC_FORCEINLINE int16_t __RV_DKCLIP64(unsigned long long a)
{
    /* The destination operand is bound to an int16_t, matching the return type. */
    int16_t rd;

    /* Single dkclip64 instruction: a is the only source operand (%1). */
    __ASM volatile("dkclip64 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for DKCLIP64 ===== */

/* ===== Inline Function Start for DKMDA ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DKMDA (Signed Multiply Two Halfs and Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKMDA Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then adds the two 32-bit results together.
 * The addition result may be saturated.
 *
 * **Description**:\n
 * This instruction multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of the
 * 32-bit elements of Rs2 and then adds the result to the result of multiplying the top 16-bit content of the 32-bit elements of
 * Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
 * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^31-1. The final results are
 * written to Rd. The 16-bit contents are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs1.W[x] != 0x80008000) or (Rs2.W[x] != 0x80008000){
 *   Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
 * } else {
 *   Rd.W[x] = 0x7fffffff;
 *   OV = 1;
 * }
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKMDA(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dkmda instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dkmda %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKMDA ===== */

/* ===== Inline Function Start for DKMXDA ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DKMXDA (Signed Crossed Multiply Two Halfs and Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKMXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then adds the two 32-bit results together.
 * The addition result may be saturated.
 * * DKMXDA: top*bottom + top*bottom (per 32-bit element)
 *
 * **Description**:\n
 * This instruction multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit
 * elements of Rs2 and then adds the result to the result of multiplying the top 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2.
 * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^31-1. The final results are
 * written to Rd. The 16-bit contents are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * if (Rs1.W[x] != 0x80008000) or (Rs2.W[x] != 0x80008000){
 * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
 * } else {
 * Rd.W[x] = 0x7fffffff;
 * OV = 1;
 * }
 * x=1...0
 * ~~~
 *
 * \param [in]  a    unsigned long long type of value stored in a
 * \param [in]  b    unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKMXDA(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dkmxda instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dkmxda %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKMXDA ===== */

/* ===== Inline Function Start for DSMDRS ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DSMDRS (Signed Multiply Two Halfs and Reverse Subtract)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMDRS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then perform a subtraction operation
 * between the two 32-bit results.
 * * DSMDRS: bottom*bottom - top*top (per 32-bit element)
 *
 * **Description**:\n
 * This instruction multiplies the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit
 * elements of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of the 32-bit elements
 * of Rs1 with the bottom 16-bit content of the 32-bit elements of Rs2.
 * The subtraction result is written to the corresponding 32-bit element of Rd (The 16-bit contents of multiplication are
 * treated as signed integers).
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]); x = 1...0
 * ~~~
 *
 * \param [in]  a    unsigned long long type of value stored in a
 * \param [in]  b    unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSMDRS(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dsmdrs instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dsmdrs %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DSMDRS ===== */

/* ===== Inline Function Start for DSMXDS ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DSMXDS (Signed Crossed Multiply Two Halfs and Subtract)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMXDS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then perform a subtraction operation
 * between the two 32-bit results.
 * * DSMXDS: top*bottom - bottom*top (per 32-bit element)
 *
 * **Description**:\n
 * This instruction multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit
 * elements of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of the 32-bit elements
 * of Rs1 with the bottom 16-bit content of the 32-bit elements of Rs2.
 * The subtraction result is written to the corresponding 32-bit element of Rd. The 16-bit contents of multiplication are
 * treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]); x = 1...0
 * ~~~
 *
 * \param [in]  a    unsigned long long type of value stored in a
 * \param [in]  b    unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSMXDS(unsigned long long a, unsigned long long b)
{
    unsigned long long rd;  /* destination operand (%0) of the instruction */

    /* Single dsmxds instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dsmxds %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DSMXDS ===== */

/* ===== Inline Function Start for DSMBB32 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DSMBB32 (Signed Multiply Bottom Word & Bottom Word)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMBB32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element of a register with the signed 32-bit element of another register and write the 64-bit result to a third register.
 * * DSMBB32: bottom*bottom
 *
 * **Description**:\n
 * This instruction multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit element of Rs2. The 64-bit multiplication result is written to Rd.
 * The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = (Rs1.W[0] * Rs2.W[0]);
 * Rd = res;
 * ~~~
 *
 * \param [in]  a    unsigned long long type of value stored in a
 * \param [in]  b    unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DSMBB32(unsigned long long a, unsigned long long b)
{
    /* The destination is a signed long long, matching the return type. */
    long long rd;

    /* Single dsmbb32 instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dsmbb32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DSMBB32 ===== */

/* ===== Inline Function Start for DSMBB32.sra14 ===== */
/**
 * \ingroup NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DSMBB32.sra14 (Signed Multiply Bottom Word & Bottom Word with Right Shift 14)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMBB32.sra14 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element of a register with the signed 32-bit element of another register, then right shift
 * 14 bits, finally write the 64-bit result to a third register.
 * * DSMBB32.sra14: bottom*bottom s>> 14
 *
 * **Description**:\n
 * This instruction multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit element of Rs2. The 64-bit multiplication result is written to Rd after right shift 14-bit.
 * The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = (Rs1.W[0] * Rs2.W[0]) s>> 14;
 * Rd = res;
 * ~~~
 *
 * \param [in]  a    unsigned long long type of value stored in a
 * \param [in]  b    unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DSMBB32_SRA14(unsigned long long a, unsigned long long b)
{
    /* The destination is a signed long long, matching the return type. */
    long long rd;

    /* Single dsmbb32.sra14 instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dsmbb32.sra14 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DSMBB32.sra14 ===== */

/* ===== Inline Function Start for DSMBB32.sra32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief   DSMBB32.sra32 (Signed Multiply Bottom Word & Bottom Word with Right Shift 32)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMBB32.sra32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element of a register with the signed 32-bit element of another register, then right shift
 * 32 bits, finally write the 64-bit result to a third register.
 * * DSMBB32.sra32: bottom*bottom s >> 32
 *
 * **Description**:\n
 * This instruction multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit element of Rs2.
 * The 64-bit multiplication result is written to Rd after right shift 32-bit. The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = (Rs1.W[0] * Rs2.W[0]) s>> 32;
 * Rd = res;
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DSMBB32_SRA32(unsigned long long a, unsigned long long b)
{
    /* The destination is a signed long long, matching the return type. */
    long long rd;

    /* Single dsmbb32.sra32 instruction: a is operand %1, b is operand %2. */
    __ASM volatile("dsmbb32.sra32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DSMBB32.sra32 ===== */

/* ===== Inline Function Start for DSMBT32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSMBT32 (Signed Multiply Bottom Word & Top Word)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMBT32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element of a register with the signed 32-bit element of another register and write the 64-bit
 * result to a third register.
 * * DSMBT32: bottom*top
 *
 * **Description**:\n
 * This instruction multiplies the bottom 32-bit element of Rs1 with the top 32-bit element of Rs2. The 64-bit multiplication
 * result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = (Rs1.W[0] * Rs2.W[1]);
 * Rd = res;
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DSMBT32(unsigned long long a, unsigned long long b)
{
    /* Rd <- signed 64-bit product of the bottom word of a (Rs1) and the top word of b (Rs2). */
    long long rd;

    __ASM volatile(
        "dsmbt32 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DSMBT32 ===== */

/* ===== Inline Function Start for DSMBT32.sra14 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSMBT32.sra14 (Signed Multiply Bottom Word & Top Word with Right Shift 14)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMBT32.sra14 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element of a register with the signed 32-bit element of another register, then right shift
 * the product by 14 bits, and finally write the 64-bit result to a third register.
 * * DSMBT32.sra14: bottom*top s>> 14
 *
 * **Description**:\n
 * This instruction multiplies the bottom 32-bit element of Rs1 with the top 32-bit element of Rs2. The 64-bit multiplication
 * result is written to Rd after right shift 14-bit. The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = (Rs1.W[0] * Rs2.W[1]) s>> 14;
 * Rd = res;
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DSMBT32_SRA14(unsigned long long a, unsigned long long b)
{
    /* Rd <- (bottom word of a * top word of b) s>> 14, operands treated as signed. */
    long long rd;

    __ASM volatile(
        "dsmbt32.sra14 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DSMBT32.sra14 ===== */

/* ===== Inline Function Start for DSMBT32.sra32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSMBT32.sra32 (Signed Multiply Bottom Word & Top Word with Right Shift 32)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMBT32.sra32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element of a register with the signed 32-bit element of another register, then right shift
 * the product by 32 bits, and finally write the 64-bit result to a third register.
 * * DSMBT32.sra32: bottom*top s>> 32
 *
 * **Description**:\n
 * This instruction multiplies the bottom 32-bit element of Rs1 with the top 32-bit element of Rs2. The 64-bit multiplication
 * result is written to Rd after right shift 32-bit. The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = (Rs1.W[0] * Rs2.W[1]) s>> 32;
 * Rd = res;
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DSMBT32_SRA32(unsigned long long a, unsigned long long b)
{
    /* Rd <- (bottom word of a * top word of b) s>> 32, operands treated as signed. */
    long long rd;

    __ASM volatile(
        "dsmbt32.sra32 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DSMBT32.sra32 ===== */

/* ===== Inline Function Start for DSMTT32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSMTT32 (Signed Multiply Top Word & Top Word)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMTT32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element of a register with the signed 32-bit element of another register and write the 64-bit
 * result to a third register.
 * * DSMTT32: top*top
 *
 * **Description**:\n
 * This instruction multiplies the top 32-bit element of Rs1 with the top 32-bit element of Rs2. The 64-bit multiplication
 * result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rs1.W[1] * Rs2.W[1];
 * Rd = res;
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DSMTT32(unsigned long long a, unsigned long long b)
{
    /* Rd <- a.W[1] * b.W[1]: signed 64-bit product of the two top words. */
    long long rd;

    __ASM volatile(
        "dsmtt32 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DSMTT32 ===== */

/* ===== Inline Function Start for DSMTT32.sra14 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSMTT32.sra14 (Signed Multiply Top Word & Top Word with Right Shift 14-bit)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMTT32.sra14 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element of a register with the signed 32-bit element of another register, then right shift
 * the product by 14 bits, and finally write the 64-bit result to a third register.
 * * DSMTT32.sra14: top*top s>> 14
 *
 * **Description**:\n
 * This instruction multiplies the top 32-bit element of Rs1 with the top 32-bit element of Rs2. The 64-bit multiplication
 * result is written to Rd after right shift 14-bit. The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = (Rs1.W[1] * Rs2.W[1]) s>> 14;
 * Rd = res;
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DSMTT32_SRA14(unsigned long long a, unsigned long long b)
{
    /* Rd <- signed product of the top words of a and b, shifted right by 14 bits. */
    long long rd;

    __ASM volatile(
        "dsmtt32.sra14 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DSMTT32.sra14 ===== */

/* ===== Inline Function Start for DSMTT32.sra32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSMTT32.sra32 (Signed Multiply Top Word & Top Word with Right Shift 32-bit)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMTT32.sra32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element of a register with the signed 32-bit element of another register, then right shift
 * the product by 32 bits, and finally write the 64-bit result to a third register.
 * * DSMTT32.sra32: top*top s>> 32
 *
 * **Description**:\n
 * This instruction multiplies the top 32-bit element of Rs1 with the top 32-bit element of Rs2. The 64-bit multiplication
 * result is written to Rd after right shift 32-bit. The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = (Rs1.W[1] * Rs2.W[1]) s>> 32;
 * Rd = res;
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DSMTT32_SRA32(unsigned long long a, unsigned long long b)
{
    /* Rd <- signed product of the top words of a and b, shifted right by 32 bits. */
    long long rd;

    __ASM volatile(
        "dsmtt32.sra32 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DSMTT32.sra32 ===== */

/* ===== Inline Function Start for DPKBB32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DPKBB32 (Pack Two 32-bit Data from Both Bottom Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DPKBB32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Pack 32-bit data from 64-bit chunks in two registers.
 * * DPKBB32: bottom.bottom
 *
 * **Description**:\n
 * This instruction moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
 *
 * **Operations**:\n
 * ~~~
 * Rd = CONCAT(Rs1.W[0], Rs2.W[0]);
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DPKBB32(unsigned long long a, unsigned long long b)
{
    /* Rd <- CONCAT(a.W[0], b.W[0]): bottom word of a on top, bottom word of b below. */
    unsigned long long rd;

    __ASM volatile(
        "dpkbb32 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DPKBB32 ===== */

/* ===== Inline Function Start for DPKBT32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DPKBT32 (Pack Two 32-bit Data from Bottom and Top Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DPKBT32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Pack 32-bit data from 64-bit chunks in two registers.
 * * DPKBT32: bottom.top
 *
 * **Description**:\n
 * This instruction moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
 *
 * **Operations**:\n
 * ~~~
 * Rd = CONCAT(Rs1.W[0], Rs2.W[1]);
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DPKBT32(unsigned long long a, unsigned long long b)
{
    /* Rd <- CONCAT(a.W[0], b.W[1]): bottom word of a on top, top word of b below. */
    unsigned long long rd;

    __ASM volatile(
        "dpkbt32 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DPKBT32 ===== */

/* ===== Inline Function Start for DPKTT32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DPKTT32 (Pack Two 32-bit Data from Both Top Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DPKTT32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Pack 32-bit data from 64-bit chunks in two registers.
 * * DPKTT32: top.top
 *
 * **Description**:\n
 * This instruction moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
 *
 * **Operations**:\n
 * ~~~
 * Rd = CONCAT(Rs1.W[1], Rs2.W[1]);
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DPKTT32(unsigned long long a, unsigned long long b)
{
    /* Rd <- CONCAT(a.W[1], b.W[1]): top word of a on top, top word of b below. */
    unsigned long long rd;

    __ASM volatile(
        "dpktt32 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DPKTT32 ===== */

/* ===== Inline Function Start for DPKTB32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DPKTB32 (Pack Two 32-bit Data from Top and Bottom Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DPKTB32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Pack 32-bit data from 64-bit chunks in two registers.
 * * DPKTB32: top.bottom
 *
 * **Description**:\n
 * This instruction moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
 *
 * **Operations**:\n
 * ~~~
 * Rd = CONCAT(Rs1.W[1], Rs2.W[0]);
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DPKTB32(unsigned long long a, unsigned long long b)
{
    /* Rd <- CONCAT(a.W[1], b.W[0]): top word of a on top, bottom word of b below. */
    unsigned long long rd;

    __ASM volatile(
        "dpktb32 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DPKTB32 ===== */

/* ===== Inline Function Start for DPKTB16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DPKTB16 (Pack Two 16-bit Data from Top and Bottom Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DPKTB16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Pack 16-bit data from 32-bit chunks in two registers.
 * * DPKTB16: top.bottom
 *
 * **Description**:\n
 * This instruction moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]);
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DPKTB16(unsigned long long a, unsigned long long b)
{
    /* For each 32-bit word x: Rd.W[x] <- CONCAT(a.W[x][31:16], b.W[x][15:0]). */
    unsigned long long rd;

    __ASM volatile(
        "dpktb16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DPKTB16 ===== */

/* ===== Inline Function Start for DPKBB16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DPKBB16 (Pack Two 16-bit Data from Both Bottom Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DPKBB16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Pack 16-bit data from 32-bit chunks in two registers.
 * * DPKBB16: bottom.bottom
 *
 * **Description**:\n
 * This instruction moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]);
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DPKBB16(unsigned long long a, unsigned long long b)
{
    /* For each 32-bit word x: Rd.W[x] <- CONCAT(a.W[x][15:0], b.W[x][15:0]). */
    unsigned long long rd;

    __ASM volatile(
        "dpkbb16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DPKBB16 ===== */

/* ===== Inline Function Start for DPKBT16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DPKBT16 (Pack Two 16-bit Data from Bottom and Top Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DPKBT16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Pack 16-bit data from 32-bit chunks in two registers.
 * * DPKBT16: bottom.top
 *
 * **Description**:\n
 * This instruction moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]);
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DPKBT16(unsigned long long a, unsigned long long b)
{
    /* For each 32-bit word x: Rd.W[x] <- CONCAT(a.W[x][15:0], b.W[x][31:16]). */
    unsigned long long rd;

    __ASM volatile(
        "dpkbt16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DPKBT16 ===== */

/* ===== Inline Function Start for DPKTT16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DPKTT16 (Pack Two 16-bit Data from Both Top Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DPKTT16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Pack 16-bit data from 32-bit chunks in two registers.
 * * DPKTT16: top.top
 *
 * **Description**:\n
 * This instruction moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]);
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DPKTT16(unsigned long long a, unsigned long long b)
{
    /* For each 32-bit word x: Rd.W[x] <- CONCAT(a.W[x][31:16], b.W[x][31:16]). */
    unsigned long long rd;

    __ASM volatile(
        "dpktt16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DPKTT16 ===== */

/* ===== Inline Function Start for DSRA16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSRA16 (SIMD 16-bit Shift Right Arithmetic)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSRA16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit element arithmetic right shift operations simultaneously. The shift amount is a variable from a GPR.
 *
 * **Description**:\n
 * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out bits are filled with the sign-bit of
 * the data elements. The shift amount is specified by the low-order 4-bits of the value in the Rs2 register. And the results
 * are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * sa = Rs2[3:0];
 * if (sa != 0)
 * {
 * Rd.H[x] = SE16(Rs1.H[x][15:sa]);
 * } else {
 * Rd = Rs1;
 * }
 * x=3...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSRA16(unsigned long long a, unsigned long b)
{
    /*
     * Each 16-bit element of a is shifted right arithmetically; the shift
     * amount is taken from the low-order 4 bits of b.
     */
    unsigned long long rd;

    __ASM volatile(
        "dsra16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DSRA16 ===== */

/* ===== Inline Function Start for DADD16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DADD16 (16-bit Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DADD16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit integer element additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 16-bit unsigned integer elements in Rs1 with the 16-bit unsigned integer elements in Rs2. And
 * the results are written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = Rs1.H[x] + Rs2.H[x];
 * x=3...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DADD16(unsigned long long a, unsigned long long b)
{
    /* Element-wise 16-bit addition: Rd.H[x] <- a.H[x] + b.H[x] for x = 3..0. */
    unsigned long long rd;

    __ASM volatile(
        "dadd16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DADD16 ===== */

/* ===== Inline Function Start for DADD32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DADD32 (32-bit Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DADD32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit integer element additions simultaneously.
 *
 * **Description**:\n
 * This instruction adds the 32-bit integer elements in Rs1 with the 32-bit integer elements in Rs2, and then writes the 32-bit
 * element results to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = Rs1.W[x] + Rs2.W[x];
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DADD32(unsigned long long a, unsigned long long b)
{
    /* Element-wise 32-bit addition: Rd.W[x] <- a.W[x] + b.W[x] for x = 1..0. */
    unsigned long long rd;

    __ASM volatile(
        "dadd32 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DADD32 ===== */

/* ===== Inline Function Start for DSMBB16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSMBB16 (Signed Multiply Bottom Half & Bottom Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMBB16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed 16-bit content of the 32-bit elements
 * of another register and write the result to a third register.
 * * DSMBB16: W[x].bottom*W[x].bottom
 *
 * **Description**:\n
 * For the `DSMBB16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom
 * 16-bit content of the 32-bit elements of Rs2.
 * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[0];
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSMBB16(unsigned long long a, unsigned long long b) /* pass */
{
    /* For each 32-bit word x: Rd.W[x] <- a.W[x].H[0] * b.W[x].H[0] (signed 16-bit operands). */
    unsigned long long rd;

    __ASM volatile(
        "dsmbb16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DSMBB16 ===== */

/* ===== Inline Function Start for DSMBT16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSMBT16 (Signed Multiply Bottom Half & Top Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMBT16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed 16-bit content of the 32-bit
 * elements of another register and write the result to a third register.
 * * DSMBT16: W[x].bottom *W[x].top
 *
 * **Description**:\n
 * For the `DSMBT16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with the top 16-bit
 * content of the 32-bit elements of Rs2.
 * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[1];
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSMBT16(unsigned long long a, unsigned long long b) /* pass */
{
    /* For each 32-bit word x: Rd.W[x] <- a.W[x].H[0] * b.W[x].H[1] (signed 16-bit operands). */
    unsigned long long rd;

    __ASM volatile(
        "dsmbt16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DSMBT16 ===== */

/* ===== Inline Function Start for DSMTT16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSMTT16 (Signed Multiply Top Half & Top Half)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMTT16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed 16-bit content of the 32-bit
 * elements of another register and write the result to a third register.
 * * DSMTT16: W[x].top * W[x].top
 *
 * **Description**:\n
 * For the `DSMTT16` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit
 * content of the 32-bit elements of Rs2.
 * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[1];
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSMTT16(unsigned long long a, unsigned long long b)
{
    /* For each 32-bit word x: Rd.W[x] <- a.W[x].H[1] * b.W[x].H[1] (signed 16-bit operands). */
    unsigned long long rd;

    __ASM volatile(
        "dsmtt16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DSMTT16 ===== */

/* ===== Inline Function Start for DRCRSA16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DRCRSA16 (16-bit Signed Halving Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DRCRSA16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element subtraction and 16-bit signed integer element addition in a 32-bit chunk simultaneously.
 * Operands are from crossed positions in 32-bit chunks. The results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit signed integer in [15:0] of 32-bit chunks in Rs2 from the 16-bit signed integer in
 * [31:16] of 32-bit chunks in Rs1, and adds the 16-bit signed integer in [31:16] of 32-bit chunks in Rs2 to the 16-bit signed
 * integer in [15:0] of 32-bit chunks in Rs1. The element results are first arithmetically right-shifted by 1 bit and then
 * written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of 32-bit chunks in Rd for addition.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][15:0]) s>> 1;
 * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][31:16]) s>> 1;
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DRCRSA16(unsigned long long a, unsigned long long b)
{
    /*
     * Halving cross subtract/add on each 32-bit word x:
     *   Rd.W[x][31:16] <- (a.W[x][31:16] - b.W[x][15:0]) s>> 1
     *   Rd.W[x][15:0]  <- (a.W[x][15:0] + b.W[x][31:16]) s>> 1
     */
    unsigned long long rd;

    __ASM volatile(
        "drcrsa16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DRCRSA16 ===== */

/* ===== Inline Function Start for DRCRSA32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DRCRSA32 (32-bit Signed Halving Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DRCRSA32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element subtraction and 32-bit signed integer element addition in a 64-bit chunk simultaneously.
 * Operands are from crossed 32-bit elements. The results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit signed integer element in [31:0] of Rs2 from the 32-bit signed integer element in
 * [63:32] of Rs1, and adds the 32-bit signed integer element in [63:32] of Rs2 to the 32-bit signed integer element in [31:0]
 * of Rs1. The element results are first arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for
 * subtraction and [31:0] of Rd for addition.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = (Rs1.W[1] - Rs2.W[0]) s>> 1;
 * Rd.W[0] = (Rs1.W[0] + Rs2.W[1]) s>> 1;
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DRCRSA32(unsigned long long a, unsigned long long b)
{
    /*
     * Halving cross subtract/add on the two 32-bit words:
     *   Rd.W[1] <- (a.W[1] - b.W[0]) s>> 1
     *   Rd.W[0] <- (a.W[0] + b.W[1]) s>> 1
     */
    unsigned long long rd;

    __ASM volatile(
        "drcrsa32 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DRCRSA32 ===== */

/* ===== Inline Function Start for DRCRAS16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DRCRAS16 (16-bit Signed Halving Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DRCRAS16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element addition and 16-bit signed integer element subtraction in a 32-bit chunk simultaneously.
 * Operands are from crossed positions in 32-bit chunks. The results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 16-bit signed integer in [31:16] of 32-bit chunks in Rs1 with the 16-bit signed integer in
 * [15:0] of 32-bit chunks in Rs2, and subtracts the 16-bit signed integer in [31:16] of 32-bit chunks in Rs2 from the 16-bit
 * signed integer in [15:0] of 32-bit chunks in Rs1. The element results are first arithmetically right-shifted by 1 bit and
 * then written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit chunks in Rd for subtraction.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][15:0]) s>> 1;
 * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][31:16]) s>> 1;
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DRCRAS16(unsigned long long a, unsigned long long b)
{
    /*
     * Halving cross add/subtract on each 32-bit word x:
     *   Rd.W[x][31:16] <- (a.W[x][31:16] + b.W[x][15:0]) s>> 1
     *   Rd.W[x][15:0]  <- (a.W[x][15:0] - b.W[x][31:16]) s>> 1
     */
    unsigned long long rd;

    __ASM volatile(
        "drcras16 %0, %1, %2"
        : "=r"(rd)
        : "r"(a), "r"(b));

    return rd;
}
/* ===== Inline Function End for DRCRAS16 ===== */

/* ===== Inline Function Start for DRCRAS32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DRCRAS32 (32-bit Signed Halving Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DRCRAS32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element addition and 32-bit signed integer element subtraction in a 64-bit chunk simultaneously.
 * Operands are from crossed 32-bit elements. The results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction adds the 32-bit signed integer element in [63:32] of Rs1 with the 32-bit signed integer element in [31:0]
 * of Rs2, and subtracts the 32-bit signed integer element in [63:32] of Rs2 from the 32-bit signed integer element in [31:0]
 * of Rs1. The element results are first arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for addition
 * and [31:0] of Rd for subtraction.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = (Rs1.W[1] + Rs2.W[0]) s>> 1;
 * Rd.W[0] = (Rs1.W[0] - Rs2.W[1]) s>> 1;
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DRCRAS32(unsigned long long a, unsigned long long b)
{
    unsigned long long result;
    /*
     * Halving cross add/subtract on the two 32-bit words:
     *   Rd.W[1] <- (a.W[1] + b.W[0]) s>> 1
     *   Rd.W[0] <- (a.W[0] - b.W[1]) s>> 1
     * The mnemonic is written in lower case, matching the other DSP
     * intrinsics, so assembling it does not depend on the assembler
     * accepting upper-case opcodes.
     */
    __ASM volatile("drcras32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
    return result;
}
/* ===== Inline Function End for DRCRAS32 ===== */

/* ===== Inline Function Start for DKCRAS16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DKCRAS16 (16-bit Signed Saturating Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKCRAS16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element saturating addition and 16-bit signed integer element saturating subtraction in a 32-bit
 * chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
 *
 * **Description**:\n
 * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1 with the 16-bit signed integer
 * element in [15:0] of 32-bit chunks in Rs2; at the same time, it subtracts the 16-bit signed integer element in [31:16] of
 * 32-bit chunks in Rs2 from the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs1.
 * If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV
 * bit is set to 1. The saturated results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit chunks
 * in Rd for subtraction.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[x][31:16] + Rs2.W[x][15:0];
 * res2 = Rs1.W[x][15:0] - Rs2.W[x][31:16];
 * for (res in [res1, res2]) {
 *   if (res > (2^15)-1) {
 *     res = (2^15)-1;
 *     OV = 1;
 *   } else if (res < -2^15) {
 *     res = -2^15;
 *     OV = 1;
 *   }
 * }
 * Rd.W[x][31:16] = res1;
 * Rd.W[x][15:0] = res2;
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKCRAS16(unsigned long long a, unsigned long long b)
{
    /* Register output operand (%0); a and b are the register inputs (%1, %2). */
    unsigned long long rd;

    __ASM volatile("dkcras16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKCRAS16 ===== */

/* ===== Inline Function Start for DKCRSA16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DKCRSA16 (16-bit Signed Saturating Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKCRSA16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element saturating subtraction and 16-bit signed integer element saturating addition in a 32-bit
 * chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2 from the 16-bit signed integer
 * element in [31:16] of 32-bit chunks in Rs1; at the same time, it adds the 16-bit signed integer element in [31:16] of 32-bit
 * chunks in Rs2 with the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs1.
 * If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV
 * bit is set to 1. The saturated results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of 32-bit chunks
 * in Rd for addition.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[x][31:16] - Rs2.W[x][15:0];
 * res2 = Rs1.W[x][15:0] + Rs2.W[x][31:16];
 * for (res in [res1, res2]) {
 *   if (res > (2^15)-1) {
 *     res = (2^15)-1;
 *     OV = 1;
 *   } else if (res < -2^15) {
 *     res = -2^15;
 *     OV = 1;
 *   }
 * }
 * Rd.W[x][31:16] = res1;
 * Rd.W[x][15:0] = res2;
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKCRSA16(unsigned long long a, unsigned long long b)
{
    /* Register output operand (%0); a and b are the register inputs (%1, %2). */
    unsigned long long rd;

    __ASM volatile("dkcrsa16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKCRSA16 ===== */

/* ===== Inline Function Start for DRSUB16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DRSUB16 (16-bit Signed Halving Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DRSUB16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element subtractions simultaneously. The results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit signed integer elements in Rs2 from the 16-bit signed integer elements in Rs1. The
 * results are first arithmetically right-shifted by 1 bit and then written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.H[x] = (Rs1.H[x] - Rs2.H[x]) s>> 1;
 * x=3...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DRSUB16(unsigned long long a, unsigned long long b)
{
    /* Register output operand (%0); a and b are the register inputs (%1, %2). */
    unsigned long long rd;

    __ASM volatile("drsub16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DRSUB16 ===== */

/* ===== Inline Function Start for DSTSA32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSTSA32 (32-bit Straight Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSTSA32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit integer element subtraction and 32-bit integer element addition in a 64-bit chunk simultaneously. Operands are
 * from corresponding 32-bit elements.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer element in [63:32] of Rs1,
 * and writes the result to [63:32] of Rd; at the same time, it adds the 32-bit integer element in [31:0] of Rs1 with the 32-bit
 * integer element in [31:0] of Rs2, and writes the result to [31:0] of Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = Rs1.W[1] - Rs2.W[1];
 * Rd.W[0] = Rs1.W[0] + Rs2.W[0];
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSTSA32(unsigned long long a, unsigned long long b)
{
    /* Register output operand (%0); a and b are the register inputs (%1, %2). */
    unsigned long long rd;

    __ASM volatile("dstsa32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DSTSA32 ===== */

/* ===== Inline Function Start for DSTAS32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSTAS32 (SIMD 32-bit Straight Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSTAS32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit integer element addition and 32-bit integer element subtraction in a 64-bit chunk simultaneously. Operands are
 * from corresponding 32-bit elements.
 *
 * **Description**:\n
 * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit integer element in [63:32] of Rs2,
 * and writes the result to [63:32] of Rd; at the same time, it subtracts the 32-bit integer element in [31:0] of Rs2
 * from the 32-bit integer element in [31:0] of Rs1, and writes the result to [31:0] of Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = Rs1.W[1] + Rs2.W[1];
 * Rd.W[0] = Rs1.W[0] - Rs2.W[0];
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSTAS32(unsigned long long a, unsigned long long b)
{
    /* Register output operand (%0) of the instruction; returned to the caller. */
    unsigned long long result;

    /*
     * The mnemonic is spelled in lowercase like the other wrappers in this
     * header, so it does not depend on the assembler accepting uppercase
     * mnemonics. a and b are the register inputs (%1, %2).
     */
    __ASM volatile("dstas32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
    return result;
}
/* ===== Inline Function End for DSTAS32 ===== */

/* ===== Inline Function Start for DKCRSA32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DKCRSA32 (32-bit Signed Saturating Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKCRSA32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element saturating subtraction and 32-bit signed integer element saturating addition in a 64-bit
 * chunk simultaneously. Operands are from crossed 32-bit elements.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element in [63:32] of Rs1; at
 * the same time, it adds the 32-bit integer element in [31:0] of Rs1 with the 32-bit integer element in [63:32] of Rs2. If any
 * of the results are beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is
 * set to 1. The saturated results are written to [63:32] of Rd for subtraction and [31:0] of Rd for addition.
 *
 * **Operations**:\n
 * ~~~
 * res[1] = Rs1.W[1] - Rs2.W[0];
 * res[0] = Rs1.W[0] + Rs2.W[1];
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[1] = res[1];
 * Rd.W[0] = res[0];
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKCRSA32(unsigned long long a, unsigned long long b)
{
    /* Register output operand (%0); a and b are the register inputs (%1, %2). */
    unsigned long long rd;

    __ASM volatile("dkcrsa32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKCRSA32 ===== */

/* ===== Inline Function Start for DKCRAS32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DKCRAS32 (32-bit Signed Saturating Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKCRAS32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element saturating addition and 32-bit signed integer element saturating subtraction in a 64-bit
 * chunk simultaneously. Operands are from crossed 32-bit elements.
 *
 * **Description**:\n
 * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit integer element in [31:0] of Rs2; at the
 * same time, it subtracts the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer element in [31:0] of Rs1. If any
 * of the results are beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is
 * set to 1. The saturated results are written to [63:32] of Rd for addition and [31:0] of Rd for subtraction.
 *
 * **Operations**:\n
 * ~~~
 * res[1] = Rs1.W[1] + Rs2.W[0];
 * res[0] = Rs1.W[0] - Rs2.W[1];
 * if (res[x] > (2^31)-1) {
 *   res[x] = (2^31)-1;
 *   OV = 1;
 * } else if (res[x] < -2^31) {
 *   res[x] = -2^31;
 *   OV = 1;
 * }
 * Rd.W[1] = res[1];
 * Rd.W[0] = res[0];
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKCRAS32(unsigned long long a, unsigned long long b)
{
    /* Register output operand (%0); a and b are the register inputs (%1, %2). */
    unsigned long long rd;

    __ASM volatile("dkcras32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKCRAS32 ===== */

/* ===== Inline Function Start for DCRSA32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DCRSA32 (32-bit Cross Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DCRSA32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit integer element subtraction and 32-bit integer element addition in a 64-bit chunk simultaneously. Operands are
 * from crossed 32-bit elements.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element in [63:32] of Rs1, and
 * writes the result to [63:32] of Rd; at the same time, it adds the 32-bit integer element in [31:0] of Rs1 with the 32-bit
 * integer element in [63:32] of Rs2, and writes the result to [31:0] of Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = Rs1.W[1] - Rs2.W[0];
 * Rd.W[0] = Rs1.W[0] + Rs2.W[1];
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DCRSA32(unsigned long long a, unsigned long long b)
{
    /* Register output operand (%0); a and b are the register inputs (%1, %2). */
    unsigned long long rd;

    __ASM volatile("dcrsa32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DCRSA32 ===== */

/* ===== Inline Function Start for DCRAS32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DCRAS32 (32-bit Cross Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DCRAS32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit integer element addition and 32-bit integer element subtraction in a 64-bit chunk simultaneously. Operands are
 * from crossed 32-bit elements.
 *
 * **Description**:\n
 * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit integer element in [31:0] of Rs2, and
 * writes the result to [63:32] of Rd; at the same time, it subtracts the 32-bit integer element in [63:32] of Rs2 from the 32-bit
 * integer element in [31:0] of Rs1, and writes the result to [31:0] of Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[1] = Rs1.W[1] + Rs2.W[0];
 * Rd.W[0] = Rs1.W[0] - Rs2.W[1];
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DCRAS32(unsigned long long a, unsigned long long b)
{
    /* Register output operand (%0); a and b are the register inputs (%1, %2). */
    unsigned long long rd;

    __ASM volatile("dcras32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DCRAS32 ===== */

/* ===== Inline Function Start for DKSTSA16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DKSTSA16 (16-bit Signed Saturating Straight Subtraction & Addition)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKSTSA16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element saturating subtraction and 16-bit signed integer element saturating addition in a 32-bit
 * chunk simultaneously. Operands are from corresponding positions in 32-bit chunks.
 *
 * **Description**:\n
 * This instruction subtracts the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2 from the 16-bit signed integer
 * element in [31:16] of 32-bit chunks in Rs1; at the same time, it adds the 16-bit signed integer element in [15:0] of 32-bit
 * chunks in Rs2 with the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs1.
 * If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV
 * bit is set to 1. The saturated results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of 32-bit chunks
 * in Rd for addition.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[x][31:16] - Rs2.W[x][31:16];
 * res2 = Rs1.W[x][15:0] + Rs2.W[x][15:0];
 * for (res in [res1, res2]) {
 *   if (res > (2^15)-1) {
 *     res = (2^15)-1;
 *     OV = 1;
 *   } else if (res < -2^15) {
 *     res = -2^15;
 *     OV = 1;
 *   }
 * }
 * Rd.W[x][31:16] = res1;
 * Rd.W[x][15:0] = res2;
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKSTSA16(unsigned long long a, unsigned long long b)
{
    /* Register output operand (%0); a and b are the register inputs (%1, %2). */
    unsigned long long rd;

    __ASM volatile("dkstsa16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKSTSA16 ===== */

/* ===== Inline Function Start for DKSTAS16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DKSTAS16 (16-bit Signed Saturating Straight Addition & Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKSTAS16 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 16-bit signed integer element saturating addition and 16-bit signed integer element saturating subtraction in a 32-bit
 * chunk simultaneously. Operands are from corresponding positions in 32-bit chunks.
 *
 * **Description**:\n
 * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1 with the 16-bit signed integer
 * element in [31:16] of 32-bit chunks in Rs2; at the same time, it subtracts the 16-bit signed integer element in [15:0] of
 * 32-bit chunks in Rs2 from the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs1.
 * If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV
 * bit is set to 1. The saturated results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit chunks
 * in Rd for subtraction.
 *
 * **Operations**:\n
 * ~~~
 * res1 = Rs1.W[x][31:16] + Rs2.W[x][31:16];
 * res2 = Rs1.W[x][15:0] - Rs2.W[x][15:0];
 * for (res in [res1, res2]) {
 *   if (res > (2^15)-1) {
 *     res = (2^15)-1;
 *     OV = 1;
 *   } else if (res < -2^15) {
 *     res = -2^15;
 *     OV = 1;
 *   }
 * }
 * Rd.W[x][31:16] = res1;
 * Rd.W[x][15:0] = res2;
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKSTAS16(unsigned long long a, unsigned long long b)
{
    /* Register output operand (%0); a and b are the register inputs (%1, %2). */
    unsigned long long rd;

    __ASM volatile("dkstas16 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DKSTAS16 ===== */

/* ===== Inline Function Start for DSCLIP8 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSCLIP8 (8-bit Signed Saturation and Clip)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSCLIP8 Rd, Rs1, imm3u[2:0]
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Limit the 8-bit signed integer elements of a register into a signed range simultaneously.
 *
 * **Description**:\n
 * This instruction limits the 8-bit signed integer elements stored in Rs1 into a signed integer range between -2^imm3u and
 * 2^imm3u-1, and writes the limited results to Rd. For example, if imm3u is 3, the 8-bit input values should be saturated
 * between 7 and -8. If saturation is performed, set OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.B[x];
 * if (src > (2^imm3u)-1) {
 *   src = (2^imm3u)-1;
 *   OV = 1;
 * } else if (src < -2^imm3u) {
 *   src = -2^imm3u;
 *   OV = 1;
 * }
 * Rd.B[x] = src
 * x=7...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned immediate used as the imm3u operand (must be a compile-time constant)
 * \return value stored in unsigned long long type
 */
/*
 * a is evaluated exactly once, before the macro's result local is declared, so a
 * caller argument that happens to be named `result` is no longer captured by the
 * macro's own (uninitialized) local. The macro-local names carry a dsclip8 prefix
 * to make such collisions unlikely.
 * b is bound to the "K" asm constraint (an immediate operand), so it has to be a
 * compile-time constant.
 */
#define __RV_DSCLIP8(a, b)    \
    ({    \
        unsigned long long __dsclip8_rs1 = (unsigned long long)(a);    \
        unsigned long long __dsclip8_rd;    \
        __ASM volatile("dsclip8 %0, %1, %2" : "=r"(__dsclip8_rd) : "r"(__dsclip8_rs1), "K"(b));    \
        __dsclip8_rd;    \
    })
/* ===== Inline Function End for DSCLIP8 ===== */

/* ===== Inline Function Start for DSCLIP16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSCLIP16 (16-bit Signed Saturation and Clip)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSCLIP16 Rd, Rs1, imm4u[3:0]
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Limit the 16-bit signed integer elements of a register into a signed range simultaneously.
 *
 * **Description**:\n
 * This instruction limits the 16-bit signed integer elements stored in Rs1 into a signed integer range between -2^imm4u and
 * 2^imm4u-1, and writes the limited results to Rd. For example, if imm4u is 3, the 16-bit input values should be saturated
 * between 7 and -8. If saturation is performed, set OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.H[x];
 * if (src > (2^imm4u)-1) {
 *   src = (2^imm4u)-1;
 *   OV = 1;
 * } else if (src < -2^imm4u) {
 *   src = -2^imm4u;
 *   OV = 1;
 * }
 * Rd.H[x] = src
 * x=3...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned immediate used as the imm4u operand (must be a compile-time constant)
 * \return value stored in unsigned long long type
 */
/*
 * a is evaluated exactly once, before the macro's result local is declared, so a
 * caller argument that happens to be named `result` is no longer captured by the
 * macro's own (uninitialized) local. The macro-local names carry a dsclip16 prefix
 * to make such collisions unlikely.
 * b is bound to the "K" asm constraint (an immediate operand), so it has to be a
 * compile-time constant.
 */
#define __RV_DSCLIP16(a, b)    \
    ({    \
        unsigned long long __dsclip16_rs1 = (unsigned long long)(a);    \
        unsigned long long __dsclip16_rd;    \
        __ASM volatile("dsclip16 %0, %1, %2" : "=r"(__dsclip16_rd) : "r"(__dsclip16_rs1), "K"(b));    \
        __dsclip16_rd;    \
    })
/* ===== Inline Function End for DSCLIP16 ===== */

/* ===== Inline Function Start for DSCLIP32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSCLIP32 (32-bit Signed Saturation and Clip)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSCLIP32 Rd, Rs1, imm5u[4:0]
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Limit the 32-bit signed integer elements of a register into a signed range simultaneously.
 *
 * **Description**:\n
 * This instruction limits the 32-bit signed integer elements stored in Rs1 into a signed integer range between -2^imm5u and
 * 2^imm5u-1, and writes the limited results to Rd. For example, if imm5u is 3, the 32-bit input values should be saturated
 * between 7 and -8. If saturation is performed, set OV bit to 1.
 *
 * **Operations**:\n
 * ~~~
 * src = Rs1.W[x];
 * if (src > (2^imm5u)-1) {
 *   src = (2^imm5u)-1;
 *   OV = 1;
 * } else if (src < -2^imm5u) {
 *   src = -2^imm5u;
 *   OV = 1;
 * }
 * Rd.W[x] = src
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned immediate used as the imm5u operand (must be a compile-time constant)
 * \return value stored in unsigned long long type
 */
/*
 * a is evaluated exactly once, before the macro's result local is declared, so a
 * caller argument that happens to be named `result` is no longer captured by the
 * macro's own (uninitialized) local. The macro-local names carry a dsclip32 prefix
 * to make such collisions unlikely.
 * b is bound to the "K" asm constraint (an immediate operand), so it has to be a
 * compile-time constant.
 */
#define __RV_DSCLIP32(a, b)    \
    ({    \
        unsigned long long __dsclip32_rs1 = (unsigned long long)(a);    \
        unsigned long long __dsclip32_rd;    \
        __ASM volatile("dsclip32 %0, %1, %2" : "=r"(__dsclip32_rd) : "r"(__dsclip32_rs1), "K"(b));    \
        __dsclip32_rd;    \
    })
/* ===== Inline Function End for DSCLIP32 ===== */

/* ===== Inline Function Start for DRSUB32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DRSUB32 (32-bit Signed Halving Subtraction)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DRSUB32 Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do 32-bit signed integer element subtractions simultaneously. The results are halved to avoid overflow or saturation.
 *
 * **Description**:\n
 * This instruction subtracts the 32-bit signed integer elements in Rs2 from the 32-bit signed integer elements in Rs1. The
 * results are first arithmetically right-shifted by 1 bit and then written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[x] = (Rs1.W[x] - Rs2.W[x]) s>> 1;
 * x=1...0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DRSUB32(unsigned long long a, unsigned long long b)
{
    /* Register output operand (%0); a and b are the register inputs (%1, %2). */
    unsigned long long rd;

    __ASM volatile("drsub32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DRSUB32 ===== */

/* ===== Inline Function Start for DPACK32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DPACK32 (SIMD Pack Two 32-bit Data To 64-bit)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DPACK32 Rd, Rs1, Rs2
 * # Rd is even/odd pair of register
 * ~~~
 *
 * **Purpose**:\n
 * Pack two 32-bit data values taken from two registers into one 64-bit value.
 *
 * **Description**:\n
 * This instruction moves 32-bit Rs1 to Rd.W[1] and moves 32-bit Rs2 to Rd.W[0].
 *
 * **Operations**:\n
 * ~~~
 * Rd = CONCAT(Rs1.W , Rs2.W);
 * ~~~
 *
 * \param [in]  a signed long type of value stored in a
 * \param [in]  b signed long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DPACK32(signed long a, signed long b)
{
    /* 64-bit register output operand (%0); a and b are the signed long register inputs (%1, %2). */
    unsigned long long rd;

    __ASM volatile("dpack32 %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DPACK32 ===== */

/* ===== Inline Function Start for DSUNPKD810 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSUNPKD810 (Signed Unpacking Bytes 1 & 0)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSUNPKD810 Rd, Rs1
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte 1 and byte 0 of 32-bit chunks in a register into two 16-bit signed halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `DSUNPKD810` instruction, it unpacks byte 1 and byte 0 of 32-bit chunks in Rs1 into two 16-bit signed halfwords
 * and writes the results to the top part and the bottom part of 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = SE16(Rs1.W[m].B[1])
 * Rd.W[m].H[0] = SE16(Rs1.W[m].B[0])
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSUNPKD810(unsigned long long a)
{
    /* Register output operand (%0); a is the single register input (%1). */
    unsigned long long rd;

    __ASM volatile("dsunpkd810 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for DSUNPKD810 ===== */

/* ===== Inline Function Start for DSUNPKD820 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSUNPKD820 (Signed Unpacking Bytes 2 & 0)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSUNPKD820 Rd, Rs1
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte 2 and byte 0 of 32-bit chunks in a register into two 16-bit signed halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `DSUNPKD820` instruction, it unpacks byte 2 and byte 0 of 32-bit chunks in Rs1 into two 16-bit signed halfwords
 * and writes the results to the top part and the bottom part of 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = SE16(Rs1.W[m].B[2])
 * Rd.W[m].H[0] = SE16(Rs1.W[m].B[0])
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSUNPKD820(unsigned long long a)
{
    /* Register output operand (%0); a is the single register input (%1). */
    unsigned long long rd;

    __ASM volatile("dsunpkd820 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for DSUNPKD820 ===== */

/* ===== Inline Function Start for DSUNPKD830 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSUNPKD830 (Signed Unpacking Bytes 3 & 0)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSUNPKD830 Rd, Rs1
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte 3 and byte 0 of 32-bit chunks in a register into two 16-bit signed halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `DSUNPKD830` instruction, it unpacks byte 3 and byte 0 of 32-bit chunks in Rs1 into two 16-bit signed halfwords
 * and writes the results to the top part and the bottom part of 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = SE16(Rs1.W[m].B[3])
 * Rd.W[m].H[0] = SE16(Rs1.W[m].B[0])
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSUNPKD830(unsigned long long a)
{
    /* Register output operand (%0); a is the single register input (%1). */
    unsigned long long rd;

    __ASM volatile("dsunpkd830 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for DSUNPKD830 ===== */

/* ===== Inline Function Start for DSUNPKD831 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSUNPKD831 (Signed Unpacking Bytes 3 & 1)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSUNPKD831 Rd, Rs1
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte 3 and byte 1 of 32-bit chunks in a register into two 16-bit signed halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `DSUNPKD831` instruction, it unpacks byte 3 and byte 1 of 32-bit chunks in Rs1 into two 16-bit signed halfwords
 * and writes the results to the top part and the bottom part of 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = SE16(Rs1.W[m].B[3])
 * Rd.W[m].H[0] = SE16(Rs1.W[m].B[1])
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSUNPKD831(unsigned long long a)
{
    /* Register output operand (%0); a is the single register input (%1). */
    unsigned long long rd;

    __ASM volatile("dsunpkd831 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for DSUNPKD831 ===== */

/* ===== Inline Function Start for DSUNPKD832 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DSUNPKD832 (Signed Unpacking Bytes 3 & 2)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSUNPKD832 Rd, Rs1
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte 3 and byte 2 of 32-bit chunks in a register into two 16-bit signed halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `DSUNPKD832` instruction, it unpacks byte 3 and byte 2 of 32-bit chunks in Rs1 into two 16-bit signed halfwords
 * and writes the results to the top part and the bottom part of 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = SE16(Rs1.W[m].B[3])
 * Rd.W[m].H[0] = SE16(Rs1.W[m].B[2])
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSUNPKD832(unsigned long long a)
{
    /* Register output operand (%0); a is the single register input (%1). */
    unsigned long long rd;

    __ASM volatile("dsunpkd832 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for DSUNPKD832 ===== */

/* ===== Inline Function Start for DZUNPKD810 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DZUNPKD810 (UnSigned Unpacking Bytes 1 & 0)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DZUNPKD810 Rd, Rs1
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte 1 and byte 0 of 32-bit chunks in a register into two 16-bit unsigned halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `DZUNPKD810` instruction, it unpacks byte 1 and byte 0 of 32-bit chunks in Rs1 into two 16-bit unsigned halfwords
 * and writes the results to the top part and the bottom part of 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[1])
 * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[0])
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DZUNPKD810(unsigned long long a)
{
    /* Register output operand (%0); a is the single register input (%1). */
    unsigned long long rd;

    __ASM volatile("dzunpkd810 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for DZUNPKD810 ===== */

/* ===== Inline Function Start for DZUNPKD820 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DZUNPKD820 (UnSigned Unpacking Bytes 2 & 0)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DZUNPKD820 Rd, Rs1
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte 2 and byte 0 of 32-bit chunks in a register into two 16-bit unsigned halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `DZUNPKD820` instruction, it unpacks byte 2 and byte 0 of 32-bit chunks in Rs1 into two 16-bit unsigned halfwords
 * and writes the results to the top part and the bottom part of 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[2])
 * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[0])
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DZUNPKD820(unsigned long long a)
{
    /* Register output operand (%0); a is the single register input (%1). */
    unsigned long long rd;

    __ASM volatile("dzunpkd820 %0, %1"
                   : "=r"(rd)
                   : "r"(a));
    return rd;
}
/* ===== Inline Function End for DZUNPKD820 ===== */

/* ===== Inline Function Start for DZUNPKD830 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DZUNPKD830 (UnSigned Unpacking Bytes 3 & 0)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DZUNPKD830 Rd, Rs1
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte 3 and byte 0 of 32-bit chunks in a register into two 16-bit unsigned halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `DZUNPKD830` instruction, it unpacks byte 3 and byte 0 of 32-bit chunks in Rs1 into two 16-bit unsigned halfwords
 * and writes the results to the top part and the bottom part of 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[3])
 * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[0])
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DZUNPKD830(unsigned long long a)
{
    /* Filled in by the instruction through the "=r" output operand. */
    unsigned long long rd;

    __ASM volatile(
        "dzunpkd830 %0, %1"
        : "=r"(rd)   /* %0: destination */
        : "r"(a)     /* %1: source */
    );
    return rd;
}
/* ===== Inline Function End for DZUNPKD830 ===== */

/* ===== Inline Function Start for DZUNPKD831 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DZUNPKD831 (UnSigned Unpacking Bytes 3 & 1)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DZUNPKD831 Rd, Rs1
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte 3 and byte 1 of 32-bit chunks in a register into two 16-bit unsigned halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `DZUNPKD831` instruction, it unpacks byte 3 and byte 1 of 32-bit chunks in Rs1 into two 16-bit unsigned halfwords
 * and writes the results to the top part and the bottom part of 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[3])
 * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[1])
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DZUNPKD831(unsigned long long a)
{
    /* Filled in by the instruction through the "=r" output operand. */
    unsigned long long rd;

    __ASM volatile(
        "dzunpkd831 %0, %1"
        : "=r"(rd)   /* %0: destination */
        : "r"(a)     /* %1: source */
    );
    return rd;
}
/* ===== Inline Function End for DZUNPKD831 ===== */

/* ===== Inline Function Start for DZUNPKD832 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N2
 * \brief    DZUNPKD832 (UnSigned Unpacking Bytes 3 & 2)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DZUNPKD832 Rd, Rs1
 * # Rd, Rs1 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Unpack byte 3 and byte 2 of 32-bit chunks in a register into two 16-bit unsigned halfwords of 32-bit chunks in a register.
 *
 * **Description**:\n
 * For the `DZUNPKD832` instruction, it unpacks byte 3 and byte 2 of 32-bit chunks in Rs1 into two 16-bit unsigned halfwords
 * and writes the results to the top part and the bottom part of 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[3])
 * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[2])
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DZUNPKD832(unsigned long long a)
{
    /* Filled in by the instruction through the "=r" output operand. */
    unsigned long long rd;

    __ASM volatile(
        "dzunpkd832 %0, %1"
        : "=r"(rd)   /* %0: destination */
        : "r"(a)     /* %1: source */
    );
    return rd;
}
/* ===== Inline Function End for DZUNPKD832 ===== */

/* ===== Inline Function Start for DKMMAC ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief   DKMMAC (64-bit MSW 32x32 Signed Multiply and Saturating Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKMMAC Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do MSW 32x32 element signed multiplications and saturating addition simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
 * and adds the most significant 32-bit multiplication results with the signed 32-bit elements of Rd. If
 * the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range
 * and the OV bit is set to 1. The results after saturation are written to Rd. The .u form of the
 * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
 * adding a 1 to bit 31 of the results.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *    res = sat.q31(dop + (aop s* bop)[63:32]);
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKMMAC(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dkmmac %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DKMMAC ===== */

/* ===== Inline Function Start for DKMMAC.u ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief   DKMMAC.u (64-bit MSW 32x32 Signed Multiply and Saturating Add with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKMMAC.u Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do MSW 32x32 element signed multiplications with rounding and saturating addition simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
 * and adds the most significant 32-bit multiplication results with the signed 32-bit elements of Rd. If
 * the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range
 * and the OV bit is set to 1. The results after saturation are written to Rd. The .u form of the
 * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
 * adding a 1 to bit 31 of the results.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *   res = sat.q31(dop + RUND(aop s* bop)[63:32]);
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKMMAC_U(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dkmmac.u %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DKMMAC.u ===== */

/* ===== Inline Function Start for DKMMSB ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief   DKMMSB (64-bit MSW 32x32 Signed Multiply and Saturating Sub)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKMMSB Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do MSW 32x32 element signed multiplications and saturating subtraction simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
 * and subtracts the most significant 32-bit multiplication results from the signed 32-bit elements of
 * Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the
 * range and the OV bit is set to 1. The results after saturation are written to Rd. The .u form of the
 * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
 * adding a 1 to bit 31 of the results.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *    res = sat.q31(dop - (aop s* bop)[63:32]);
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKMMSB(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dkmmsb %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DKMMSB ===== */

/* ===== Inline Function Start for DKMMSB.u ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief   DKMMSB.u (64-bit MSW 32x32 Signed Multiply and Saturating Sub with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKMMSB.u Rd, Rs1, Rs2
 * # Rd, Rs1, Rs2 are all even/odd pair of registers
 * ~~~
 *
 * **Purpose**:\n
 * Do MSW 32x32 element signed multiplications with rounding and saturating subtraction simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
 * and subtracts the most significant 32-bit multiplication results from the signed 32-bit elements of
 * Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the
 * range and the OV bit is set to 1. The results after saturation are written to Rd. The .u form of the
 * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
 * adding a 1 to bit 31 of the results.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *    res = sat.q31(dop - RUND(aop s* bop)[63:32]);
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKMMSB_U(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dkmmsb.u %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DKMMSB.u ===== */

/* ===== Inline Function Start for DKMADA ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMADA (Saturating Signed Multiply Two Halfs and Two Adds)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMADA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two 16x16 with 32-bit signed double addition simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of
 * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
 * elements in Rs2.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 *
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *   mul1 = aop.H[1] s* bop.H[1];
 *   mul2 = aop.H[0] s* bop.H[0];
 *   res = sat.q31(dop + mul1 + mul2);
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKMADA(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dkmada %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DKMADA ===== */

/* ===== Inline Function Start for DKMAXDA ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMAXDA (Two Cross 16x16 with 32-bit Signed Double Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMAXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two cross 16x16 with 32-bit signed double addition simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the top 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-bit
 * elements in Rs2 and then adds the result to the result of multiplying the bottom 16-bit content of
 * 32-bit elements in Rs1 with the top 16-bit content of 32-bit elements in Rs2.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 *
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *   mul1 = aop.H[1] s* bop.H[0];
 *   mul2 = aop.H[0] s* bop.H[1];
 *   res = sat.q31(dop + mul1 + mul2);
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKMAXDA(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dkmaxda %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DKMAXDA ===== */

/* ===== Inline Function Start for DKMADS ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief  DKMADS (Two 16x16 with 32-bit Signed Add and Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMADS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two 16x16 with 32-bit signed addition and subtraction simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
 * the bottom 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
 * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
 * elements in Rs2.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 *
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *   mul1 = aop.H[1] s* bop.H[1];
 *   mul2 = aop.H[0] s* bop.H[0];
 *   res = sat.q31(dop + mul1 - mul2);
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKMADS(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dkmads %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DKMADS ===== */

/* ===== Inline Function Start for DKMADRS ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief  DKMADRS (Two 16x16 with 32-bit Signed Add and Reversed Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMADRS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two 16x16 with 32-bit signed addition and reversed subtraction simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the top 16-bit content of 32-bit elements in Rs1 with the
 * top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
 * multiplying the bottom 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of
 * 32-bit elements in Rs2.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 *
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *   mul1 = aop.H[1] s* bop.H[1];
 *   mul2 = aop.H[0] s* bop.H[0];
 *   res = sat.q31(dop - mul1 + mul2);
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKMADRS(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dkmadrs %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DKMADRS ===== */

/* ===== Inline Function Start for DKMAXDS ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMAXDS (Saturating Signed Crossed Multiply Two Halfs & Subtract & Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMAXDS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two cross 16x16 with 32-bit signed addition and subtraction simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then
 * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
 * the corresponding 32-bit elements in a third register. The addition result may be saturated.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 *
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *   mul1 = aop.H[1] s* bop.H[0];
 *   mul2 = aop.H[0] s* bop.H[1];
 *   res = sat.q31(dop + mul1 - mul2);
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKMAXDS(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dkmaxds %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DKMAXDS ===== */

/* ===== Inline Function Start for DKMSDA ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMSDA (Two 16x16 with 32-bit Signed Double Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMSDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two 16x16 with 32-bit signed double subtraction simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of
 * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 *
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *   mul1 = aop.H[1] s* bop.H[1];
 *   mul2 = aop.H[0] s* bop.H[0];
 *   res = sat.q31(dop - mul1 - mul2);
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKMSDA(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dkmsda %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DKMSDA ===== */

/* ===== Inline Function Start for DKMSXDA ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMSXDA (Two Cross 16x16 with 32-bit Signed Double Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMSXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two cross 16x16 with 32-bit signed double subtraction simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the bottom 16-bit content of the 32-bit elements of Rs1
 * with the top 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of the
 * 32-bit elements of Rs1 with the bottom 16-bit content of the 32-bit elements of Rs2.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 *
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *   mul1 = aop.H[1] s* bop.H[0];
 *   mul2 = aop.H[0] s* bop.H[1];
 *   res = sat.q31(dop - mul1 - mul2);
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKMSXDA(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dkmsxda %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DKMSXDA ===== */

/* ===== Inline Function Start for DSMAQA ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DSMAQA (Four Signed 8x8 with 32-bit Signed Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMAQA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do four signed 8x8 with 32-bit signed addition simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * This instruction multiplies the four signed 8-bit elements of 32-bit chunks of Rs1 with the four
 * signed 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the signed
 * content of the corresponding 32-bit chunks of Rd. The final results are written back to the
 * corresponding 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 *
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *   m0 = aop.B[0] s* bop.B[0];
 *   m1 = aop.B[1] s* bop.B[1];
 *   m2 = aop.B[2] s* bop.B[2];
 *   m3 = aop.B[3] s* bop.B[3];
 *   res = dop + m0 + m1 + m2 + m3;
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSMAQA(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dsmaqa %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DSMAQA ===== */

/* ===== Inline Function Start for DSMAQA.SU ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DSMAQA.SU (Four Signed 8 x Unsigned 8 with 32-bit Signed Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMAQA.SU Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do four Signed 8 x Unsigned 8 with 32-bit signed addition simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * This instruction multiplies the four signed 8-bit elements of 32-bit chunks of Rs1 with the four
 * unsigned 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the signed
 * content of the corresponding 32-bit chunks of Rd. The final results are written back to the
 * corresponding 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 *
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *   m0 = aop.B[0] su* bop.B[0];
 *   m1 = aop.B[1] su* bop.B[1];
 *   m2 = aop.B[2] su* bop.B[2];
 *   m3 = aop.B[3] su* bop.B[3];
 *   res = dop + m0 + m1 + m2 + m3;
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DSMAQA_SU(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dsmaqa.su %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DSMAQA.SU ===== */

/* ===== Inline Function Start for DUMAQA ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DUMAQA (Four Unsigned 8x8 with 32-bit Unsigned Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DUMAQA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do four unsigned 8x8 with 32-bit unsigned addition simultaneously. The results are written into Rd.
 *
 * **Description**:\n
 * This instruction multiplies the four unsigned 8-bit elements of 32-bit chunks of Rs1 with the four
 * unsigned 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the
 * unsigned content of the corresponding 32-bit chunks of Rd. The final results are written back to the
 * corresponding 32-bit chunks in Rd.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; op3t = Rd.W[x+1] // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; op3b = Rd.W[x] // bottom
 *
 * for ((aop,bop,dop,res) in [(op1t,op2t,op3t,rest), (op1b,op2b,op3b,resb)]) {
 *   m0 = aop.B[0] u* bop.B[0];
 *   m1 = aop.B[1] u* bop.B[1];
 *   m2 = aop.B[2] u* bop.B[2];
 *   m3 = aop.B[3] u* bop.B[3];
 *   res = dop + m0 + m1 + m2 + m3;
 * }
 * Rd = concat(rest, resb);
 * x=0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DUMAQA(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    unsigned long long acc = t;

    __ASM volatile(
        "dumaqa %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DUMAQA ===== */

/* ===== Inline Function Start for DKMDA32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMDA32 (Two Signed 32x32 with 64-bit Saturation Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMDA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32x32 multiplications and add the signed multiplication results with Q63 saturation. The results are written into Rd.
 *
 * **Description**:\n
 * For the `DKMDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
 * with the top 32-bit element of Rs2.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 * t0 = op1b s* op2b;
 * t1 = op1t s* op2t;
 * Rd = sat.q63(t0 + t1);
 * x=0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DKMDA32(unsigned long long a, unsigned long long b)
{
    /* Write-only ("=r") destination; the instruction takes no incoming accumulator. */
    long long rd;

    __ASM volatile(
        "dkmda32 %0, %1, %2"
        : "=r"(rd)           /* %0: destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return rd;
}
/* ===== Inline Function End for DKMDA32 ===== */

/* ===== Inline Function Start for DKMXDA32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMXDA32 (Two Cross Signed 32x32 with 64-bit Saturation Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMXDA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two cross signed 32x32 and add the signed multiplication results with Q63 saturation. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the bottom 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
 * with the bottom 32-bit element of Rs2.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 * t01 = op1b s* op2t;
 * t10 = op1t s* op2b;
 * Rd = sat.q63(t01 + t10);
 * x=0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DKMXDA32(unsigned long long a, unsigned long long b)
{
    /* Write-only ("=r") destination; the instruction takes no incoming accumulator. */
    long long rd;

    __ASM volatile(
        "dkmxda32 %0, %1, %2"
        : "=r"(rd)           /* %0: destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return rd;
}
/* ===== Inline Function End for DKMXDA32 ===== */

/* ===== Inline Function Start for DKMADA32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMADA32 (Two Signed 32x32 with 64-bit Saturation Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMADA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32x32 and add the signed multiplication results and a third register with Q63 saturation. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
 * with the top 32-bit element of Rs2.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 * t01 = op1b s* op2b;
 * t10 = op1t s* op2t;
 * Rd = sat.q63(Rd + t01 + t10);
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DKMADA32(long long t, unsigned long long a, unsigned long long b)
{
    /* Read-write ("+r") operand: carries the incoming t and receives the result. */
    long long acc = t;

    __ASM volatile(
        "dkmada32 %0, %1, %2"
        : "+r"(acc)          /* %0: accumulator / destination */
        : "r"(a), "r"(b)     /* %1, %2: source operands */
    );
    return acc;
}
/* ===== Inline Function End for DKMADA32 ===== */

/* ===== Inline Function Start for DKMAXDA32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMAXDA32 (Two Cross Signed 32x32 with 64-bit Saturation Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMAXDA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two cross signed 32x32 multiplications and add the signed multiplication results and a third register with
 * Q63 saturation. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the top 32-bit element in Rs1 with the bottom 32-bit
 * element in Rs2 and then adds the result to the result of multiplying the bottom 32-bit element in Rs1
 * with the top 32-bit element in Rs2. The sum of the two products is added to the 64-bit value of Rd
 * and the result is saturated to the Q63 range.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 * t01 = op1b s* op2t;
 * t10 = op1t s* op2b;
 * Rd = sat.q63(Rd + t01 + t10);
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DKMAXDA32(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("dkmaxda32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DKMAXDA32 ===== */

/* ===== Inline Function Start for DKMADS32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMADS32 (Two Signed 32x32 with 64-bit Saturation Add and Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMADS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32x32 multiplications, add the top signed multiplication result, subtract the bottom signed
 * multiplication result, and add a third register with Q63 saturation. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
 * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in Rs1
 * with the top 32-bit element in Rs2. The difference is added to the 64-bit value of Rd and the result
 * is saturated to the Q63 range.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * t0 = op1b s* op2b;
 * t1 = op1t s* op2t;
 * Rd = sat.q63(Rd - t0 + t1);
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DKMADS32(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("dkmads32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DKMADS32 ===== */

/* ===== Inline Function Start for DKMADRS32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMADRS32 (Two Signed 32x32 with 64-bit Saturation Reversed Add and Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMADRS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32x32 multiplications, subtract the top signed multiplication result, add the bottom signed
 * multiplication result, and add a third register with Q63 saturation. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the top 32-bit element in Rs1 with the top 32-bit
 * element in Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
 * element in Rs1 with the bottom 32-bit element in Rs2. The difference is added to the 64-bit value
 * of Rd and the result is saturated to the Q63 range.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 * t0 = op1b s* op2b;
 * t1 = op1t s* op2t;
 * Rd = sat.q63(Rd + t0 - t1);
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DKMADRS32(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("dkmadrs32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DKMADRS32 ===== */

/* ===== Inline Function Start for DKMAXDS32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMAXDS32 (Two Cross Signed 32x32 with 64-bit Saturation Add and Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMAXDS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two cross signed 32x32 multiplications, add one signed multiplication result, subtract the other, and add
 * a third register with Q63 saturation. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the bottom 32-bit element in Rs1 with the top 32-bit
 * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
 * Rs1 with the bottom 32-bit element in Rs2. The difference is added to the 64-bit value of Rd and the
 * result is saturated to the Q63 range.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * t01 = op1b s* op2t;
 * t10 = op1t s* op2b;
 * Rd = sat.q63(Rd - t01 + t10);
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DKMAXDS32(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("dkmaxds32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DKMAXDS32 ===== */

/* ===== Inline Function Start for DKMSDA32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMSDA32 (Two Signed 32x32 with 64-bit Saturation Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMSDA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32x32 multiplications and subtract both the top and the bottom signed multiplication results
 * from a third register with Q63 saturation. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2 and multiplies the top 32-bit element of Rs1 with the top 32-bit element of Rs2.
 * Both products are subtracted from the 64-bit value of Rd and the result is saturated to the Q63 range.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * t0 = op1b s* op2b;
 * t1 = op1t s* op2t;
 * Rd = sat.q63(Rd - t0 - t1);
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DKMSDA32(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("dkmsda32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DKMSDA32 ===== */

/* ===== Inline Function Start for DKMSXDA32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DKMSXDA32 (Two Cross Signed 32x32 with 64-bit Saturation Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DKMSXDA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two cross signed 32x32 multiplications and subtract both signed multiplication results from a third
 * register with Q63 saturation. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the bottom 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and multiplies the top 32-bit element of Rs1 with the bottom 32-bit element of Rs2.
 * Both products are subtracted from the 64-bit value of Rd and the result is saturated to the Q63 range.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * t0 = op1b s* op2t;
 * t1 = op1t s* op2b;
 * Rd = sat.q63(Rd - t0 - t1);
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DKMSXDA32(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("dkmsxda32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DKMSXDA32 ===== */

/* ===== Inline Function Start for DSMDS32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DSMDS32 (Two Signed 32x32 with 64-bit Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMDS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32x32 multiplications and subtract the bottom signed multiplication result from the top signed
 * multiplication result. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
 * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
 * Rs1 with the top 32-bit element of Rs2.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * t0 = op1b s* op2b;
 * t1 = op1t s* op2t;
 * Rd = t1 - t0;
 * x=0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (Rd)
 */
__STATIC_FORCEINLINE long long __RV_DSMDS32(unsigned long long a, unsigned long long b)
{
    long long result;
    /* "=r": result is operand %0 (Rd), a write-only output. */
    __ASM volatile("dsmds32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
    return result;
}
/* ===== Inline Function End for DSMDS32 ===== */

/* ===== Inline Function Start for DSMDRS32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DSMDRS32 (Two Signed 32x32 with 64-bit Reversed Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMDRS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32x32 multiplications and subtract the top signed multiplication result from the bottom signed
 * multiplication result. The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the top 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
 * element of Rs1 with the bottom 32-bit element of Rs2.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * t0 = op1b s* op2b;
 * t1 = op1t s* op2t;
 * Rd = t0 - t1;
 * x=0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (Rd)
 */
__STATIC_FORCEINLINE long long __RV_DSMDRS32(unsigned long long a, unsigned long long b)
{
    long long result;
    /* "=r": result is operand %0 (Rd), a write-only output. */
    __ASM volatile("dsmdrs32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
    return result;
}
/* ===== Inline Function End for DSMDRS32 ===== */

/* ===== Inline Function Start for DSMXDS32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DSMXDS32 (Two Cross Signed 32x32 with 64-bit Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMXDS32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two cross signed 32x32 multiplications and subtract one signed multiplication result from the other.
 * The results are written into Rd.
 *
 * **Description**:\n
 * It multiplies the bottom 32-bit element of Rs1 with the top 32-bit
 * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
 * Rs1 with the bottom 32-bit element of Rs2.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * t01 = op1b s* op2t;
 * t10 = op1t s* op2b;
 * Rd = t10 - t01;
 * x=0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (Rd)
 */
__STATIC_FORCEINLINE long long __RV_DSMXDS32(unsigned long long a, unsigned long long b)
{
    long long result;
    /* "=r": result is operand %0 (Rd), a write-only output. */
    __ASM volatile("dsmxds32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
    return result;
}
/* ===== Inline Function End for DSMXDS32 ===== */

/* ===== Inline Function Start for DSMALDA ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DSMALDA (Four Signed 16x16 with 64-bit Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMALDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do four signed 16x16 multiplications and add the signed multiplication results and a third register. The
 * results are written into Rd.
 *
 * **Description**:\n
 * For each 32-bit element, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and then adds the result to the result of multiplying the top 16-bit content of Rs1 with
 * the top 16-bit content of Rs2 with unlimited precision. The four products are added to the 64-bit value of Rd.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * m0 = op1b.H[0] s* op2b.H[0];
 * m1 = op1b.H[1] s* op2b.H[1];
 * m2 = op1t.H[0] s* op2t.H[0];
 * m3 = op1t.H[1] s* op2t.H[1];
 *
 * Rd = Rd + m0 + m1 + m2 + m3;
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DSMALDA(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("dsmalda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DSMALDA ===== */

/* ===== Inline Function Start for DSMALXDA ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DSMALXDA (Four Cross Signed 16x16 with 64-bit Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMALXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do four cross signed 16x16 multiplications and add the signed multiplication results and a third register.
 * The results are written into Rd.
 *
 * **Description**:\n
 * For each 32-bit element, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and then adds the result to the result of multiplying the bottom 16-bit content of Rs1
 * with the top 16-bit content of Rs2 with unlimited precision. The four products are added to the 64-bit
 * value of Rd.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * m0 = op1b.H[0] s* op2b.H[1];
 * m1 = op1b.H[1] s* op2b.H[0];
 * m2 = op1t.H[0] s* op2t.H[1];
 * m3 = op1t.H[1] s* op2t.H[0];
 *
 * Rd = Rd + m0 + m1 + m2 + m3;
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DSMALXDA(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("dsmalxda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DSMALXDA ===== */

/* ===== Inline Function Start for DSMALDS ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DSMALDS (Four Signed 16x16 with 64-bit Add and Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMALDS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do four signed 16x16 multiplications, add and subtract the signed multiplication results, and add a third
 * register. The results are written into Rd.
 *
 * **Description**:\n
 * For each 32-bit element, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
 * Rs1 with the top 16-bit content of Rs2. The differences are added to the 64-bit value of Rd.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * m0 = op1b.H[1] s* op2b.H[1];
 * m1 = op1b.H[0] s* op2b.H[0];
 * m2 = op1t.H[1] s* op2t.H[1];
 * m3 = op1t.H[0] s* op2t.H[0];
 *
 * Rd = Rd + m0 - m1 + m2 - m3;
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DSMALDS(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("dsmalds %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DSMALDS ===== */

/* ===== Inline Function Start for DSMALDRS ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief DSMALDRS (Four Signed 16x16 with 64-bit Add and Reversed Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMALDRS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do four signed 16x16 multiplications, add and reverse-subtract the signed multiplication results, and add a
 * third register. The results are written into Rd.
 *
 * **Description**:\n
 * For each 32-bit element, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
 * of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of Rs1
 * with the bottom 16-bit content of Rs2. The differences are added to the 64-bit value of Rd.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * m0 = op1b.H[0] s* op2b.H[0];
 * m1 = op1b.H[1] s* op2b.H[1];
 * m2 = op1t.H[0] s* op2t.H[0];
 * m3 = op1t.H[1] s* op2t.H[1];
 *
 * Rd = Rd + m0 - m1 + m2 - m3;
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DSMALDRS(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("dsmaldrs %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DSMALDRS ===== */

/* ===== Inline Function Start for DSMALXDS ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief  DSMALXDS (Four Cross Signed 16x16 with 64-bit Add and Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMALXDS Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do four cross signed 16x16 multiplications, add and subtract the signed multiplication results, and add a
 * third register. The results are written into Rd.
 *
 * **Description**:\n
 * For each 32-bit element, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
 * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
 * Rs1 with the bottom 16-bit content of Rs2. The differences are added to the 64-bit value of Rd.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * m0 = op1b.H[1] s* op2b.H[0];
 * m1 = op1b.H[0] s* op2b.H[1];
 * m2 = op1t.H[1] s* op2t.H[0];
 * m3 = op1t.H[0] s* op2t.H[1];
 *
 * Rd = Rd + m0 - m1 + m2 - m3;
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DSMALXDS(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("dsmalxds %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DSMALXDS ===== */

/* ===== Inline Function Start for DSMSLDA ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief  DSMSLDA (Four Signed 16x16 with 64-bit Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMSLDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do four signed 16x16 multiplications and subtract the signed multiplication results from a third register.
 * The results are written into Rd.
 *
 * **Description**:\n
 * For each 32-bit element, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and multiplies the top 16-bit content of Rs1 with the top 16-bit content of Rs2.
 * The four products are subtracted from the 64-bit value of Rd.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * m0 = op1b.H[0] s* op2b.H[0];
 * m1 = op1b.H[1] s* op2b.H[1];
 * m2 = op1t.H[0] s* op2t.H[0];
 * m3 = op1t.H[1] s* op2t.H[1];
 *
 * Rd = Rd - m0 - m1 - m2 - m3;
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DSMSLDA(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("dsmslda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DSMSLDA ===== */

/* ===== Inline Function Start for DSMSLXDA ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief  DSMSLXDA (Four Cross Signed 16x16 with 64-bit Sub)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMSLXDA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do four cross signed 16x16 multiplications and subtract the signed multiplication results from a third
 * register. The results are written into Rd.
 *
 * **Description**:\n
 * For each 32-bit element, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
 * content of Rs2 and multiplies the bottom 16-bit content of Rs1 with the top 16-bit content of Rs2.
 * The four products are subtracted from the 64-bit value of Rd.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * m0 = op1b.H[0] s* op2b.H[1];
 * m1 = op1b.H[1] s* op2b.H[0];
 * m2 = op1t.H[0] s* op2t.H[1];
 * m3 = op1t.H[1] s* op2t.H[0];
 *
 * Rd = Rd - m0 - m1 - m2 - m3;
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DSMSLXDA(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("dsmslxda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DSMSLXDA ===== */

/* ===== Inline Function Start for DDSMAQA ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief  DDSMAQA (Eight Signed 8x8 with 64-bit Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DDSMAQA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do eight signed 8x8 multiplications and add the signed multiplication results and a third register. The
 * results are written into Rd.
 *
 * **Description**:\n
 * Do eight signed 8-bit multiplications from eight 8-bit chunks of two registers; and then add
 * the eight 16-bit results and the content of the 64-bit chunk of a third register.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * m0 = op1b.B[0] s* op2b.B[0];
 * m1 = op1b.B[1] s* op2b.B[1];
 * m2 = op1b.B[2] s* op2b.B[2];
 * m3 = op1b.B[3] s* op2b.B[3];
 * m4 = op1t.B[0] s* op2t.B[0];
 * m5 = op1t.B[1] s* op2t.B[1];
 * m6 = op1t.B[2] s* op2t.B[2];
 * m7 = op1t.B[3] s* op2t.B[3];
 *
 * s0 = m0 + m1 + m2 + m3;
 * s1 = m4 + m5 + m6 + m7;
 * Rd = Rd + s0 + s1;
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DDSMAQA(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("ddsmaqa %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DDSMAQA ===== */

/* ===== Inline Function Start for DDSMAQA.SU ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief  DDSMAQA.SU (Eight Signed 8 x Unsigned 8 with 64-bit Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DDSMAQA.SU Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do eight signed 8 x unsigned 8 multiplications and add the signed multiplication results and a third
 * register. The results are written into Rd.
 *
 * **Description**:\n
 * Do eight signed 8-bit x unsigned 8-bit multiplications from eight 8-bit chunks of two registers; and then add
 * the eight 16-bit results and the content of the 64-bit chunk of a third register.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * m0 = op1b.B[0] su* op2b.B[0];
 * m1 = op1b.B[1] su* op2b.B[1];
 * m2 = op1b.B[2] su* op2b.B[2];
 * m3 = op1b.B[3] su* op2b.B[3];
 * m4 = op1t.B[0] su* op2t.B[0];
 * m5 = op1t.B[1] su* op2t.B[1];
 * m6 = op1t.B[2] su* op2t.B[2];
 * m7 = op1t.B[3] su* op2t.B[3];
 *
 * s0 = m0 + m1 + m2 + m3;
 * s1 = m4 + m5 + m6 + m7;
 * Rd = Rd + s0 + s1;
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DDSMAQA_SU(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("ddsmaqa.su %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DDSMAQA.SU ===== */

/* ===== Inline Function Start for DDUMAQA ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief  DDUMAQA (Eight Unsigned 8x8 with 64-bit Unsigned Add)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DDUMAQA Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do eight unsigned 8x8 multiplications and add the unsigned multiplication results and a third register. The
 * results are written into Rd.
 *
 * **Description**:\n
 * Do eight unsigned 8-bit multiplications from eight 8-bit chunks of two registers; and then add
 * the eight 16-bit results and the content of the 64-bit chunk of a third register.
 *
 * **Operations**:\n
 * ~~~
 * op1t = Rs1.W[x+1]; op2t = Rs2.W[x+1]; // top
 * op1b = Rs1.W[x]; op2b = Rs2.W[x]; // bottom
 *
 * m0 = op1b.B[0] u* op2b.B[0];
 * m1 = op1b.B[1] u* op2b.B[1];
 * m2 = op1b.B[2] u* op2b.B[2];
 * m3 = op1b.B[3] u* op2b.B[3];
 * m4 = op1t.B[0] u* op2t.B[0];
 * m5 = op1t.B[1] u* op2t.B[1];
 * m6 = op1t.B[2] u* op2t.B[2];
 * m7 = op1t.B[3] u* op2t.B[3];
 *
 * s0 = m0 + m1 + m2 + m3;
 * s1 = m4 + m5 + m6 + m7;
 * Rd = Rd + s0 + s1;
 * x=0
 * ~~~
 *
 * \param [in]  t long long type of value stored in t (initial value of Rd)
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long long type (final value of Rd)
 */
__STATIC_FORCEINLINE long long __RV_DDUMAQA(long long t, unsigned long long a, unsigned long long b)
{
    /* "+r": t is operand %0 (Rd), read as input and overwritten with the result. */
    __ASM volatile("ddumaqa %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
    return t;
}
/* ===== Inline Function End for DDUMAQA ===== */

/* ===== Inline Function Start for DSMA32.u ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DSMA32.u (64-bit SIMD 32-bit Signed Multiply Addition With Rounding and Clip)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMA32.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32x32 multiplications and add the signed multiplication results with rounding, then right shift
 * by 32 bits and clip q63 to q31. The result is written to Rd.
 *
 * **Description**:\n
 * For the `DSMA32.u` instruction, multiply the top 32-bit Q31 content of 64-bit chunks in Rs1 with the top 32-bit Q31
 * content of 64-bit chunks in Rs2. At the same time, multiply the bottom 32-bit Q31 content of 64-bit chunks in Rs1 with
 * the bottom 32-bit Q31 content of 64-bit chunks in Rs2.
 * Then, do the addition for the results above and perform the additional rounding operation, and then move the data to
 * the right by 32 bits, and clip the 64-bit data into 32-bit. The result is written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd = (q31_t)((Rs1.W[x] s* Rs2.W[x] + Rs1.W[x + 1] s* Rs2.W[x + 1] + 0x80000000LL) s>> 32);
 * x=0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long type (Rd)
 */
__STATIC_FORCEINLINE long __RV_DSMA32_U(unsigned long long a, unsigned long long b)
{
    long result;
    /* "=r": result is operand %0 (Rd), a write-only output. */
    __ASM volatile("dsma32.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
    return result;
}
/* ===== Inline Function End for DSMA32.u ===== */

/* ===== Inline Function Start for DSMXS32.u ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DSMXS32.u (64-bit SIMD 32-bit Signed Multiply Cross Subtraction With Rounding and Clip)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMXS32.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two cross signed 32x32 multiplications and subtract the signed multiplication results with rounding, then
 * right shift by 32 bits and clip q63 to q31. The result is written to Rd.
 *
 * **Description**:\n
 * For the `DSMXS32.u` instruction, multiply the top 32-bit Q31 content of 64-bit chunks in Rs1 with the bottom 32-bit
 * Q31 content of 64-bit chunks in Rs2. At the same time, multiply the bottom 32-bit Q31 content of 64-bit chunks in Rs1
 * with the top 32-bit Q31 content of 64-bit chunks in Rs2.
 * Then, subtract the second product from the first and perform the additional rounding operation, and then move the
 * data to the right by 32 bits, and clip the 64-bit data into 32-bit. The result is written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd = (q31_t)((Rs1.W[x + 1] s* Rs2.W[x] - Rs1.W[x] s* Rs2.W[x + 1] + 0x80000000LL) s>> 32);
 * x=0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long type (Rd)
 */
__STATIC_FORCEINLINE long __RV_DSMXS32_U(unsigned long long a, unsigned long long b)
{
    long result;
    /* "=r": result is operand %0 (Rd), a write-only output. */
    __ASM volatile("dsmxs32.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
    return result;
}
/* ===== Inline Function End for DSMXS32.u ===== */

/* ===== Inline Function Start for DSMXA32.u ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DSMXA32.u (64-bit SIMD 32-bit Signed Cross Multiply Addition with Rounding and Clip)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMXA32.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two cross signed 32x32 multiplications and add the signed multiplication results with rounding, then right
 * shift by 32 bits and clip q63 to q31. The result is written to Rd.
 *
 * **Description**:\n
 * For the `DSMXA32.u` instruction, multiply the top 32-bit Q31 content of 64-bit chunks in Rs1 with the bottom 32-bit Q31
 * content of 64-bit chunks in Rs2. At the same time, multiply the bottom 32-bit Q31 content of 64-bit chunks in Rs1 with
 * the top 32-bit Q31 content of 64-bit chunks in Rs2.
 * Then, do the addition for the results above and perform the additional rounding operation, and then move the data to
 * the right by 32 bits, and clip the 64-bit data into 32-bit. The result is written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd = (q31_t)((Rs1.W[x + 1] s* Rs2.W[x] + Rs1.W[x] s* Rs2.W[x + 1] + 0x80000000LL) s>> 32);
 * x=0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a (Rs1)
 * \param [in]  b unsigned long long type of value stored in b (Rs2)
 * \return value stored in long type (Rd)
 */
__STATIC_FORCEINLINE long __RV_DSMXA32_U(unsigned long long a, unsigned long long b)
{
    long result;
    /* "=r": result is operand %0 (Rd), a write-only output. */
    __ASM volatile("dsmxa32.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
    return result;
}
/* ===== Inline Function End for DSMXA32.u ===== */

/* ===== Inline Function Start for DSMS32.u ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DSMS32.u (64-bit SIMD 32-bit Signed Multiply Subtraction with Rounding and Clip)
 * \details
 * **Type**: DSP
 *
 * **Syntax**:\n
 * ~~~
 * DSMS32.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 32x32 and sub signed multiplication results with Rounding, then right shift 32-bit and clip q63 to q31. The
 * result is written to Rd.
 *
 * **Description**:\n
 * For the `DSMS32.u` instruction, multiply the bottom 32-bit Q31 content of 64-bit chunks in Rs1 with the bottom 32-bit
 * Q31 content of 64-bit chunks in Rs2. At the same time, multiply the top 32-bit Q31 content of 64-bit chunks in Rs1 with
 * the top 32-bit Q31 content of 64-bit chunks in Rs2.
 * Then, do the subtraction for the results above and perform the additional rounding operations, and then move the data to the right by
 * 32-bit, and clip the 64-bit data into 32-bit. The result is written to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Rd = (q31_t)((Rs1.W[x] s* Rs2.W[x] - Rs1.W[x + 1] s* Rs2.W[x + 1] + 0x80000000LL) s>> 32);
 * x=0
 * ~~~
 *
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_DSMS32_U(unsigned long long a, unsigned long long b)
{
    long rd; /* written by the instruction below */

    /* Operands: %0 = rd (output only), %1 = a, %2 = b (both inputs). */
    __ASM volatile("dsms32.u %0, %1, %2"
                   : "=r"(rd)
                   : "r"(a), "r"(b));
    return rd;
}
/* ===== Inline Function End for DSMS32.u ===== */

/* ===== Inline Function Start for DSMADA16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DSMADA16 (Signed Multiply Two Halfs and Two Adds 32-bit)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMADA16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications of two 32-bit registers; and then adds the 32-bit results and the 32-bit value of an
 * even/odd pair of registers together.
 * * DSMADA16: rt pair+ top*top + bottom*bottom
 *
 * **Description**:\n
 * This instruction multiplies the per 16-bit content of the 32-bit elements of Rs1 with the corresponding 16-bit content of
 * the 32-bit elements of Rs2. The result is added to the 32-bit value of an even/odd pair of registers specified by Rd(4,1).
 * The 32-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and Rs2, and the 32-bit value of the
 * register-pair are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
 * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
 * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
 * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
 * Rd.W = Rd.W + SE32(Mres0[0][31:0]) + SE32(Mres1[0][31:0]) + SE32(Mres0[1][31:0]) + SE32(Mres1[1][31:0]);
 * ~~~
 *
 * \param [in]  t long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_DSMADA16(long long t, unsigned long long a, unsigned long long b)
{
    /* t is a read-write ("+r") operand: it feeds the instruction and receives its result. */
    __ASM volatile("dsmada16 %0, %1, %2"
                   : "+r"(t)
                   : "r"(a), "r"(b));
    /* The updated value is converted to long before being handed back. */
    return (long)t;
}
/* ===== Inline Function End for DSMADA16 ===== */

/* ===== Inline Function Start for DSMAXDA16 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DSMAXDA16 (Signed Crossed Multiply Two Halfs and Two Adds 32-bit)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMAXDA16 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two signed 16-bit multiplications of two 32-bit registers; and then adds the 32-bit results and the 32-bit value of an
 * even/odd pair of registers together.
 * * DSMAXDA16: rt pair+ top*bottom + bottom*top (all 32-bit elements)
 *
 * **Description**:\n
 * This instruction crossly multiplies the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of the 32-bit
 * elements of Rs2 and then adds the result to the result of multiplying the bottom 16-bit content of the 32-bit elements of
 * Rs1 with the top 16-bit content of the 32-bit elements of Rs2 with unlimited precision. The result is added to the 64-bit
 * value of an even/odd pair of registers specified by Rd(4,1).The 64-bit addition result is clipped to 32-bit result.
 *
 * **Operations**:\n
 * ~~~
 * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
 * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]);
 * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
 * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]);
 * Rd.W = Rd.W + SE32(Mres0[0][31:0]) + SE32(Mres1[0][31:0]) + SE32(Mres0[1][31:0]) + SE32(Mres1[1][31:0]);
 * ~~~
 *
 * \param [in]  t long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_DSMAXDA16(long long t, unsigned long long a, unsigned long long b)
{
    /* t is a read-write ("+r") operand: it feeds the instruction and receives its result. */
    __ASM volatile("dsmaxda16 %0, %1, %2"
                   : "+r"(t)
                   : "r"(a), "r"(b));
    /* The updated value is converted to long before being handed back. */
    return (long)t;
}
/* ===== Inline Function End for DSMAXDA16 ===== */

/* ===== Inline Function Start for DKSMS32.u ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DKSMS32.u (Two Signed Multiply Shift-clip and Saturation with Rounding)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKSMS32.u Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Computes saturated multiplication of two pairs of q31 type with shifted rounding.
 *
 * **Description**:\n
 * Compute the multiplication of Rs1 and Rs2 of type q31_t, intercept [47:16] for the resulting 64-bit product
 * to get the 32-bit number, then add 1 to it to do rounding, and finally saturate the result after rounding.
 *
 * **Operations**:\n
 * ~~~
 * Mres[x][63:0] = Rs1.W[x] s* Rs2.W[x];
 * Round[x][32:0] = Mres[x][47:15] + 1;
 * Rd.W[x] = sat.31(Rd.W[x] + Round[x][32:1]);
 * x=1...0
 * ~~~
 *
 * \param [in]  t unsigned long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in unsigned long long type
 */
__STATIC_FORCEINLINE unsigned long long __RV_DKSMS32_U(unsigned long long t, unsigned long long a, unsigned long long b)
{
    /* %0 = t (read and written in place), %1 = a, %2 = b (inputs only). */
    __ASM volatile("dksms32.u %0, %1, %2"
                   : "+r"(t)
                   : "r"(a), "r"(b));
    /* The in-place updated t is returned as-is. */
    return t;
}
/* ===== Inline Function End for DKSMS32.u ===== */

/* ===== Inline Function Start for DMADA32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DMADA32 (Two Cross Signed 32x32 with 64-bit Add and Clip to 32-bit)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DMADA32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Do two cross signed 32x32 and add the signed multiplication results to q63, then clip the q63 result to q31 , the final results
 * are written into Rd.
 *
 * **Description**:\n
 * For the `DMADA32` instruction, it multiplies the top 32-bit element in Rs1 with the bottom 32-bit element in Rs2 and
 * then adds the result to the result of multiplying the bottom 32-bit element in Rs1 with the top 32-bit element in Rs2, then
 * clip the q63 result to q31.
 *
 * **Operations**:\n
 * ~~~
 * res = (q31_t)((((q63_t) Rd.w[0] << 32) + (q63_t)Rs1.w[0] s*  Rs2.w[1] + (q63_t)Rs1.w[1] s*  Rs2.w[0]) s>> 32);
 * rd = res;
 * ~~~
 *
 * \param [in]  t long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long type
 */
__STATIC_FORCEINLINE long __RV_DMADA32(long long t, unsigned long long a, unsigned long long b)
{
    /* t is a read-write ("+r") operand: it feeds the instruction and receives its result. */
    __ASM volatile("dmada32 %0, %1, %2"
                   : "+r"(t)
                   : "r"(a), "r"(b));
    /* The updated value is converted to long before being handed back. */
    return (long)t;
}
/* ===== Inline Function End for DMADA32 ===== */

/* ===== Inline Function Start for DSMALBB ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DSMALBB (Signed Multiply Bottom Halfs & Add 64-bit)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMALBB Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit content of the corresponding 32-bit
 * elements of another register and add the results with a 64-bit value of an even/odd pair of registers. The addition result 
 * is written back to the register-pair.
 * * DSMALBB: rt pair + bottom*bottom (all 32-bit elements)
 *
 * **Description**:\n
 * For the `DSMALBB` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit content of Rs2.The
 * multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written back to Rd.
 *
 * **Operations**:\n
 * ~~~
 * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[0];
 * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[0];
 * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
 * ~~~
 *
 * \param [in]  t long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DSMALBB(long long t, unsigned long long a, unsigned long long b)
{
    /* %0 = t (read and written in place), %1 = a, %2 = b (inputs only). */
    __ASM volatile("dsmalbb %0, %1, %2"
                   : "+r"(t)
                   : "r"(a), "r"(b));
    /* The in-place updated t is returned as-is. */
    return t;
}
/* ===== Inline Function End for DSMALBB ===== */

/* ===== Inline Function Start for DSMALBT ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DSMALBT (Signed Multiply Bottom Half & Top Half & Add 64-bit)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMALBT Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit content of the corresponding 32-bit
 * elements of another register and add the results with a 64-bit value of an even/odd pair of registers. The addition result
 * is written back to the register-pair.
 * * DSMALBT: rt pair + bottom*top (all 32-bit elements)
 *
 * **Description**:\n
 * For the `DSMALBT` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with the top 16-bit
 * content of the 32-bit elements of Rs2.
 * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written back to Rd. The
 * 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[1];
 * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[1];
 * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
 * ~~~
 *
 * \param [in]  t long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DSMALBT(long long t, unsigned long long a, unsigned long long b)
{
    /* %0 = t (read and written in place), %1 = a, %2 = b (inputs only). */
    __ASM volatile("dsmalbt %0, %1, %2"
                   : "+r"(t)
                   : "r"(a), "r"(b));
    /* The in-place updated t is returned as-is. */
    return t;
}
/* ===== Inline Function End for DSMALBT ===== */

/* ===== Inline Function Start for DSMALTT ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DSMALTT (Signed Multiply Top Half & Add 64-bit)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DSMALTT Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit content of the corresponding 32-bit
 * elements of another register and add the results with a 64-bit value of an even/odd pair of registers. The addition result
 * is written back to the register-pair.
 * * DSMALTT: rt pair + top*top (all 32-bit elements)
 *
 * **Description**:\n
 * For the `DSMALTT` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit
 * content of the 32-bit elements of Rs2.
 * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written back to Rd. The
 * 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * Mres[0][31:0] = Rs1.W[0].H[1] * Rs2.W[0].H[1];
 * Mres[1][31:0] = Rs1.W[1].H[1] * Rs2.W[1].H[1];
 * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
 * ~~~
 *
 * \param [in]  t long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DSMALTT(long long t, unsigned long long a, unsigned long long b)
{
    /* %0 = t (read and written in place), %1 = a, %2 = b (inputs only). */
    __ASM volatile("dsmaltt %0, %1, %2"
                   : "+r"(t)
                   : "r"(a), "r"(b));
    /* The in-place updated t is returned as-is. */
    return t;
}
/* ===== Inline Function End for DSMALTT ===== */

/* ===== Inline Function Start for DKMABB32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DKMABB32 (Saturating Signed Multiply Bottom Words & Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKMABB32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element in a register with the 32-bit element in another register and add the result to the content
 * of 64-bit data in the third register. The addition result may be saturated and is written to the third register.
 * * DKMABB32: rd + bottom*bottom
 *
 * **Description**:\n
 * For the `DKMABB32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit element in Rs2
 * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond the Q63 number range
 * (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The result after saturation is written to Rd.
 * The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rd + (Rs1.W[0] * Rs2.W[0]);
 * if (res > (2^63)-1) {
 *   res = (2^63)-1;
 *   OV = 1;
 * } else if (res < -2^63) {
 *   res = -2^63;
 *   OV = 1;
 * }
 * Rd = res;
 * ~~~
 *
 * \param [in]  t long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DKMABB32(long long t, unsigned long long a, unsigned long long b)
{
    /* %0 = t (read and written in place), %1 = a, %2 = b (inputs only). */
    __ASM volatile("dkmabb32 %0, %1, %2"
                   : "+r"(t)
                   : "r"(a), "r"(b));
    /* The in-place updated t is returned as-is. */
    return t;
}
/* ===== Inline Function End for DKMABB32 ===== */

/* ===== Inline Function Start for DKMABT32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DKMABT32 (Saturating Signed Multiply Bottom & Top Words & Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKMABT32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element in a register with the 32-bit element in another register and add the result to the content
 * of 64-bit data in the third register. The addition result may be saturated and is written to the third register.
 * * DKMABT32: rd + bottom*top
 *
 * **Description**:\n
 * For the `DKMABT32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit element in Rs2
 * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond the Q63 number range
 * (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The result after saturation is written to Rd.
 * The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rd + (Rs1.W[0] * Rs2.W[1]);
 * if (res > (2^63)-1) {
 *   res = (2^63)-1;
 *   OV = 1;
 * } else if (res < -2^63) {
 *   res = -2^63;
 *   OV = 1;
 * }
 * Rd = res;
 * ~~~
 *
 * \param [in]  t long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DKMABT32(long long t, unsigned long long a, unsigned long long b)
{
    /* %0 = t (read and written in place), %1 = a, %2 = b (inputs only). */
    __ASM volatile("dkmabt32 %0, %1, %2"
                   : "+r"(t)
                   : "r"(a), "r"(b));
    /* The in-place updated t is returned as-is. */
    return t;
}
/* ===== Inline Function End for DKMABT32 ===== */

/* ===== Inline Function Start for DKMATT32 ===== */
/**
 * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_N3
 * \brief    DKMATT32 (Saturating Signed Multiply Top Words & Add)
 * \details
 * **Type**: SIMD
 *
 * **Syntax**:\n
 * ~~~
 * DKMATT32 Rd, Rs1, Rs2
 * ~~~
 *
 * **Purpose**:\n
 * Multiply the signed 32-bit element in a register with the 32-bit element in another register and add the result to the content
 * of 64-bit data in the third register. The addition result may be saturated and is written to the third register.
 * * DKMATT32: rd + top*top
 *
 * **Description**:\n
 * For the `DKMATT32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit element in Rs2
 * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond the Q63 number range
 * (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The result after saturation is written to Rd.
 * The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
 *
 * **Operations**:\n
 * ~~~
 * res = Rd + (Rs1.W[1] * Rs2.W[1]);
 * if (res > (2^63)-1) {
 *   res = (2^63)-1;
 *   OV = 1;
 * } else if (res < -2^63) {
 *   res = -2^63;
 *   OV = 1;
 * }
 * Rd = res;
 * ~~~
 *
 * \param [in]  t long long type of value stored in t
 * \param [in]  a unsigned long long type of value stored in a
 * \param [in]  b unsigned long long type of value stored in b
 * \return value stored in long long type
 */
__STATIC_FORCEINLINE long long __RV_DKMATT32(long long t, unsigned long long a, unsigned long long b)
{
    /* %0 = t (read and written in place), %1 = a, %2 = b (inputs only). */
    __ASM volatile("dkmatt32 %0, %1, %2"
                   : "+r"(t)
                   : "r"(a), "r"(b));
    /* The in-place updated t is returned as-is. */
    return t;
}
/* ===== Inline Function End for DKMATT32 ===== */
#endif /* __RISCV_XLEN == 32 */

#elif defined (__ICCRISCV__)

#if __riscv_xlen == 32
#include "iar_nds32_intrinsic.h"
#elif __riscv_xlen == 64
#include "iar_nds64_intrinsic.h"
#else
#error "Unexpected RISC-V XLEN size."
#endif /* __riscv_xlen == 32 */

#pragma language=save
#pragma language=extended

// Alias the NMSIS __RV_* intrinsic names to the compatible __nds__* intrinsics
// supplied by IAR, so code written against the __RV_* names also builds with the
// IAR toolchain. NOTE(review): the __nds__* names are presumably declared by the
// iar_nds32/iar_nds64 intrinsic header included above -- confirm.
// This first list is defined regardless of XLEN.
#define __RV_CLROV              __nds__clrov
#define __RV_RDOV               __nds__rdov
#define __RV_ADD8               __nds__add8
#define __RV_SUB8               __nds__sub8
#define __RV_ADD16              __nds__add16
#define __RV_SUB16              __nds__sub16
#define __RV_ADD64              __nds__add64
#define __RV_SUB64              __nds__sub64
#define __RV_RADD8              __nds__radd8
#define __RV_RSUB8              __nds__rsub8
#define __RV_RADD16             __nds__radd16
#define __RV_RSUB16             __nds__rsub16
#define __RV_RADD64             __nds__radd64
#define __RV_RSUB64             __nds__rsub64
#define __RV_RADDW              __nds__raddw
#define __RV_RSUBW              __nds__rsubw
#define __RV_URADD8             __nds__uradd8
#define __RV_URSUB8             __nds__ursub8
#define __RV_URADD16            __nds__uradd16
#define __RV_URSUB16            __nds__ursub16
#define __RV_URADD64            __nds__uradd64
#define __RV_URSUB64            __nds__ursub64
#define __RV_URADDW             __nds__uraddw
#define __RV_URSUBW             __nds__ursubw
#define __RV_KADD8              __nds__kadd8
#define __RV_KSUB8              __nds__ksub8
#define __RV_KADD16             __nds__kadd16
#define __RV_KSUB16             __nds__ksub16
#define __RV_KADD64             __nds__kadd64
#define __RV_KSUB64             __nds__ksub64
#define __RV_KADDH              __nds__kaddh
#define __RV_KSUBH              __nds__ksubh
#define __RV_KADDW              __nds__kaddw
#define __RV_KSUBW              __nds__ksubw
#define __RV_UKADD8             __nds__ukadd8
#define __RV_UKSUB8             __nds__uksub8
#define __RV_UKADD16            __nds__ukadd16
#define __RV_UKSUB16            __nds__uksub16
#define __RV_UKADD64            __nds__ukadd64
#define __RV_UKSUB64            __nds__uksub64
#define __RV_UKADDH             __nds__ukaddh
#define __RV_UKSUBH             __nds__uksubh
#define __RV_UKADDW             __nds__ukaddw
#define __RV_UKSUBW             __nds__uksubw
#define __RV_CRAS16             __nds__cras16
#define __RV_CRSA16             __nds__crsa16
#define __RV_RCRAS16            __nds__rcras16
#define __RV_RCRSA16            __nds__rcrsa16
#define __RV_URCRAS16           __nds__urcras16
#define __RV_URCRSA16           __nds__urcrsa16
#define __RV_KCRAS16            __nds__kcras16
#define __RV_KCRSA16            __nds__kcrsa16
#define __RV_UKCRAS16           __nds__ukcras16
#define __RV_UKCRSA16           __nds__ukcrsa16
// The immediate-form names (__RV_SRAI8, __RV_SRAI16) alias the same IAR
// intrinsics as their register-form counterparts (__RV_SRA8, __RV_SRA16).
#define __RV_SRA8               __nds__sra8
#define __RV_SRAI8              __nds__sra8
#define __RV_SRA16              __nds__sra16
#define __RV_SRAI16             __nds__sra16
#define __RV_SRL8               __nds__srl8
#define __RV_SRL16              __nds__srl16
#define __RV_SLL8               __nds__sll8
#define __RV_SLL16              __nds__sll16
#define __RV_SRA_U              __nds__sra_u
#define __RV_SRA8_U             __nds__sra8_u
#define __RV_SRA16_U            __nds__sra16_u
#define __RV_SRL8_U             __nds__srl8_u
#define __RV_SRL16_U            __nds__srl16_u
#define __RV_KSLL8              __nds__ksll8
#define __RV_KSLL16             __nds__ksll16
#define __RV_KSLLW              __nds__ksllw
#define __RV_KSLRA8             __nds__kslra8
#define __RV_KSLRA8_U           __nds__kslra8_u
#define __RV_KSLRA16            __nds__kslra16
#define __RV_KSLRA16_U          __nds__kslra16_u
#define __RV_KSLRAW             __nds__kslraw
#define __RV_KSLRAW_U           __nds__kslraw_u
#define __RV_CMPEQ8             __nds__cmpeq8
#define __RV_CMPEQ16            __nds__cmpeq16
#define __RV_SCMPLE8            __nds__scmple8
#define __RV_SCMPLE16           __nds__scmple16
#define __RV_SCMPLT8            __nds__scmplt8
#define __RV_SCMPLT16           __nds__scmplt16
#define __RV_UCMPLE8            __nds__ucmple8
#define __RV_UCMPLE16           __nds__ucmple16
#define __RV_UCMPLT8            __nds__ucmplt8
#define __RV_UCMPLT16           __nds__ucmplt16
#define __RV_SMUL8              __nds__smul8
#define __RV_UMUL8              __nds__umul8
#define __RV_SMUL16             __nds__smul16
#define __RV_UMUL16             __nds__umul16
#define __RV_SMULX8             __nds__smulx8
#define __RV_UMULX8             __nds__umulx8
#define __RV_SMULX16            __nds__smulx16
#define __RV_UMULX16            __nds__umulx16
#define __RV_KHM8               __nds__khm8
#define __RV_KHMX8              __nds__khmx8
#define __RV_KHM16              __nds__khm16
#define __RV_KHMX16             __nds__khmx16
#define __RV_MULR64             __nds__mulr64
#define __RV_MULSR64            __nds__mulsr64
#define __RV_SMMUL              __nds__smmul
#define __RV_SMMUL_U            __nds__smmul_u
#define __RV_WEXT               __nds__wext
#define __RV_SUNPKD810          __nds__sunpkd810
#define __RV_SUNPKD820          __nds__sunpkd820
#define __RV_SUNPKD830          __nds__sunpkd830
#define __RV_SUNPKD831          __nds__sunpkd831
#define __RV_SUNPKD832          __nds__sunpkd832
#define __RV_ZUNPKD810          __nds__zunpkd810
#define __RV_ZUNPKD820          __nds__zunpkd820
#define __RV_ZUNPKD830          __nds__zunpkd830
#define __RV_ZUNPKD831          __nds__zunpkd831
#define __RV_ZUNPKD832          __nds__zunpkd832
#define __RV_PKBB16             __nds__pkbb16
#define __RV_PKBT16             __nds__pkbt16
#define __RV_PKTT16             __nds__pktt16
#define __RV_PKTB16             __nds__pktb16
#define __RV_KMMAC              __nds__kmmac
#define __RV_KMMAC_U            __nds__kmmac_u
#define __RV_KMMSB              __nds__kmmsb
#define __RV_KMMSB_U            __nds__kmmsb_u
#define __RV_KWMMUL             __nds__kwmmul
#define __RV_KWMMUL_U           __nds__kwmmul_u
#define __RV_SMMWB              __nds__smmwb
#define __RV_SMMWB_U            __nds__smmwb_u
#define __RV_SMMWT              __nds__smmwt
#define __RV_SMMWT_U            __nds__smmwt_u
#define __RV_KMMAWB             __nds__kmmawb
#define __RV_KMMAWB_U           __nds__kmmawb_u
#define __RV_KMMAWT             __nds__kmmawt
#define __RV_KMMAWT_U           __nds__kmmawt_u
#define __RV_KMMWB2             __nds__kmmwb2
#define __RV_KMMWB2_U           __nds__kmmwb2_u
#define __RV_KMMWT2             __nds__kmmwt2
#define __RV_KMMWT2_U           __nds__kmmwt2_u
#define __RV_KMMAWB2            __nds__kmmawb2
#define __RV_KMMAWB2_U          __nds__kmmawb2_u
#define __RV_KMMAWT2            __nds__kmmawt2
#define __RV_KMMAWT2_U          __nds__kmmawt2_u
#define __RV_SMBB16             __nds__smbb16
#define __RV_SMBT16             __nds__smbt16
#define __RV_SMTT16             __nds__smtt16
#define __RV_KMDA               __nds__kmda
#define __RV_KMXDA              __nds__kmxda
#define __RV_SMDS               __nds__smds
#define __RV_SMDRS              __nds__smdrs
#define __RV_SMXDS              __nds__smxds
#define __RV_KMABB              __nds__kmabb
#define __RV_KMABT              __nds__kmabt
#define __RV_KMATT              __nds__kmatt
#define __RV_KMADA              __nds__kmada
#define __RV_KMAXDA             __nds__kmaxda
#define __RV_KMADS              __nds__kmads
#define __RV_KMADRS             __nds__kmadrs
#define __RV_KMAXDS             __nds__kmaxds
#define __RV_KMSDA              __nds__kmsda
#define __RV_KMSXDA             __nds__kmsxda
#define __RV_SMAL               __nds__smal
#define __RV_SMAQA              __nds__smaqa
#define __RV_UMAQA              __nds__umaqa
#define __RV_SMAQA_SU           __nds__smaqa_su
#define __RV_SMAR64             __nds__smar64
#define __RV_SMSR64             __nds__smsr64
#define __RV_UMAR64             __nds__umar64
#define __RV_UMSR64             __nds__umsr64
#define __RV_KMAR64             __nds__kmar64
#define __RV_KMSR64             __nds__kmsr64
#define __RV_UKMAR64            __nds__ukmar64
#define __RV_UKMSR64            __nds__ukmsr64
#define __RV_SMALBB             __nds__smalbb
#define __RV_SMALBT             __nds__smalbt
#define __RV_SMALTT             __nds__smaltt
#define __RV_SMALDA             __nds__smalda
#define __RV_SMALXDA            __nds__smalxda
#define __RV_SMALDS             __nds__smalds
#define __RV_SMALDRS            __nds__smaldrs
#define __RV_SMALXDS            __nds__smalxds
#define __RV_SMSLDA             __nds__smslda
#define __RV_SMSLXDA            __nds__smslxda
#define __RV_MINW               __nds__minw
#define __RV_MAXW               __nds__maxw
#define __RV_SMIN8              __nds__smin8
#define __RV_SMAX8              __nds__smax8
#define __RV_SMIN16             __nds__smin16
#define __RV_SMAX16             __nds__smax16
#define __RV_UMIN8              __nds__umin8
#define __RV_UMAX8              __nds__umax8
#define __RV_UMIN16             __nds__umin16
#define __RV_UMAX16             __nds__umax16
#define __RV_KABS8              __nds__kabs8
#define __RV_KABS16             __nds__kabs16
#define __RV_KABSW              __nds__kabsw
#define __RV_SCLIP8             __nds__sclip8
#define __RV_SCLIP16            __nds__sclip16
#define __RV_SCLIP32            __nds__sclip32
#define __RV_UCLIP8             __nds__uclip8
#define __RV_UCLIP16            __nds__uclip16
#define __RV_UCLIP32            __nds__uclip32
#define __RV_CLO8               __nds__clo8
#define __RV_CLO16              __nds__clo16
#define __RV_CLO32              __nds__clo32
#define __RV_CLZ8               __nds__clz8
#define __RV_CLZ16              __nds__clz16
#define __RV_CLZ32              __nds__clz32
#define __RV_CLRS8              __nds__clrs8
#define __RV_CLRS16             __nds__clrs16
#define __RV_CLRS32             __nds__clrs32
#define __RV_SWAP8              __nds__swap8
#define __RV_SWAP16             __nds__swap16
#define __RV_KHMBB              __nds__khmbb
#define __RV_KHMBT              __nds__khmbt
#define __RV_KHMTT              __nds__khmtt
#define __RV_KDMBB              __nds__kdmbb
#define __RV_KDMBT              __nds__kdmbt
#define __RV_KDMTT              __nds__kdmtt
#define __RV_KDMABB             __nds__kdmabb
#define __RV_KDMABT             __nds__kdmabt
#define __RV_KDMATT             __nds__kdmatt
#define __RV_MADDR32            __nds__maddr32
#define __RV_MSUBR32            __nds__msubr32
#define __RV_PBSAD              __nds__pbsad
#define __RV_PBSADA             __nds__pbsada
#define __RV_AVE                __nds__ave
#define __RV_BITREV             __nds__bitrev
#define __RV_INSB               __nds__insb

// Additional __RV_* -> __nds__* aliases that are only defined when building
// for a 64-bit XLEN.
#if (__riscv_xlen == 64)
#define __RV_ADD32              __nds__add32
#define __RV_SUB32              __nds__sub32
#define __RV_RADD32             __nds__radd32
#define __RV_RSUB32             __nds__rsub32
#define __RV_URADD32            __nds__uradd32
#define __RV_URSUB32            __nds__ursub32
#define __RV_KADD32             __nds__kadd32
#define __RV_KSUB32             __nds__ksub32
#define __RV_UKADD32            __nds__ukadd32
#define __RV_UKSUB32            __nds__uksub32
#define __RV_CRAS32             __nds__cras32
#define __RV_CRSA32             __nds__crsa32
#define __RV_RCRAS32            __nds__rcras32
#define __RV_RCRSA32            __nds__rcrsa32
#define __RV_URCRAS32           __nds__urcras32
#define __RV_URCRSA32           __nds__urcrsa32
#define __RV_KCRAS32            __nds__kcras32
#define __RV_KCRSA32            __nds__kcrsa32
#define __RV_UKCRAS32           __nds__ukcras32
#define __RV_UKCRSA32           __nds__ukcrsa32
// The immediate-form names (__RV_SRAI32, __RV_SLLI32) alias the same IAR
// intrinsics as their register-form counterparts (__RV_SRA32, __RV_SLL32).
#define __RV_SRA32              __nds__sra32
#define __RV_SRAI32             __nds__sra32
#define __RV_SRL32              __nds__srl32
#define __RV_SLL32              __nds__sll32
#define __RV_SLLI32             __nds__sll32
#define __RV_SRAW_U             __nds__sraw_u
#define __RV_SRA32_U            __nds__sra32_u
#define __RV_SRL32_U            __nds__srl32_u
#define __RV_KSLL32             __nds__ksll32
#define __RV_KSLRA32            __nds__kslra32
#define __RV_KSLRA32_U          __nds__kslra32_u
#define __RV_SMBB32             __nds__smbb32
#define __RV_SMBT32             __nds__smbt32
#define __RV_SMTT32             __nds__smtt32
#define __RV_PKBB32             __nds__pkbb32
#define __RV_PKBT32             __nds__pkbt32
#define __RV_PKTT32             __nds__pktt32
#define __RV_PKTB32             __nds__pktb32
#define __RV_SMIN32             __nds__smin32
#define __RV_SMAX32             __nds__smax32
#define __RV_UMIN32             __nds__umin32
#define __RV_UMAX32             __nds__umax32
#define __RV_KABS32             __nds__kabs32
#define __RV_KHMBB16            __nds__khmbb16
#define __RV_KHMBT16            __nds__khmbt16
#define __RV_KHMTT16            __nds__khmtt16
#define __RV_KDMBB16            __nds__kdmbb16
#define __RV_KDMBT16            __nds__kdmbt16
#define __RV_KDMTT16            __nds__kdmtt16
#define __RV_KDMABB16           __nds__kdmabb16
#define __RV_KDMABT16           __nds__kdmabt16
#define __RV_KDMATT16           __nds__kdmatt16
#define __RV_KMABB32            __nds__kmabb32
#define __RV_KMABT32            __nds__kmabt32
#define __RV_KMATT32            __nds__kmatt32
#define __RV_KMDA32             __nds__kmda32
#define __RV_KMXDA32            __nds__kmxda32
#define __RV_KMADA32            __nds__kmada32
#define __RV_KMAXDA32           __nds__kmaxda32
#define __RV_KMADS32            __nds__kmads32
#define __RV_KMADRS32           __nds__kmadrs32
#define __RV_KMAXDS32           __nds__kmaxds32
#define __RV_KMSDA32            __nds__kmsda32
#define __RV_KMSXDA32           __nds__kmsxda32
#define __RV_SMDS32             __nds__smds32
#define __RV_SMDRS32            __nds__smdrs32
#define __RV_SMXDS32            __nds__smxds32
#endif /* __riscv_xlen == 64 */

// For now, the P-extension version supported by the IAR IDE is 0.5.0, but Nuclei supports 0.5.4,
// so Nuclei supplies a workaround that adds the custom instructions not natively
// supported by the IAR Assembler. Note that __RV_BPICK remains to be implemented in the future.
// Only the Nuclei Xxldsp custom instruction set is implemented here: bpick is not implemented, and
// expdxx is implemented in C rather than via the .insn variant.

#pragma inline=forced_no_body
unsigned long __RV_STAS16(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Raw .insn encoding of the instruction; %0 = rd (output), %1 = a, %2 = b. */
    __asm(".insn r 0x7F, 0x2, 0x7A,   %0,%1,%2" : "=r"(rd) : "r"(a), "r"(b));
    return rd;
}

#pragma inline=forced_no_body
unsigned long __RV_RSTAS16(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Raw .insn encoding of the instruction; %0 = rd (output), %1 = a, %2 = b. */
    __asm(".insn r 0x7F, 0x2, 0x5A,   %0,%1,%2" : "=r"(rd) : "r"(a), "r"(b));
    return rd;
}

#pragma inline=forced_no_body
unsigned long __RV_KSTAS16(unsigned long a, unsigned long b)
{
    unsigned long rd;

    /* Raw .insn encoding of the instruction; %0 = rd (output), %1 = a, %2 = b. */
    __asm(".insn r 0x7F, 0x2, 0x62,   %0,%1,%2" : "=r"(rd) : "r"(a), "r"(b));
    return rd;
}

/**
 * \brief  IAR workaround for URSTAS16: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x6A are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 */
#pragma inline=forced_no_body
unsigned long __RV_URSTAS16(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x6A,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for UKSTAS16: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x72 are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 */
#pragma inline=forced_no_body
unsigned long __RV_UKSTAS16(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x72,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for STSA16: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x7B are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 */
#pragma inline=forced_no_body
unsigned long __RV_STSA16(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x7B,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for RSTSA16: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x5B are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 */
#pragma inline=forced_no_body
unsigned long __RV_RSTSA16(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x5B,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for KSTSA16: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x63 are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 */
#pragma inline=forced_no_body
unsigned long __RV_KSTSA16(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x63,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for URSTSA16: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x6B are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 */
#pragma inline=forced_no_body
unsigned long __RV_URSTSA16(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x6B,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for UKSTSA16: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x73 are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 */
#pragma inline=forced_no_body
unsigned long __RV_UKSTSA16(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x73,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

// #pragma inline=forced_no_body
// unsigned long __RV_BPICK(unsigned long a, unsigned long b, unsigned long c) {
    // TODO: remains to be done
// }

// RV64 only
/**
 * \brief  IAR workaround for STAS32: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x78 are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 * \note   This file marks the 32-bit group as "RV64 only"; no XLEN guard is visible
 *         around this definition in this part of the file.
 */
#pragma inline=forced_no_body
unsigned long __RV_STAS32(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x78,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for RSTAS32: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x58 are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 * \note   This file marks the 32-bit group as "RV64 only"; no XLEN guard is visible
 *         around this definition in this part of the file.
 */
#pragma inline=forced_no_body
unsigned long __RV_RSTAS32(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x58,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for KSTAS32: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x60 are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 * \note   This file marks the 32-bit group as "RV64 only"; no XLEN guard is visible
 *         around this definition in this part of the file.
 */
#pragma inline=forced_no_body
unsigned long __RV_KSTAS32(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x60,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for URSTAS32: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x68 are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 * \note   This file marks the 32-bit group as "RV64 only"; no XLEN guard is visible
 *         around this definition in this part of the file.
 */
#pragma inline=forced_no_body
unsigned long __RV_URSTAS32(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x68,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for UKSTAS32: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x70 are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 * \note   This file marks the 32-bit group as "RV64 only"; no XLEN guard is visible
 *         around this definition in this part of the file.
 */
#pragma inline=forced_no_body
unsigned long __RV_UKSTAS32(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x70,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for STSA32: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x79 are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 * \note   This file marks the 32-bit group as "RV64 only"; no XLEN guard is visible
 *         around this definition in this part of the file.
 */
#pragma inline=forced_no_body
unsigned long __RV_STSA32(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x79,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for RSTSA32: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x59 are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 * \note   This file marks the 32-bit group as "RV64 only"; no XLEN guard is visible
 *         around this definition in this part of the file.
 */
#pragma inline=forced_no_body
unsigned long __RV_RSTSA32(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x59,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for KSTSA32: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x61 are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 * \note   This file marks the 32-bit group as "RV64 only"; no XLEN guard is visible
 *         around this definition in this part of the file.
 */
#pragma inline=forced_no_body
unsigned long __RV_KSTSA32(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x61,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for URSTSA32: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x69 are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 * \note   This file marks the 32-bit group as "RV64 only"; no XLEN guard is visible
 *         around this definition in this part of the file.
 */
#pragma inline=forced_no_body
unsigned long __RV_URSTSA32(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x69,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  IAR workaround for UKSTSA32: the instruction is emitted with a raw `.insn r` directive.
 * \param [in] a  first input operand (%1 in the asm template)
 * \param [in] b  second input operand (%2 in the asm template)
 * \return the value the instruction writes to its output operand (%0)
 * \note   0x7F, 0x2 and 0x71 are presumably the opcode, funct3 and funct7 fields;
 *         confirm against the IAR assembler `.insn` documentation.
 * \note   This file marks the 32-bit group as "RV64 only"; no XLEN guard is visible
 *         around this definition in this part of the file.
 */
#pragma inline=forced_no_body
unsigned long __RV_UKSTSA32(unsigned long a, unsigned long b) {
    unsigned long r;
    __asm(".insn r 0x7F, 0x2, 0x71,   %0,%1,%2":"=r"(r) : "r"(a), "r"(b) );
    return r;
}

/**
 * \brief  C implementation of EXPD80 for IAR (no `.insn` encoding is used).
 * \param [in] a  source word; only byte 0 (bits [7:0]) is consumed
 * \return result of __EXPD_BYTE applied to that byte
 */
#pragma inline=forced_no_body
unsigned long __RV_EXPD80(unsigned long a)
{
    const uint8_t lane = (uint8_t)(a & 0xff);

    return __EXPD_BYTE(lane);
}

/**
 * \brief  C implementation of EXPD81 for IAR (no `.insn` encoding is used).
 * \param [in] a  source word; only byte 1 (bits [15:8]) is consumed
 * \return result of __EXPD_BYTE applied to that byte
 */
#pragma inline=forced_no_body
unsigned long __RV_EXPD81(unsigned long a)
{
    const uint8_t lane = (uint8_t)((a >> 8) & 0xff);

    return __EXPD_BYTE(lane);
}

/**
 * \brief  C implementation of EXPD82 for IAR (no `.insn` encoding is used).
 * \param [in] a  source word; only byte 2 (bits [23:16]) is consumed
 * \return result of __EXPD_BYTE applied to that byte
 */
#pragma inline=forced_no_body
unsigned long __RV_EXPD82(unsigned long a)
{
    const uint8_t lane = (uint8_t)((a >> 16) & 0xff);

    return __EXPD_BYTE(lane);
}

/**
 * \brief  C implementation of EXPD83 for IAR (no `.insn` encoding is used).
 * \param [in] a  source word; only byte 3 (bits [31:24]) is consumed
 * \return result of __EXPD_BYTE applied to that byte
 */
#pragma inline=forced_no_body
unsigned long __RV_EXPD83(unsigned long a)
{
    const uint8_t lane = (uint8_t)((a >> 24) & 0xff);

    return __EXPD_BYTE(lane);
}

#if __RISCV_XLEN == 64
// RV64 only
/**
 * \brief  C implementation of EXPD84 for IAR; compiled only when __RISCV_XLEN == 64.
 * \param [in] a  source word; only byte 4 (bits [39:32]) is consumed
 * \return result of __EXPD_BYTE applied to that byte
 */
#pragma inline=forced_no_body
unsigned long __RV_EXPD84(unsigned long a)
{
    const uint8_t lane = (uint8_t)((a >> 32) & 0xff);

    return __EXPD_BYTE(lane);
}

/**
 * \brief  C implementation of EXPD85 for IAR; compiled only when __RISCV_XLEN == 64.
 * \param [in] a  source word; only byte 5 (bits [47:40]) is consumed
 * \return result of __EXPD_BYTE applied to that byte
 */
#pragma inline=forced_no_body
unsigned long __RV_EXPD85(unsigned long a)
{
    const uint8_t lane = (uint8_t)((a >> 40) & 0xff);

    return __EXPD_BYTE(lane);
}

/**
 * \brief  C implementation of EXPD86 for IAR; compiled only when __RISCV_XLEN == 64.
 * \param [in] a  source word; only byte 6 (bits [55:48]) is consumed
 * \return result of __EXPD_BYTE applied to that byte
 */
#pragma inline=forced_no_body
unsigned long __RV_EXPD86(unsigned long a)
{
    const uint8_t lane = (uint8_t)((a >> 48) & 0xff);

    return __EXPD_BYTE(lane);
}

/**
 * \brief  C implementation of EXPD87 for IAR; compiled only when __RISCV_XLEN == 64.
 * \param [in] a  source word; only byte 7 (bits [63:56]) is consumed
 * \return result of __EXPD_BYTE applied to that byte
 */
#pragma inline=forced_no_body
unsigned long __RV_EXPD87(unsigned long a)
{
    const uint8_t lane = (uint8_t)((a >> 56) & 0xff);

    return __EXPD_BYTE(lane);
}
#endif
#pragma language=restore

#else
    #error Unknown compiler
#endif /* __ICCRISCV__ */


/* XXXXX ARM Compatible SIMD API XXXXX */
/** \brief Q setting quad 8-bit saturating addition.
 *  \note  Both arguments are parenthesized before being forwarded to __RV_KADD8, matching
 *         the other wrappers in this group, so compound expressions are passed intact. */
#define __QADD8(x, y)               __RV_KADD8((x), (y))
/* Each macro below forwards its parenthesized arguments to the named __RV_* intrinsic;
 * the argument order is preserved unless a note says otherwise. */
/** \brief Q setting quad 8-bit saturating subtract. */
#define __QSUB8(x, y)               __RV_KSUB8((x), (y))
/** \brief Q setting dual 16-bit saturating addition. */
#define __QADD16(x, y)              __RV_KADD16((x), (y))
/** \brief Dual 16-bit signed addition with halved results. */
#define __SHADD16(x, y)             __RV_RADD16((x), (y))
/** \brief Q setting dual 16-bit saturating subtract. */
#define __QSUB16(x, y)              __RV_KSUB16((x), (y))
/** \brief Dual 16-bit signed subtraction with halved results. */
#define __SHSUB16(x, y)             __RV_RSUB16((x), (y))
/** \brief Q setting dual 16-bit add and subtract with exchange. */
#define __QASX(x, y)                __RV_KCRAS16((x), (y))
/** \brief Dual 16-bit signed addition and subtraction with halved results.*/
#define __SHASX(x, y)               __RV_RCRAS16((x), (y))
/** \brief Q setting dual 16-bit subtract and add with exchange. */
#define __QSAX(x, y)                __RV_KCRSA16((x), (y))
/** \brief Dual 16-bit signed subtraction and addition with halved results.*/
#define __SHSAX(x, y)               __RV_RCRSA16((x), (y))
/** \brief Dual 16-bit signed multiply with exchange returning difference.
 *  \note  The operands are passed to __RV_SMXDS in swapped order: (y, x). */
#define __SMUSDX(x, y)              __RV_SMXDS((y), (x))
/**
 * \brief Q setting sum of dual 16-bit signed multiply with exchange.
 * \param [in] op1  first operand, forwarded as the first argument of __RV_KMXDA
 * \param [in] op2  second operand, forwarded as the second argument of __RV_KMXDA
 * \return the result of __RV_KMXDA(op1, op2), converted to \c long
 */
__STATIC_FORCEINLINE long __SMUADX (unsigned long op1, unsigned long op2)
{
    return __RV_KMXDA(op1, op2);
}
/** \brief Q setting saturating add.
 *  \note  Forwards (x, y) to __RV_KADDW. */
#define __QADD(x, y)                __RV_KADDW((x), (y))
/** \brief Q setting saturating subtract.
 *  \note  Forwards (x, y) to __RV_KSUBW. */
#define __QSUB(x, y)                __RV_KSUBW((x), (y))
/**
 * \brief Q setting dual 16-bit signed multiply with single 32-bit accumulator.
 * \param [in] op1  first multiplicand word
 * \param [in] op2  second multiplicand word
 * \param [in] acc  accumulator; passed as the FIRST argument of __RV_KMADA
 * \return the result of __RV_KMADA(acc, op1, op2)
 */
__STATIC_FORCEINLINE long __SMLAD(unsigned long op1, unsigned long op2, long acc)
{
    return __RV_KMADA(acc, op1, op2);
}
/**
 * \brief Q setting pre-exchanged dual 16-bit signed multiply with single 32-bit accumulator.
 * \param [in] op1  first multiplicand word
 * \param [in] op2  second multiplicand word
 * \param [in] acc  accumulator; passed as the FIRST argument of __RV_KMAXDA
 * \return the result of __RV_KMAXDA(acc, op1, op2)
 */
__STATIC_FORCEINLINE long __SMLADX(unsigned long op1, unsigned long op2, long acc)
{
    return __RV_KMAXDA(acc, op1, op2);
}
/**
 * \brief Q setting dual 16-bit signed multiply with exchange subtract with 32-bit accumulate.
 * \param [in] op1  first multiplicand word
 * \param [in] op2  second multiplicand word
 * \param [in] acc  accumulator
 * \return \c acc minus the result of __RV_SMXDS(op1, op2)
 * \note   The subtraction is done in plain C after the intrinsic returns, so no
 *         saturation is applied by this wrapper itself.
 */
__STATIC_FORCEINLINE long __SMLSDX(unsigned long op1, unsigned long op2, long acc)
{
    return (acc - __RV_SMXDS(op1, op2));
}
/**
 * \brief Dual 16-bit signed multiply with single 64-bit accumulator.
 * \param [in] op1  first multiplicand word
 * \param [in] op2  second multiplicand word
 * \param [in] acc  64-bit accumulator; passed as the FIRST argument of __RV_SMALDA
 * \return the result of __RV_SMALDA(acc, op1, op2)
 */
__STATIC_FORCEINLINE long long __SMLALD(unsigned long op1, unsigned long op2, long long acc)
{
    return __RV_SMALDA(acc, op1, op2);
}
/**
 * \brief Dual 16-bit signed multiply with exchange with single 64-bit accumulator.
 * \param [in] op1  first multiplicand word
 * \param [in] op2  second multiplicand word
 * \param [in] acc  64-bit accumulator; passed as the FIRST argument of __RV_SMALXDA
 * \return the result of __RV_SMALXDA(acc, op1, op2)
 */
__STATIC_FORCEINLINE long long __SMLALDX(unsigned long op1, unsigned long op2, long long acc)
{
    return __RV_SMALXDA(acc, op1, op2);
}
/**
 * \brief Q setting sum of dual 16-bit signed multiply.
 * \param [in] op1  first operand, forwarded as the first argument of __RV_KMDA
 * \param [in] op2  second operand, forwarded as the second argument of __RV_KMDA
 * \return the result of __RV_KMDA(op1, op2), converted to \c long
 */
__STATIC_FORCEINLINE long __SMUAD(unsigned long op1, unsigned long op2)
{
    return __RV_KMDA(op1, op2);
}
/**
 * \brief Dual 16-bit signed multiply returning difference.
 * \param [in] op1  first operand, forwarded as the first argument of __RV_SMDRS
 * \param [in] op2  second operand, forwarded as the second argument of __RV_SMDRS
 * \return the result of __RV_SMDRS(op1, op2), converted to \c long
 */
__STATIC_FORCEINLINE long __SMUSD(unsigned long op1, unsigned long op2)
{
    return __RV_SMDRS(op1, op2);
}
/** \brief Dual extract 8-bits and sign extend each to 16-bits.
 *  \note  Forwards x unchanged to __RV_SUNPKD820. */
#define __SXTB16(x)             __RV_SUNPKD820(x)
/**
 * \brief Dual extracted 8-bit to 16-bit signed addition. TODO Need test
 * \param [in] op1  word added lane-wise via __RV_ADD16
 * \param [in] op2  word first passed through __RV_SUNPKD820, then added to \c op1
 * \return the result of __RV_ADD16(op1, __RV_SUNPKD820(op2))
 */
__STATIC_FORCEINLINE unsigned long __SXTAB16(unsigned long op1, unsigned long op2)
{
    return __RV_ADD16(op1, __RV_SUNPKD820(op2));
}
/** \brief __SXTAB16 applied to ARG1 and to ARG2 after ARG2 has been passed through __ROR(ARG2, ROTATE). */
#define __SXTAB16_RORn(ARG1, ARG2, ROTATE)        __SXTAB16(ARG1, __ROR(ARG2, ROTATE))

/**
 * \brief 32-bit signed multiply with 32-bit truncated accumulator.
 * \param [in] op1  first multiplicand, forwarded to __RV_SMMUL
 * \param [in] op2  second multiplicand, forwarded to __RV_SMMUL
 * \param [in] acc  value added to the intrinsic's result
 * \return \c acc plus the result of __RV_SMMUL(op1, op2), computed as \c long
 */
__STATIC_FORCEINLINE long __SMMLA(long op1, long op2, long acc)
{
    const long mul_result = __RV_SMMUL(op1, op2);

    return acc + mul_result;
}
/* Plain name aliases: each ARM-style / short name below expands to the __RV_* intrinsic
 * on its right with no argument reordering.
 * NOTE(review): __SSUB8 and __SADD8 map to __RV_KSUB8 / __RV_KADD8 (documented earlier in
 * this section as the saturating forms used for __QSUB8 / __QADD8), while __SADD16 maps to
 * the plain __RV_ADD16 — confirm this asymmetry is intended. */
#define __DKHM8                 __RV_DKHM8
#define __DKHM16                __RV_DKHM16
#define __DKSUB16               __RV_DKSUB16
#define __SMAQA                 __RV_SMAQA
#define __MULSR64               __RV_MULSR64
#define __DQADD8                __RV_DKADD8
#define __DQSUB8                __RV_DKSUB8
#define __DKADD16               __RV_DKADD16
#define __PKBB16                __RV_PKBB16
#define __DKSLRA16              __RV_DKSLRA16
#define __DKSLRA8               __RV_DKSLRA8
#define __KABSW                 __RV_KABSW
#define __DKABS8                __RV_DKABS8
#define __DKABS16               __RV_DKABS16
#define __SMALDA                __RV_SMALDA
#define __SMSLDA                __RV_SMSLDA
#define __SMALBB                __RV_SMALBB
#define __SUB64                 __RV_SUB64
#define __ADD64                 __RV_ADD64
#define __SMBB16                __RV_SMBB16
#define __SMBT16                __RV_SMBT16
#define __SMTT16                __RV_SMTT16
#define __EXPD80                __RV_EXPD80
#define __SMAX8                 __RV_SMAX8
#define __SMAX16                __RV_SMAX16
#define __PKTT16                __RV_PKTT16
#define __KADD16                __RV_KADD16
#define __SADD16                __RV_ADD16
#define __SSUB8                 __RV_KSUB8
#define __SADD8                 __RV_KADD8
#define __USAT16                __RV_UCLIP16
#define __SMALTT                __RV_SMALTT

/**
 * \brief   Halfword packing instruction. Combines bits[15:0] of ARG1 with bits[31:16] of
 *          ARG2 left-shifted by ARG3.
 * \details Shift amounts 0 and 16 are dispatched to __RV_PKTB16 / __RV_PKBB16; every other
 *          amount uses the generic mask-and-shift expression on 32-bit values.
 * \note    ARG3 is evaluated more than once, so it must not have side effects. Every use
 *          of a macro argument is parenthesized so compound expressions (e.g. a | b)
 *          select the correct branch.
 */
#define __PKHBT(ARG1, ARG2, ARG3)  (((ARG3) == 0) ? __RV_PKTB16((ARG2), (ARG1)) :        \
                                   ((ARG3) == 16) ? __RV_PKBB16((ARG2), (ARG1)) :        \
                                   (((((uint32_t)(ARG1))          ) & 0x0000FFFFUL) |    \
                                   ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)))

/**
 * \brief   Halfword packing instruction. Combines bits[31:16] of ARG1 with bits[15:0] of
 *          ARG2 right-shifted by ARG3.
 * \details Shift amounts 0 and 16 are dispatched to __RV_PKTB16 / __RV_PKTT16; every other
 *          amount uses the generic mask-and-shift expression on 32-bit values.
 * \note    ARG3 is evaluated more than once, so it must not have side effects. Every use
 *          of a macro argument is parenthesized so compound expressions (e.g. a | b)
 *          select the correct branch.
 */
#define __PKHTB(ARG1, ARG2, ARG3)  (((ARG3) == 0) ? __RV_PKTB16((ARG1), (ARG2)) :        \
                                   ((ARG3) == 16) ? __RV_PKTT16((ARG1), (ARG2)) :        \
                                   (((((uint32_t)(ARG1))          ) & 0xFFFF0000UL) |    \
                                   ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)))

#if __RISCV_XLEN == 64
/**
 * \brief   Halfword packing instruction, applied to both 32-bit halves of a 64-bit value.
 *          For each half, combines bits[15:0] of ARG1 with bits[31:16] of ARG2 left-shifted
 *          by ARG3, then packs the two 32-bit results into one 64-bit value.
 * \details Shift amounts 0 and 16 are dispatched to __RV_PKTB16 / __RV_PKBB16; every other
 *          amount uses the generic expression. The upper half is assembled in uint64_t so
 *          the final "<< 32" cannot overflow a signed type when bit 31 of that half is set.
 * \note    ARG1, ARG2 and ARG3 may be evaluated more than once; they must not have side
 *          effects. All argument uses are parenthesized so compound expressions are safe.
 */
#define __PKHBT64(ARG1, ARG2, ARG3)  (((ARG3) == 0) ? __RV_PKTB16((ARG2), (ARG1)) :            \
                                   ((ARG3) == 16) ? __RV_PKBB16((ARG2), (ARG1)) :             \
                                   ((uint64_t)((((uint32_t)((uint64_t)(ARG1) >> 32)) & 0x0000FFFFUL) |           \
                                   ((((uint32_t)((uint64_t)(ARG2) >> 32)) << (ARG3)) & 0xFFFF0000UL)) << 32) |   \
                                   ((uint64_t)(((((uint32_t)(ARG1))) & 0x0000FFFFUL) |                           \
                                   ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)) & 0xFFFFFFFFUL))

/**
 * \brief   Halfword packing instruction, applied to both 32-bit halves of a 64-bit value.
 *          For each half, combines bits[31:16] of ARG1 with bits[15:0] of ARG2 right-shifted
 *          by ARG3, then packs the two 32-bit results into one 64-bit value.
 * \details Shift amounts 0 and 16 are dispatched to __RV_PKTB16 / __RV_PKTT16; every other
 *          amount uses the generic expression.
 * \note    ARG1, ARG2 and ARG3 may be evaluated more than once; they must not have side
 *          effects. All argument uses are parenthesized so compound expressions are safe.
 */
#define __PKHTB64(ARG1, ARG2, ARG3)  (((ARG3) == 0) ? __RV_PKTB16((ARG1), (ARG2)) :            \
                                   ((ARG3) == 16) ? __RV_PKTT16((ARG1), (ARG2)) :             \
                                   ((uint64_t)(((uint32_t)((uint64_t)(ARG1) >> 32) & 0xFFFF0000UL) |            \
                                   ((((uint32_t)((uint64_t)(ARG2) >> 32)) >> (ARG3)) & 0x0000FFFFUL)) << 32) |  \
                                   ((uint64_t)(((uint32_t)(ARG1) & 0xFFFF0000UL) |                              \
                                   ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)) & 0xFFFFFFFFUL))
#else
/**
 * \brief   Halfword packing instruction, applied to both 32-bit halves of a 64-bit value.
 *          For each half, combines bits[15:0] of ARG1 with bits[31:16] of ARG2 left-shifted
 *          by ARG3, then packs the two 32-bit results into one 64-bit value.
 * \details Shift amounts 0 and 16 are dispatched to __RV_DPKTB16 / __RV_DPKBB16; every other
 *          amount uses the generic expression. The upper half is assembled in uint64_t so
 *          the final "<< 32" cannot overflow a signed type when bit 31 of that half is set.
 * \note    ARG1, ARG2 and ARG3 may be evaluated more than once; they must not have side
 *          effects. All argument uses are parenthesized so compound expressions are safe.
 */
#define __PKHBT64(ARG1, ARG2, ARG3)  (((ARG3) == 0) ? __RV_DPKTB16((ARG2), (ARG1)) :           \
                                   ((ARG3) == 16) ? __RV_DPKBB16((ARG2), (ARG1)) :            \
                                   ((uint64_t)((((uint32_t)((uint64_t)(ARG1) >> 32)) & 0x0000FFFFUL) |           \
                                   ((((uint32_t)((uint64_t)(ARG2) >> 32)) << (ARG3)) & 0xFFFF0000UL)) << 32) |   \
                                   ((uint64_t)(((((uint32_t)(ARG1))) & 0x0000FFFFUL) |                           \
                                   ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)) & 0xFFFFFFFFUL))

/**
 * \brief   Halfword packing instruction, applied to both 32-bit halves of a 64-bit value.
 *          For each half, combines bits[31:16] of ARG1 with bits[15:0] of ARG2 right-shifted
 *          by ARG3, then packs the two 32-bit results into one 64-bit value.
 * \details Shift amounts 0 and 16 are dispatched to __RV_DPKTB16 / __RV_DPKTT16; every other
 *          amount uses the generic expression.
 * \note    ARG1, ARG2 and ARG3 may be evaluated more than once; they must not have side
 *          effects. All argument uses are parenthesized so compound expressions are safe.
 */
#define __PKHTB64(ARG1, ARG2, ARG3)  (((ARG3) == 0) ? __RV_DPKTB16((ARG1), (ARG2)) :           \
                                   ((ARG3) == 16) ? __RV_DPKTT16((ARG1), (ARG2)) :            \
                                   ((uint64_t)(((uint32_t)((uint64_t)(ARG1) >> 32) & 0xFFFF0000UL) |            \
                                   ((((uint32_t)((uint64_t)(ARG2) >> 32)) >> (ARG3)) & 0x0000FFFFUL)) << 32) |  \
                                   ((uint64_t)(((uint32_t)(ARG1) & 0xFFFF0000UL) |                              \
                                   ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)) & 0xFFFFFFFFUL))
#endif /* __RISCV_XLEN == 64 */

/** \brief Rotate first, then extract: ARG1 is passed through __ROR(ARG1, ARG2) and the
 *         rotated value is handed to __RV_SUNPKD820.
 *  \note  This form is better suited to the Arm compiler, which can rotate and extract
 *         in a single instruction. */
#define __SXTB16_RORn(ARG1, ARG2)   __RV_SUNPKD820(__ROR(ARG1, ARG2))

#endif /* defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) */

#ifdef __cplusplus
}
#endif

#endif /* __CORE_FEATURE_DSP__ */