/****************************************************************************** * @file riscv_vec_math_f16.h * @brief Public header file for NMSIS DSP Library * @version V1.10.0 * @date 08 July 2021 * Target Processor: RISC-V Cores ******************************************************************************/ /* * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved. * Copyright (c) 2019 Nuclei Limited. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the License); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an AS IS BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef RISCV_VEC_MATH_F16_H #define RISCV_VEC_MATH_F16_H #include "riscv_math_types_f16.h" #include "riscv_common_tables_f16.h" #ifdef __cplusplus extern "C" { #endif #if defined(RISCV_MATH_VECTOR_FLOAT16) // Copyright 2021 Tencent // SPDX-License-Identifier: BSD-3-Clause // ref: https://github.com/Tencent/ncnn/blob/master/src/layer/riscv/rvv_mathfun_fp16s.h #include #define c_inv_mant_mask_f16 -31745 // ~0x7c00u #define c_cephes_SQRTHF 0.707106781186547524 #define c_cephes_log_p0 7.0376836292e-2 #define c_cephes_log_p1 -1.1514610310e-1 #define c_cephes_log_p2 1.1676998740e-1 #define c_cephes_log_p3 -1.2420140846e-1 #define c_cephes_log_p4 +1.4249322787e-1 #define c_cephes_log_p5 -1.6668057665e-1 #define c_cephes_log_p6 +2.0000714765e-1 #define c_cephes_log_p7 -2.4999993993e-1 #define c_cephes_log_p8 +3.3333331174e-1 #define c_cephes_log_q1 -2.12194440e-4 #define c_cephes_log_q2 0.693359375 #define _RVV_FLOAT16_LOG_OP(LMUL, MLEN) \ static inline vfloat16m##LMUL##_t log_ps_f16_m##LMUL(vfloat16m##LMUL##_t x, size_t vl) \ { \ x = __riscv_vfmax_vf_f16m##LMUL(x, (float16_t)0.f, vl); /* force flush to zero on denormal values */ \ vbool##MLEN##_t invalid_mask = __riscv_vmfle_vf_f16m##LMUL##_b##MLEN(x, (float16_t)0.f, vl); \ \ vint16m##LMUL##_t ux = __riscv_vreinterpret_v_f16m##LMUL##_i16m##LMUL(x); \ \ vint16m##LMUL##_t emm0 = __riscv_vsra_vx_i16m##LMUL(ux, 10, vl); \ \ /* keep only the fractional part */ \ ux = __riscv_vand_vx_i16m##LMUL(ux, c_inv_mant_mask_f16, vl); \ ux = __riscv_vor_vx_i16m##LMUL(ux, 14336 /* reinterpret_cast((float16_t)0.5) */, vl); \ x = __riscv_vreinterpret_v_i16m##LMUL##_f16m##LMUL(ux); \ \ emm0 = __riscv_vsub_vx_i16m##LMUL(emm0, 0xf, vl); \ vfloat16m##LMUL##_t e = __riscv_vfcvt_f_x_v_f16m##LMUL(emm0, vl); \ \ e = __riscv_vfadd_vf_f16m##LMUL(e, (float16_t)1.f, vl); \ \ /* part2: */ \ /* if( x < SQRTHF ) { */ \ /* e -= 1; */ \ /* x = x + x - 1.0; */ \ /* } else { x = x - 1.0; } */ \ vbool##MLEN##_t mask = __riscv_vmflt_vf_f16m##LMUL##_b##MLEN(x, (float16_t)c_cephes_SQRTHF, vl); \ x = __riscv_vfadd_vv_f16m##LMUL##_mu(mask, x, x, x, vl); \ x = __riscv_vfsub_vf_f16m##LMUL(x, (float16_t)1.f, vl); \ e = __riscv_vfsub_vf_f16m##LMUL##_mu(mask, e, e, (float16_t)1.f, vl); \ \ vfloat16m##LMUL##_t z = __riscv_vfmul_vv_f16m##LMUL(x, x, vl); \ \ vfloat16m##LMUL##_t y = __riscv_vfmul_vf_f16m##LMUL(x, (float16_t)c_cephes_log_p0, vl); \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)c_cephes_log_p1, vl); \ y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)c_cephes_log_p2, vl); \ y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)c_cephes_log_p3, vl); \ y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)c_cephes_log_p4, vl); \ y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)c_cephes_log_p5, vl); \ y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)c_cephes_log_p6, vl); \ y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)c_cephes_log_p7, vl); \ y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)c_cephes_log_p8, vl); \ y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ \ y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \ \ vfloat16m##LMUL##_t tmp = __riscv_vfmul_vf_f16m##LMUL(e, (float16_t)c_cephes_log_q1, vl); \ y = __riscv_vfadd_vv_f16m##LMUL(y, tmp, vl); \ \ tmp = __riscv_vfmul_vf_f16m##LMUL(z, (float16_t)0.5f, vl); \ y = __riscv_vfsub_vv_f16m##LMUL(y, tmp, vl); \ \ tmp = __riscv_vfmul_vf_f16m##LMUL(e, (float16_t)c_cephes_log_q2, vl); \ x = __riscv_vfadd_vv_f16m##LMUL(x, y, vl); \ x = __riscv_vfadd_vv_f16m##LMUL(x, tmp, vl); \ /* negative arg will be NAN */ \ vuint16m##LMUL##_t xtmp = __riscv_vreinterpret_v_f16m##LMUL##_u16m##LMUL(x); \ x = __riscv_vreinterpret_v_u16m##LMUL##_f16m##LMUL(__riscv_vor_vx_u16m##LMUL##_mu(invalid_mask, xtmp, xtmp, 0xffff, vl)); \ return x; \ } _RVV_FLOAT16_LOG_OP(1, 16) _RVV_FLOAT16_LOG_OP(2, 8) _RVV_FLOAT16_LOG_OP(4, 4) _RVV_FLOAT16_LOG_OP(8, 2) #define c_exp_hi_f16 10.7421875f #define c_exp_lo_f16 -10.7421875f #define c_cephes_LOG2EF 1.44269504088896341 #define c_cephes_exp_C1 0.693359375 #define c_cephes_exp_C2 -2.12194440e-4 #define c_cephes_exp_p0 1.9875691500e-4 #define c_cephes_exp_p1 1.3981999507e-3 #define c_cephes_exp_p2 8.3334519073e-3 #define c_cephes_exp_p3 4.1665795894e-2 #define c_cephes_exp_p4 1.6666665459e-1 #define c_cephes_exp_p5 5.0000001201e-1 #define _RVV_FLOAT16_EXP_OP(LMUL, MLEN) \ static inline vfloat16m##LMUL##_t exp_ps_f16_m##LMUL(vfloat16m##LMUL##_t x, size_t vl) \ { \ vfloat16m##LMUL##_t tmp, fx; \ \ x = __riscv_vfmin_vf_f16m##LMUL(x, (float16_t)c_exp_hi_f16, vl); \ x = __riscv_vfmax_vf_f16m##LMUL(x, (float16_t)c_exp_lo_f16, vl); \ \ /* express exp(x) as exp(g + n*log(2)) */ \ fx = __riscv_vfmacc_vf_f16m##LMUL(__riscv_vfmv_v_f_f16m##LMUL((float16_t)0.5f, vl), (float16_t)c_cephes_LOG2EF, x, vl); \ \ /* perform a floorf */ \ tmp = __riscv_vfcvt_f_x_v_f16m##LMUL(__riscv_vfcvt_x_f_v_i16m##LMUL(fx, vl), vl); \ \ /* if greater, substract 1 */ \ vbool##MLEN##_t mask = __riscv_vmfgt_vv_f16m##LMUL##_b##MLEN(tmp, fx, vl); \ fx = __riscv_vfsub_vf_f16m##LMUL##_mu(mask, tmp, tmp, (float16_t)1.f, vl); \ \ tmp = __riscv_vfmul_vf_f16m##LMUL(fx, (float16_t)c_cephes_exp_C1, vl); \ vfloat16m##LMUL##_t z = __riscv_vfmul_vf_f16m##LMUL(fx, (float16_t)c_cephes_exp_C2, vl); \ x = __riscv_vfsub_vv_f16m##LMUL(x, tmp, vl); \ x = __riscv_vfsub_vv_f16m##LMUL(x, z, vl); \ \ vfloat16m##LMUL##_t y = __riscv_vfmul_vf_f16m##LMUL(x, (float16_t)c_cephes_exp_p0, vl); \ z = __riscv_vfmul_vv_f16m##LMUL(x, x, vl); \ \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)c_cephes_exp_p1, vl); \ y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)c_cephes_exp_p2, vl); \ y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)c_cephes_exp_p3, vl); \ y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)c_cephes_exp_p4, vl); \ y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)c_cephes_exp_p5, vl); \ \ y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \ y = __riscv_vfadd_vv_f16m##LMUL(y, x, vl); \ y = __riscv_vfadd_vf_f16m##LMUL(y, (float16_t)1.f, vl); \ \ /* build 2^n */ \ vint16m##LMUL##_t mm = __riscv_vfcvt_x_f_v_i16m##LMUL(fx, vl); \ mm = __riscv_vadd_vx_i16m##LMUL(mm, 0xf, vl); \ mm = __riscv_vsll_vx_i16m##LMUL(mm, 10, vl); \ vfloat16m##LMUL##_t pow2n = __riscv_vreinterpret_v_i16m##LMUL##_f16m##LMUL(mm); \ \ y = __riscv_vfmul_vv_f16m##LMUL(y, pow2n, vl); \ return y; \ } _RVV_FLOAT16_EXP_OP(1, 16) _RVV_FLOAT16_EXP_OP(2, 8) _RVV_FLOAT16_EXP_OP(4, 4) _RVV_FLOAT16_EXP_OP(8, 2) #define c_minus_cephes_DP1 -0.78515625 #define c_minus_cephes_DP2 -2.4187564849853515625e-4 #define c_minus_cephes_DP3 -3.77489497744594108e-8 #define c_sincof_p0 -1.9515295891e-4 #define c_sincof_p1 8.3321608736e-3 #define c_sincof_p2 -1.6666654611e-1 #define c_coscof_p0 2.443315711809948e-005 #define c_coscof_p1 -1.388731625493765e-003 #define c_coscof_p2 4.166664568298827e-002 #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI #define _RVV_FLOAT16_SINCOS_OP(LMUL, MLEN) \ static inline void sincos_ps_f16_m##LMUL(vfloat16m##LMUL##_t x, vfloat16m##LMUL##_t* ysin, vfloat16m##LMUL##_t* ycos, size_t vl) \ { \ /* any x */ \ vfloat16m##LMUL##_t xmm1, xmm2, xmm3, y; \ \ vuint16m##LMUL##_t emm2; \ \ vbool##MLEN##_t sign_mask_sin, sign_mask_cos; \ sign_mask_sin = __riscv_vmflt_vf_f16m##LMUL##_b##MLEN(x, (float16_t)0.f, vl); \ x = __riscv_vfsgnj_vf_f16m##LMUL(x, (float16_t)1.f, vl); \ \ /* scale by 4/Pi */ \ y = __riscv_vfmul_vf_f16m##LMUL(x, (float16_t)c_cephes_FOPI, vl); \ \ /* store the integer part of y in mm0 */ \ emm2 = __riscv_vfcvt_xu_f_v_u16m##LMUL(y, vl); \ /* j=(j+1) & (~1) (see the cephes sources) */ \ emm2 = __riscv_vadd_vx_u16m##LMUL(emm2, 1, vl); \ emm2 = __riscv_vand_vx_u16m##LMUL(emm2, ~1, vl); \ y = __riscv_vfcvt_f_xu_v_f16m##LMUL(emm2, vl); \ \ /* get the polynom selection mask */ \ /* there is one polynom for 0 <= x <= Pi/4 */ \ /* and another one for Pi/4