这个
exp
功能来自
avx_mathfun
使用范围缩减结合切比雪夫近似多项式来计算8
经验值
-与AVX指令并行。使用正确的编译器设置以确保
addps
和
mulps
在可能的情况下,融合到FMA说明中。
改编原作非常简单
经验值
代码来自
avx\U mathfun
可移植(跨不同编译器)C/AVX2内部代码。原始代码使用gcc样式的对齐属性和巧妙的宏
_mm256_set1_ps()
相反,它位于小测试代码和表的下面。修改后的代码需要AVX2。
以下代码用于简单测试:
int main(){
int i;
float xv[8];
float yv[8];
__m256 x = _mm256_setr_ps(1.0f, 2.0f, 3.0f ,4.0f ,5.0f, 6.0f, 7.0f, 8.0f);
__m256 y = exp256_ps(x);
_mm256_store_ps(xv,x);
_mm256_store_ps(yv,y);
for (i=0;i<8;i++){
printf("i = %i, x = %e, y = %e \n",i,xv[i],yv[i]);
}
return 0;
}
输出似乎正常:
i = 0, x = 1.000000e+00, y = 2.718282e+00
i = 1, x = 2.000000e+00, y = 7.389056e+00
i = 2, x = 3.000000e+00, y = 2.008554e+01
i = 3, x = 4.000000e+00, y = 5.459815e+01
i = 4, x = 5.000000e+00, y = 1.484132e+02
i = 5, x = 6.000000e+00, y = 4.034288e+02
i = 6, x = 7.000000e+00, y = 1.096633e+03
i = 7, x = 8.000000e+00, y = 2.980958e+03
修改后的代码(AVX2)为:
#include <stdio.h>
#include <immintrin.h>
/* gcc -O3 -m64 -Wall -mavx2 -march=broadwell expc.c */
__m256 exp256_ps(__m256 x) {
/* Modified code. The original code is here: https://github.com/reyoung/avx_mathfun
AVX implementation of exp
Based on "sse_mathfun.h", by Julien Pommier
http://gruntthepeon.free.fr/ssemath/
Copyright (C) 2012 Giovanni Garberoglio
Interdisciplinary Laboratory for Computational Science (LISC)
Fondazione Bruno Kessler and University of Trento
via Sommarive, 18
I-38123 Trento (Italy)
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
/*
To increase the compatibility across different compilers the original code is
converted to plain AVX2 intrinsics code without ingenious macro's,
gcc style alignment attributes etc. The modified code requires AVX2
*/
__m256 exp_hi = _mm256_set1_ps(88.3762626647949f);
__m256 exp_lo = _mm256_set1_ps(-88.3762626647949f);
__m256 cephes_LOG2EF = _mm256_set1_ps(1.44269504088896341);
__m256 cephes_exp_C1 = _mm256_set1_ps(0.693359375);
__m256 cephes_exp_C2 = _mm256_set1_ps(-2.12194440e-4);
__m256 cephes_exp_p0 = _mm256_set1_ps(1.9875691500E-4);
__m256 cephes_exp_p1 = _mm256_set1_ps(1.3981999507E-3);
__m256 cephes_exp_p2 = _mm256_set1_ps(8.3334519073E-3);
__m256 cephes_exp_p3 = _mm256_set1_ps(4.1665795894E-2);
__m256 cephes_exp_p4 = _mm256_set1_ps(1.6666665459E-1);
__m256 cephes_exp_p5 = _mm256_set1_ps(5.0000001201E-1);
__m256 tmp = _mm256_setzero_ps(), fx;
__m256i imm0;
__m256 one = _mm256_set1_ps(1.0f);
x = _mm256_min_ps(x, exp_hi);
x = _mm256_max_ps(x, exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
fx = _mm256_mul_ps(x, cephes_LOG2EF);
fx = _mm256_add_ps(fx, _mm256_set1_ps(0.5f));
tmp = _mm256_floor_ps(fx);
__m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
mask = _mm256_and_ps(mask, one);
fx = _mm256_sub_ps(tmp, mask);
tmp = _mm256_mul_ps(fx, cephes_exp_C1);
__m256 z = _mm256_mul_ps(fx, cephes_exp_C2);
x = _mm256_sub_ps(x, tmp);
x = _mm256_sub_ps(x, z);
z = _mm256_mul_ps(x,x);
__m256 y = cephes_exp_p0;
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p1);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p2);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p3);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p4);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p5);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, x);
y = _mm256_add_ps(y, one);
/* build 2^n */
imm0 = _mm256_cvttps_epi32(fx);
imm0 = _mm256_add_epi32(imm0, _mm256_set1_epi32(0x7f));
imm0 = _mm256_slli_epi32(imm0, 23);
__m256 pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
return y;
}
int main(){
int i;
float xv[8];
float yv[8];
__m256 x = _mm256_setr_ps(1.0f, 2.0f, 3.0f ,4.0f ,5.0f, 6.0f, 7.0f, 8.0f);
__m256 y = exp256_ps(x);
_mm256_store_ps(xv,x);
_mm256_store_ps(yv,y);
for (i=0;i<8;i++){
printf("i = %i, x = %e, y = %e \n",i,xv[i],yv[i]);
}
return 0;
}
像
@Peter Cordes points out
,
应该可以更换
_mm256_floor_ps(fx + 0.5f)
通过
_mm256_round_ps(fx)
. 此外
mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
接下来的两行似乎是多余的。
通过组合可以进一步优化
cephes_exp_C1
和
cephes_exp_C2
进入
inv_LOG2EF
.
这导致以下代码未经过彻底测试!
#include <stdio.h>
#include <immintrin.h>
#include <math.h>
/* gcc -O3 -m64 -Wall -mavx2 -march=broadwell expc.c -lm */
__m256 exp256_ps(__m256 x) {
/* Modified code from this source: https://github.com/reyoung/avx_mathfun
AVX implementation of exp
Based on "sse_mathfun.h", by Julien Pommier
http://gruntthepeon.free.fr/ssemath/
Copyright (C) 2012 Giovanni Garberoglio
Interdisciplinary Laboratory for Computational Science (LISC)
Fondazione Bruno Kessler and University of Trento
via Sommarive, 18
I-38123 Trento (Italy)
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
/*
To increase the compatibility across different compilers the original code is
converted to plain AVX2 intrinsics code without ingenious macro's,
gcc style alignment attributes etc.
Moreover, the part "express exp(x) as exp(g + n*log(2))" has been significantly simplified.
This modified code is not thoroughly tested!
*/
__m256 exp_hi = _mm256_set1_ps(88.3762626647949f);
__m256 exp_lo = _mm256_set1_ps(-88.3762626647949f);
__m256 cephes_LOG2EF = _mm256_set1_ps(1.44269504088896341f);
__m256 inv_LOG2EF = _mm256_set1_ps(0.693147180559945f);
__m256 cephes_exp_p0 = _mm256_set1_ps(1.9875691500E-4);
__m256 cephes_exp_p1 = _mm256_set1_ps(1.3981999507E-3);
__m256 cephes_exp_p2 = _mm256_set1_ps(8.3334519073E-3);
__m256 cephes_exp_p3 = _mm256_set1_ps(4.1665795894E-2);
__m256 cephes_exp_p4 = _mm256_set1_ps(1.6666665459E-1);
__m256 cephes_exp_p5 = _mm256_set1_ps(5.0000001201E-1);
__m256 fx;
__m256i imm0;
__m256 one = _mm256_set1_ps(1.0f);
x = _mm256_min_ps(x, exp_hi);
x = _mm256_max_ps(x, exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
fx = _mm256_mul_ps(x, cephes_LOG2EF);
fx = _mm256_round_ps(fx, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
__m256 z = _mm256_mul_ps(fx, inv_LOG2EF);
x = _mm256_sub_ps(x, z);
z = _mm256_mul_ps(x,x);
__m256 y = cephes_exp_p0;
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p1);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p2);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p3);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p4);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p5);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, x);
y = _mm256_add_ps(y, one);
/* build 2^n */
imm0 = _mm256_cvttps_epi32(fx);
imm0 = _mm256_add_epi32(imm0, _mm256_set1_epi32(0x7f));
imm0 = _mm256_slli_epi32(imm0, 23);
__m256 pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
return y;
}
int main(){
int i;
float xv[8];
float yv[8];
__m256 x = _mm256_setr_ps(11.0f, -12.0f, 13.0f ,-14.0f ,15.0f, -16.0f, 17.0f, -18.0f);
__m256 y = exp256_ps(x);
_mm256_store_ps(xv,x);
_mm256_store_ps(yv,y);
/* compare exp256_ps with the double precision exp from math.h,
print the relative error */
printf("i x y = exp256_ps(x) double precision exp relative error\n\n");
for (i=0;i<8;i++){
printf("i = %i x =%16.9e y =%16.9e exp_dbl =%16.9e rel_err =%16.9e\n",
i,xv[i],yv[i],exp((double)(xv[i])),
((double)(yv[i])-exp((double)(xv[i])))/exp((double)(xv[i])) );
}
return 0;
}
下表通过将exp256\U ps与双精度进行比较,给出了某些点的精度印象
经验值
从…起
math.h
.
相对误差在最后一列。
i x y = exp256_ps(x) double precision exp relative error
i = 0 x = 1.000000000e+00 y = 2.718281746e+00 exp_dbl = 2.718281828e+00 rel_err =-3.036785947e-08
i = 1 x =-2.000000000e+00 y = 1.353352815e-01 exp_dbl = 1.353352832e-01 rel_err =-1.289636419e-08
i = 2 x = 3.000000000e+00 y = 2.008553696e+01 exp_dbl = 2.008553692e+01 rel_err = 1.672817689e-09
i = 3 x =-4.000000000e+00 y = 1.831563935e-02 exp_dbl = 1.831563889e-02 rel_err = 2.501162103e-08
i = 4 x = 5.000000000e+00 y = 1.484131622e+02 exp_dbl = 1.484131591e+02 rel_err = 2.108215155e-08
i = 5 x =-6.000000000e+00 y = 2.478752285e-03 exp_dbl = 2.478752177e-03 rel_err = 4.380257261e-08
i = 6 x = 7.000000000e+00 y = 1.096633179e+03 exp_dbl = 1.096633158e+03 rel_err = 1.849522682e-08
i = 7 x =-8.000000000e+00 y = 3.354626242e-04 exp_dbl = 3.354626279e-04 rel_err =-1.101575118e-08