The exp function of avx_mathfun uses range reduction in combination with a Chebyshev approximating polynomial to compute eight exponentials in parallel with AVX instructions. Use the right compiler settings to make sure that addps and mulps are contracted into FMA instructions where possible.
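In other words, exp(x) is split as exp(x) = 2^n * exp(g), with n = round(x*log2(e)) and g = x - n*ln(2), so only the small reduced argument g is fed to the polynomial. A minimal scalar sketch of that idea (just an illustration, not the avx_mathfun code; the placeholder polynomial here is far less accurate than the real one):

#include <math.h>

/* Scalar illustration of the range reduction used below:
   exp(x) = 2^n * exp(g),  n = round(x*log2(e)),  g = x - n*ln(2).    */
static float exp_range_reduction_sketch(float x)
{
    float n = rintf(x * 1.44269504f);      /* n = round(x * log2(e))  */
    float g = x - n * 0.693147181f;        /* |g| <= 0.5*ln(2)        */
    float p = 1.0f + g + 0.5f * g * g;     /* placeholder polynomial  */
    return ldexpf(p, (int)n);              /* scale result by 2^n     */
}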
It is quite easy to adapt the exp source code from avx_mathfun to portable (across different compilers) C / AVX2 intrinsics code. The original source uses gcc-style alignment attributes and ingenious macros. A modified version that instead relies on the standard _mm256_set1_ps() is given below the small test code and table. The modified code requires AVX2.
The following code is used for a simple test:
int main(){
  int i;
  float xv[8];
  float yv[8];
  __m256 x = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
  __m256 y = exp256_ps(x);

  _mm256_storeu_ps(xv, x);   /* storeu: xv and yv are not guaranteed to be 32-byte aligned */
  _mm256_storeu_ps(yv, y);

  for (i = 0; i < 8; i++){
    printf("i = %i, x = %e, y = %e \n", i, xv[i], yv[i]);
  }
  return 0;
}
The output seems ok:
i = 0, x = 1.000000e+00, y = 2.718282e+00 
i = 1, x = 2.000000e+00, y = 7.389056e+00 
i = 2, x = 3.000000e+00, y = 2.008554e+01 
i = 3, x = 4.000000e+00, y = 5.459815e+01 
i = 4, x = 5.000000e+00, y = 1.484132e+02 
i = 5, x = 6.000000e+00, y = 4.034288e+02 
i = 6, x = 7.000000e+00, y = 1.096633e+03 
i = 7, x = 8.000000e+00, y = 2.980958e+03 
Modified Code (AVX2):
#include <stdio.h>
#include <immintrin.h>

/*    gcc -O3 -m64 -Wall -mavx2 -march=broadwell  expc.c    */

__m256 exp256_ps(__m256 x) {
/* Modified code. The original code is here: https://github.com/reyoung/avx_mathfun

   AVX implementation of exp
   Based on "sse_mathfun.h", by Julien Pommier
   http://gruntthepeon.free.fr/ssemath/

   Copyright (C) 2012 Giovanni Garberoglio
   Interdisciplinary Laboratory for Computational Science (LISC)
   Fondazione Bruno Kessler and University of Trento
   via Sommarive, 18
   I-38123 Trento (Italy)

  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  (this is the zlib license)
*/

/*
  To increase the compatibility across different compilers the original code is
  converted to plain AVX2 intrinsics code without ingenious macro's,
  gcc style alignment attributes etc. The modified code requires AVX2.
*/

__m256   exp_hi        = _mm256_set1_ps(88.3762626647949f);
__m256   exp_lo        = _mm256_set1_ps(-88.3762626647949f);

__m256   cephes_LOG2EF = _mm256_set1_ps(1.44269504088896341);
__m256   cephes_exp_C1 = _mm256_set1_ps(0.693359375);
__m256   cephes_exp_C2 = _mm256_set1_ps(-2.12194440e-4);

__m256   cephes_exp_p0 = _mm256_set1_ps(1.9875691500E-4);
__m256   cephes_exp_p1 = _mm256_set1_ps(1.3981999507E-3);
__m256   cephes_exp_p2 = _mm256_set1_ps(8.3334519073E-3);
__m256   cephes_exp_p3 = _mm256_set1_ps(4.1665795894E-2);
__m256   cephes_exp_p4 = _mm256_set1_ps(1.6666665459E-1);
__m256   cephes_exp_p5 = _mm256_set1_ps(5.0000001201E-1);
__m256   tmp           = _mm256_setzero_ps(), fx;
__m256i  imm0;
__m256   one           = _mm256_set1_ps(1.0f);

        x     = _mm256_min_ps(x, exp_hi);
        x     = _mm256_max_ps(x, exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
        fx    = _mm256_mul_ps(x, cephes_LOG2EF);
        fx    = _mm256_add_ps(fx, _mm256_set1_ps(0.5f));
        tmp   = _mm256_floor_ps(fx);
__m256  mask  = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
        mask  = _mm256_and_ps(mask, one);
        fx    = _mm256_sub_ps(tmp, mask);
        tmp   = _mm256_mul_ps(fx, cephes_exp_C1);
__m256  z     = _mm256_mul_ps(fx, cephes_exp_C2);
        x     = _mm256_sub_ps(x, tmp);
        x     = _mm256_sub_ps(x, z);
        z     = _mm256_mul_ps(x, x);

__m256  y     = cephes_exp_p0;
        y     = _mm256_mul_ps(y, x);
        y     = _mm256_add_ps(y, cephes_exp_p1);
        y     = _mm256_mul_ps(y, x);
        y     = _mm256_add_ps(y, cephes_exp_p2);
        y     = _mm256_mul_ps(y, x);
        y     = _mm256_add_ps(y, cephes_exp_p3);
        y     = _mm256_mul_ps(y, x);
        y     = _mm256_add_ps(y, cephes_exp_p4);
        y     = _mm256_mul_ps(y, x);
        y     = _mm256_add_ps(y, cephes_exp_p5);
        y     = _mm256_mul_ps(y, z);
        y     = _mm256_add_ps(y, x);
        y     = _mm256_add_ps(y, one);

  /* build 2^n */
        imm0  = _mm256_cvttps_epi32(fx);
        imm0  = _mm256_add_epi32(imm0, _mm256_set1_epi32(0x7f));
        imm0  = _mm256_slli_epi32(imm0, 23);
__m256  pow2n = _mm256_castsi256_ps(imm0);
        y     = _mm256_mul_ps(y, pow2n);
        return y;
}

int main(){
  int i;
  float xv[8];
  float yv[8];
  __m256 x = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
  __m256 y = exp256_ps(x);

  _mm256_storeu_ps(xv, x);   /* storeu: xv and yv are not guaranteed to be 32-byte aligned */
  _mm256_storeu_ps(yv, y);

  for (i = 0; i < 8; i++){
    printf("i = %i, x = %e, y = %e \n", i, xv[i], yv[i]);
  }
  return 0;
}
As @Peter Cordes points out, _mm256_floor_ps(fx + 0.5f) should be replaced with _mm256_round_ps(fx). Moreover, the line mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); and the following two lines seem redundant. Further optimization is possible by combining cephes_exp_C1 and cephes_exp_C2 into a single constant inv_LOG2EF. The changed range-reduction step is sketched right below, and the resulting full code, which has not been fully tested, follows after it.
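For clarity, the original and the simplified rounding steps side by side (fragments taken from the code blocks above and below, not standalone code):

/* original: floor(fx + 0.5) plus a compare/and/sub correction */
fx    = _mm256_mul_ps(x, cephes_LOG2EF);
fx    = _mm256_add_ps(fx, _mm256_set1_ps(0.5f));
tmp   = _mm256_floor_ps(fx);
mask  = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);   /* this compare and the next two */
mask  = _mm256_and_ps(mask, one);             /* lines appear to be redundant  */
fx    = _mm256_sub_ps(tmp, mask);

/* simplified: round to nearest directly */
fx    = _mm256_mul_ps(x, cephes_LOG2EF);
fx    = _mm256_round_ps(fx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);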
#include <stdio.h>
#include <immintrin.h>
#include <math.h>

/*    gcc -O3 -m64 -Wall -mavx2 -march=broadwell  expc.c -lm    */

__m256 exp256_ps(__m256 x) {
/* Modified code from this source: https://github.com/reyoung/avx_mathfun

   AVX implementation of exp
   Based on "sse_mathfun.h", by Julien Pommier
   http://gruntthepeon.free.fr/ssemath/

   Copyright (C) 2012 Giovanni Garberoglio
   Interdisciplinary Laboratory for Computational Science (LISC)
   Fondazione Bruno Kessler and University of Trento
   via Sommarive, 18
   I-38123 Trento (Italy)

  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  (this is the zlib license)
*/

/*
  To increase the compatibility across different compilers the original code is
  converted to plain AVX2 intrinsics code without ingenious macro's,
  gcc style alignment attributes etc.
  Moreover, the part "express exp(x) as exp(g + n*log(2))" has been significantly
  simplified. This modified code is not thoroughly tested!
*/

__m256   exp_hi        = _mm256_set1_ps(88.3762626647949f);
__m256   exp_lo        = _mm256_set1_ps(-88.3762626647949f);

__m256   cephes_LOG2EF = _mm256_set1_ps(1.44269504088896341f);
__m256   inv_LOG2EF    = _mm256_set1_ps(0.693147180559945f);

__m256   cephes_exp_p0 = _mm256_set1_ps(1.9875691500E-4);
__m256   cephes_exp_p1 = _mm256_set1_ps(1.3981999507E-3);
__m256   cephes_exp_p2 = _mm256_set1_ps(8.3334519073E-3);
__m256   cephes_exp_p3 = _mm256_set1_ps(4.1665795894E-2);
__m256   cephes_exp_p4 = _mm256_set1_ps(1.6666665459E-1);
__m256   cephes_exp_p5 = _mm256_set1_ps(5.0000001201E-1);
__m256   fx;
__m256i  imm0;
__m256   one           = _mm256_set1_ps(1.0f);

        x     = _mm256_min_ps(x, exp_hi);
        x     = _mm256_max_ps(x, exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
        fx    = _mm256_mul_ps(x, cephes_LOG2EF);
        fx    = _mm256_round_ps(fx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
__m256  z     = _mm256_mul_ps(fx, inv_LOG2EF);
        x     = _mm256_sub_ps(x, z);
        z     = _mm256_mul_ps(x, x);

__m256  y     = cephes_exp_p0;
        y     = _mm256_mul_ps(y, x);
        y     = _mm256_add_ps(y, cephes_exp_p1);
        y     = _mm256_mul_ps(y, x);
        y     = _mm256_add_ps(y, cephes_exp_p2);
        y     = _mm256_mul_ps(y, x);
        y     = _mm256_add_ps(y, cephes_exp_p3);
        y     = _mm256_mul_ps(y, x);
        y     = _mm256_add_ps(y, cephes_exp_p4);
        y     = _mm256_mul_ps(y, x);
        y     = _mm256_add_ps(y, cephes_exp_p5);
        y     = _mm256_mul_ps(y, z);
        y     = _mm256_add_ps(y, x);
        y     = _mm256_add_ps(y, one);

  /* build 2^n */
        imm0  = _mm256_cvttps_epi32(fx);
        imm0  = _mm256_add_epi32(imm0, _mm256_set1_epi32(0x7f));
        imm0  = _mm256_slli_epi32(imm0, 23);
__m256  pow2n = _mm256_castsi256_ps(imm0);
        y     = _mm256_mul_ps(y, pow2n);
        return y;
}

int main(){
  int i;
  float xv[8];
  float yv[8];
  __m256 x = _mm256_setr_ps(11.0f, -12.0f, 13.0f, -14.0f, 15.0f, -16.0f, 17.0f, -18.0f);
  __m256 y = exp256_ps(x);

  _mm256_storeu_ps(xv, x);   /* storeu: xv and yv are not guaranteed to be 32-byte aligned */
  _mm256_storeu_ps(yv, y);

  /* compare exp256_ps with the double precision exp from math.h,
     print the relative error */
  printf("i      x                 y = exp256_ps(x)      double precision exp      relative error\n\n");
  for (i = 0; i < 8; i++){
    printf("i = %i x =%16.9e y =%16.9e exp_dbl =%16.9e rel_err =%16.9e\n",
           i, xv[i], yv[i], exp((double)(xv[i])),
           ((double)(yv[i]) - exp((double)(xv[i]))) / exp((double)(xv[i])));
  }
  return 0;
}
The following table gives an idea of the accuracy at certain points, comparing exp256_ps with the double precision exp from math.h. The relative error is in the last column.
i      x                 y = exp256_ps(x)      double precision exp      relative error

i = 0 x = 1.000000000e+00 y = 2.718281746e+00 exp_dbl = 2.718281828e+00 rel_err =-3.036785947e-08
i = 1 x =-2.000000000e+00 y = 1.353352815e-01 exp_dbl = 1.353352832e-01 rel_err =-1.289636419e-08
i = 2 x = 3.000000000e+00 y = 2.008553696e+01 exp_dbl = 2.008553692e+01 rel_err = 1.672817689e-09
i = 3 x =-4.000000000e+00 y = 1.831563935e-02 exp_dbl = 1.831563889e-02 rel_err = 2.501162103e-08
i = 4 x = 5.000000000e+00 y = 1.484131622e+02 exp_dbl = 1.484131591e+02 rel_err = 2.108215155e-08
i = 5 x =-6.000000000e+00 y = 2.478752285e-03 exp_dbl = 2.478752177e-03 rel_err = 4.380257261e-08
i = 6 x = 7.000000000e+00 y = 1.096633179e+03 exp_dbl = 1.096633158e+03 rel_err = 1.849522682e-08
i = 7 x =-8.000000000e+00 y = 3.354626242e-04 exp_dbl = 3.354626279e-04 rel_err =-1.101575118e-08
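Finally, coming back to the FMA remark at the top: if the compiler does not contract the separate multiplies and adds on its own, the polynomial evaluation can also be written with explicit FMA intrinsics. A sketch of that variant of the Horner chain, intended as a drop-in replacement for the polynomial section of exp256_ps above (requires FMA3 support, e.g. -mfma or -march=broadwell; not benchmarked or fully tested):

/* Horner evaluation with explicit FMAs: _mm256_fmadd_ps(a, b, c) computes a*b + c */
__m256  y = cephes_exp_p0;
        y = _mm256_fmadd_ps(y, x, cephes_exp_p1);
        y = _mm256_fmadd_ps(y, x, cephes_exp_p2);
        y = _mm256_fmadd_ps(y, x, cephes_exp_p3);
        y = _mm256_fmadd_ps(y, x, cephes_exp_p4);
        y = _mm256_fmadd_ps(y, x, cephes_exp_p5);
        y = _mm256_fmadd_ps(y, z, x);     /* y = y*z + x */
        y = _mm256_add_ps(y, one);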