#pragma once
/**
	@brief fast math library for float
	@author herumi
	@url https://github.com/herumi/fmath/
	@note modified new BSD license
	http://opensource.org/licenses/BSD-3-Clause

	cl /Ox /Ob2 /arch:SSE2 /fp:fast bench.cpp -I../xbyak /EHsc /DNOMINMAX
	g++ -O3 -fomit-frame-pointer -fno-operator-names -march=core2 -mssse3 -mfpmath=sse -ffast-math -fexcess-precision=fast
*/
/*
	function prototype list

	float fmath::exp(float);
	double fmath::expd(double);
	float fmath::log(float);

	__m128 fmath::exp_ps(__m128);
	__m256 fmath::exp_ps256(__m256);
	__m128 fmath::log_ps(__m128);

	void fmath::expd_v(double *, size_t n);

	if FMATH_USE_XBYAK is defined then the Xbyak versions are used
*/
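/*
	usage sketch (illustrative only, not part of the original API notes;
	the calls follow the prototype list above):

		#include "fmath.hpp"

		float  ef = fmath::exp(1.0f);                  // fast approximation of std::exp
		double ed = fmath::expd(1.0);                  // double-precision variant
		float  lf = fmath::log(2.0f);                  // fast approximation of std::log
		__m128 v4 = fmath::exp_ps(_mm_set1_ps(0.5f));  // 4 floats at once (SSE2)

	the __m256 variants additionally require AVX2 at compile time (__AVX2__)
*/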
//#define FMATH_USE_XBYAK

#include <math.h>
#include <stddef.h>
#include <stdlib.h>
#include <float.h>
#include <assert.h>
#include <limits>
#include <string.h> // for memcpy
#if defined(_WIN32) && !defined(__GNUC__)
#include <intrin.h>
#ifndef MIE_ALIGN
	#define MIE_ALIGN(x) __declspec(align(x))
#endif
#else
#ifndef __GNUC_PREREQ
	#define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
#endif
#if __GNUC_PREREQ(4, 4) || (__clang__ > 0 && __clang_major__ >= 3) || !defined(__GNUC__)
	/* GCC >= 4.4 or clang or non-GCC compilers */
	#include <x86intrin.h>
#elif __GNUC_PREREQ(4, 1)
	/* GCC 4.1, 4.2, and 4.3 do not have x86intrin.h, directly include SSE2 header */
	#include <emmintrin.h>
#endif
#ifndef MIE_ALIGN
	#define MIE_ALIGN(x) __attribute__((aligned(x)))
#endif
#endif
#ifndef MIE_PACK
	#define MIE_PACK(x, y, z, w) ((x) * 64 + (y) * 16 + (z) * 4 + (w))
#endif
#ifdef FMATH_USE_XBYAK
	#define XBYAK_NO_OP_NAMES
	#include "xbyak/xbyak.h"
	#include "xbyak/xbyak_util.h"
#endif

namespace fmath {

namespace local {

const size_t EXP_TABLE_SIZE = 10;
const size_t EXPD_TABLE_SIZE = 11;
const size_t LOG_TABLE_SIZE = 12;

typedef unsigned long long uint64_t;

union fi {
	float f;
	unsigned int i;
};

union di {
	double d;
	uint64_t i;
};

inline unsigned int mask(int x)
{
	return (1U << x) - 1;
}

inline uint64_t mask64(int x)
{
	return (1ULL << x) - 1;
}

template<class T>
inline const T* cast_to(const void *p)
{
	return reinterpret_cast<const T*>(p);
}

template<class T, size_t N>
size_t NumOfArray(const T (&)[N]) { return N; }

/*
	exp(88.722839f) = inf ; 0x42b17218
	exp(-87.33655f) = 1.175491e-038f(007fffe6) denormal ; 0xc2aeac50
	exp(-103.972081f) = 0 ; 0xc2cff1b5
*/
template<size_t N = EXP_TABLE_SIZE>
struct ExpVar {
	enum {
		s = N,
		n = 1 << s,
		f88 = 0x42b00000 /* 88.0 */
	};
	float minX[8];
	float maxX[8];
	float a[8];
	float b[8];
	float f1[8];
	unsigned int i127s[8];
	unsigned int mask_s[8];
	unsigned int i7fffffff[8];
	unsigned int tbl[n];
	ExpVar()
	{
		float log_2 = ::logf(2.0f);
		for (int i = 0; i < 8; i++) {
			maxX[i] = 88;
			minX[i] = -88;
			a[i] = n / log_2;
			b[i] = log_2 / n;
			f1[i] = 1.0f;
			i127s[i] = 127 << s;
			i7fffffff[i] = 0x7fffffff;
			mask_s[i] = mask(s);
		}
		for (int i = 0; i < n; i++) {
			float y = pow(2.0f, (float)i / n);
			fi fi;
			fi.f = y;
			tbl[i] = fi.i & mask(23);
		}
	}
};

template<size_t sbit_ = EXPD_TABLE_SIZE>
struct ExpdVar {
	enum {
		sbit = sbit_,
		s = 1UL << sbit,
		adj = (1UL << (sbit + 10)) - (1UL << sbit)
	};
	// A = 1, B = 1, C = 1/2, D = 1/6
	double C1[2]; // A
	double C2[2]; // D
	double C3[2]; // C/D
	uint64_t tbl[s];
	double a;
	double ra;
	ExpdVar()
		: a(s / ::log(2.0))
		, ra(1 / a)
	{
		for (int i = 0; i < 2; i++) {
#if 0
			C1[i] = 1.0;
			C2[i] = 0.16667794882310216;
			C3[i] = 2.9997969303278795;
#else
			C1[i] = 1.0;
			C2[i] = 0.16666666685227835064;
			C3[i] = 3.0000000027955394;
#endif
		}
		for (int i = 0; i < s; i++) {
			di di;
			di.d = ::pow(2.0, i * (1.0 / s));
			tbl[i] = di.i & mask64(52);
		}
	}
};

template<size_t N = LOG_TABLE_SIZE>
struct LogVar {
	enum {
		LEN = N - 1
	};
	unsigned int m1[4]; // 0
	unsigned int m2[4]; // 16
	unsigned int m3[4]; // 32
	float m4[4]; // 48
	unsigned int m5[4]; // 64
	struct {
		float app;
		float rev;
	} tbl[1 << LEN];
	float c_log2;
	LogVar()
		: c_log2(::logf(2.0f) / (1 << 23))
	{
		const double e = 1 / double(1 << 24);
		const double h = 1 / double(1 << LEN);
		const size_t n = 1U << LEN;
		for (size_t i = 0; i < n; i++) {
			double x = 1 + double(i) / n;
			double a = ::log(x);
			tbl[i].app = (float)a;
			if (i < n - 1) {
				double b = ::log(x + h - e);
				tbl[i].rev = (float)((b - a) / ((h - e) * (1 << 23)));
			} else {
				tbl[i].rev = (float)(1 / (x * (1 << 23)));
			}
		}
		for (int i = 0; i < 4; i++) {
			m1[i] = mask(8) << 23;
			m2[i] = mask(LEN) << (23 - LEN);
			m3[i] = mask(23 - LEN);
			m4[i] = c_log2;
			m5[i] = 127U << 23;
		}
	}
};

#ifdef FMATH_USE_XBYAK
struct ExpCode : public Xbyak::CodeGenerator {
	float (*exp_)(float);
	__m128 (*exp_ps_)(__m128);
	template<size_t N>
	ExpCode(const ExpVar<N> *self)
	{
		Xbyak::util::Cpu cpu;
		try {
			makeExp(self, cpu);
			exp_ = getCode<float (*)(float)>();
			align(16);
			exp_ps_ = getCurr<__m128 (*)(__m128)>();
			makeExpPs(self, cpu);
			return;
		} catch (std::exception& e) {
			fprintf(stderr, "ExpCode ERR:%s\n", e.what());
		} catch (...) {
			fprintf(stderr, "ExpCode ERR:unknown error\n");
		}
		::exit(1);
	}
	template<size_t N>
	void makeExp(const ExpVar<N> *self, const Xbyak::util::Cpu& /*cpu*/)
	{
		typedef ExpVar<N> Self;
		using namespace local;
		using namespace Xbyak;
		inLocalLabel();
#ifdef XBYAK64
		const Reg64& base = rcx;
		const Reg64& a = rax;
#else
		const Reg32& base = ecx;
		const Reg32& a = eax;
#endif
		mov(base, (size_t)self);

#ifdef XBYAK32
		movss(xm0, ptr [esp + 4]);
#endif
	L(".retry");
		movaps(xm1, xm0);
		movd(edx, xm0);
		mulss(xm1, ptr [base + offsetof(Self, a)]); // t
		and_(edx, 0x7fffffff);
		cvtss2si(eax, xm1);
		cmp(edx, ExpVar<N>::f88);
		jg(".overflow");
		lea(edx, ptr [eax + (127 << self->s)]);
		cvtsi2ss(xm1, eax);
		and_(eax, mask(self->s)); // v
		mov(eax, ptr [base + a * 4 + offsetof(Self, tbl)]); // expVar.tbl[v]
		shr(edx, self->s);
		mulss(xm1, ptr [base + offsetof(Self, b)]);
		shl(edx, 23); // u
		subss(xm0, xm1); // t
		or_(eax, edx); // fi.f
		addss(xm0, ptr [base + offsetof(Self, f1)]);
		movd(xm1, eax);
		mulss(xm0, xm1);
#ifdef XBYAK32
		movss(ptr[esp + 4], xm0);
		fld(dword[esp + 4]);
#endif
		ret();
	L(".overflow");
		minss(xm0, ptr [base + offsetof(Self, maxX)]);
		maxss(xm0, ptr [base + offsetof(Self, minX)]);
		jmp(".retry");
		outLocalLabel();
	}
	template<size_t N>
	void makeExpPs(const ExpVar<N> *self, const Xbyak::util::Cpu& cpu)
	{
		typedef ExpVar<N> Self;
		using namespace local;
		using namespace Xbyak;
		inLocalLabel();
#ifdef XBYAK64
		const Reg64& base = rcx;
		const Reg64& a = rax;
		const Reg64& d = rdx;
#else
		const Reg32& base = ecx;
		const Reg32& a = eax;
		const Reg32& d = edx;
#endif

		/*
			if abs(x) >= maxX then x = max(min(x, maxX), -maxX) and try
			minps, maxps are very slow then avoid them
		*/
		const bool useSSE41 = cpu.has(Xbyak::util::Cpu::tSSE41);
#if defined(XBYAK64_WIN) && !defined(__INTEL_COMPILER)
		movaps(xm0, ptr [rcx]);
#endif
		mov(base, (size_t)self);
	L(".retry");
		movaps(xm5, xm0);
		andps(xm5, ptr [base + offsetof(Self, i7fffffff)]);
		movaps(xm3, ptr [base + offsetof(Self, a)]);
		movaps(xm4, ptr [base + offsetof(Self, b)]);
		pcmpgtd(xm5, ptr [base + offsetof(Self, maxX)]);
		mulps(xm3, xm0);
		movaps(xm1, ptr [base + offsetof(Self, i127s)]);
		pmovmskb(eax, xm5);
		movaps(xm5, ptr [base + offsetof(Self, mask_s)]);
		cvtps2dq(xm2, xm3);
		pand(xm5, xm2);
		cvtdq2ps(xm3, xm2);
		test(eax, eax);
		jnz(".overflow");
		paddd(xm1, xm2);
		movd(eax, xm5);
		mulps(xm4, xm3);
		pextrw(edx, xm5, 2);
		subps(xm0, xm4);
		movd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)]);
		addps(xm0, ptr [base + offsetof(Self, f1)]);
		pextrw(eax, xm5, 4);
		if (useSSE41) {
			pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 1);
		} else {
			movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]);
			movlhps(xm4, xm3);
		}
		pextrw(edx, xm5, 6);
		psrld(xm1, self->s);
		pslld(xm1, 23);
		if (useSSE41) {
			pinsrd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)], 2);
			pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 3);
		} else {
			movd(xm2, ptr [base + a * 4 + offsetof(Self, tbl)]);
			movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]);
			movlhps(xm2, xm3);
			shufps(xm4, xm2, MIE_PACK(2, 0, 2, 0));
		}
		por(xm1, xm4);
		mulps(xm0, xm1);
		ret();
	L(".overflow");
		minps(xm0, ptr [base + offsetof(Self, maxX)]);
		maxps(xm0, ptr [base + offsetof(Self, minX)]);
		jmp(".retry");
		outLocalLabel();
	}
};
#endif

/* to define static variables in fmath.hpp */
template<size_t EXP_N = EXP_TABLE_SIZE, size_t LOG_N = LOG_TABLE_SIZE, size_t EXPD_N = EXPD_TABLE_SIZE>
struct C {
	static const ExpVar<EXP_N> expVar;
	static const LogVar<LOG_N> logVar;
	static const ExpdVar<EXPD_N> expdVar;
#ifdef FMATH_USE_XBYAK
	static const ExpCode& getInstance() {
		static const ExpCode expCode(&expVar);
		return expCode;
	}
#endif
};

template<size_t EXP_N, size_t LOG_N, size_t EXPD_N>
MIE_ALIGN(32) const ExpVar<EXP_N> C<EXP_N, LOG_N, EXPD_N>::expVar;
template<size_t EXP_N, size_t LOG_N, size_t EXPD_N>
MIE_ALIGN(32) const LogVar<LOG_N> C<EXP_N, LOG_N, EXPD_N>::logVar;
template<size_t EXP_N, size_t LOG_N, size_t EXPD_N>
MIE_ALIGN(32) const ExpdVar<EXPD_N> C<EXP_N, LOG_N, EXPD_N>::expdVar;

} // fmath::local
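/*
	note on the scalar/SSE exp scheme (explanatory comment, not in the original):
	with a = n/log(2) and b = log(2)/n, r = round(x*a) splits x as x = r*b + t,
	so exp(x) = 2^(r/n) * exp(t) = 2^u * 2^(v/n) * exp(t) with u = r>>s, v = r&(n-1).
	2^(v/n) comes from the mantissa table tbl[v], 2^u is built in the exponent bits,
	and exp(t) is approximated by 1 + t (|t| <= log(2)/(2n), so the error is tiny).
*/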
#ifdef FMATH_USE_XBYAK
inline float expC(float x)
#else
inline float exp(float x)
#endif
{
	using namespace local;
	const ExpVar<>& expVar = C<>::expVar;

#if 1
	__m128 x1 = _mm_set_ss(x);

	int limit = _mm_cvtss_si32(x1) & 0x7fffffff;
	if (limit > ExpVar<>::f88) {
		x1 = _mm_min_ss(x1, _mm_load_ss(expVar.maxX));
		x1 = _mm_max_ss(x1, _mm_load_ss(expVar.minX));
	}

	int r = _mm_cvtss_si32(_mm_mul_ss(x1, _mm_load_ss(expVar.a)));
	unsigned int v = r & mask(expVar.s);
	float t = _mm_cvtss_f32(x1) - r * expVar.b[0];
	int u = r >> expVar.s;
	fi fi;
	fi.i = ((u + 127) << 23) | expVar.tbl[v];
	return (1 + t) * fi.f;
#else
	x = std::min(x, expVar.maxX[0]);
	x = std::max(x, expVar.minX[0]);
	float t = x * expVar.a[0];
	const float magic = (1 << 23) + (1 << 22); // to round
	t += magic;
	fi fi;
	fi.f = t;
	t = x - (t - magic) * expVar.b[0];
	int u = ((fi.i + (127 << expVar.s)) >> expVar.s) << 23;
	unsigned int v = fi.i & mask(expVar.s);
	fi.i = u | expVar.tbl[v];
	return (1 + t) * fi.f;
//	return (1 + t) * pow(2, (float)u) * pow(2, (float)v / n);
#endif
}
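/*
	note on expd (explanatory comment, not in the original): adding b = 3*2^51 to
	x*a pushes round(x*a) into the low mantissa bits of the double d (the usual
	round-to-nearest trick), so the table index and the exponent bits can be read
	from the bit pattern of d. t = (d - b)*ra - x is the (negated) residual and
	(C3 - t)*t*t*C2 - t + C1 evaluates the cubic 1 - t + t^2/2 - t^3/6 ~ exp(-t).
*/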
inline double expd(double x)
{
	if (x <= -708.39641853226408) return 0;
	if (x >= 709.78271289338397) return std::numeric_limits<double>::infinity();
	using namespace local;
	const ExpdVar<>& c = C<>::expdVar;
#if 1
	const double _b = double(uint64_t(3) << 51);
	__m128d b = _mm_load_sd(&_b);
	__m128d xx = _mm_load_sd(&x);
	__m128d d = _mm_add_sd(_mm_mul_sd(xx, _mm_load_sd(&c.a)), b);
	uint64_t di = _mm_cvtsi128_si32(_mm_castpd_si128(d));
	uint64_t iax = c.tbl[di & mask(c.sbit)];
	__m128d _t = _mm_sub_sd(_mm_mul_sd(_mm_sub_sd(d, b), _mm_load_sd(&c.ra)), xx);
	uint64_t u = ((di + c.adj) >> c.sbit) << 52;
	double t;
	_mm_store_sd(&t, _t);
	double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];

	double did;
	u |= iax;
	memcpy(&did, &u, sizeof(did));
	return y * did;
#else
	/*
		remark : -ffast-math option of gcc may generate bad code for fmath::expd
	*/
	const uint64_t b = 3ULL << 51;
	di di;
	di.d = x * c.a + b;
	uint64_t iax = c.tbl[di.i & mask(c.sbit)];
	double t = (di.d - b) * c.ra - x;
	uint64_t u = ((di.i + c.adj) >> c.sbit) << 52;
	double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];
	di.i = u | iax;
	return y * di.d;
#endif
}

inline __m128d exp_pd(__m128d x)
{
#if 0 // faster on Haswell
	MIE_ALIGN(16) double buf[2];
	memcpy(buf, &x, sizeof(buf));
	buf[0] = expd(buf[0]);
	buf[1] = expd(buf[1]);
	__m128d y;
	memcpy(&y, buf, sizeof(buf));
	return y;
#else // faster on Skylake
	using namespace local;
	const ExpdVar<>& c = C<>::expdVar;
	const double b = double(3ULL << 51);
	const __m128d mC1 = *cast_to<__m128d>(c.C1);
	const __m128d mC2 = *cast_to<__m128d>(c.C2);
	const __m128d mC3 = *cast_to<__m128d>(c.C3);
	const __m128d ma = _mm_set1_pd(c.a);
	const __m128d mra = _mm_set1_pd(c.ra);
	const __m128i madj = _mm_set1_epi32(c.adj);
	MIE_ALIGN(16) const double expMax[2] = { 709.78271289338397, 709.78271289338397 };
	MIE_ALIGN(16) const double expMin[2] = { -708.39641853226408, -708.39641853226408 };
	x = _mm_min_pd(x, *(const __m128d*)expMax);
	x = _mm_max_pd(x, *(const __m128d*)expMin);

	__m128d d = _mm_mul_pd(x, ma);
	d = _mm_add_pd(d, _mm_set1_pd(b));
	int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
	int adr1 = _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit);
	__m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr0]));
	__m128i iax = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr1]));
	iax = _mm_unpacklo_epi64(iaxL, iax);
	__m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
	__m128i u = _mm_castpd_si128(d);
	u = _mm_add_epi64(u, madj);
	u = _mm_srli_epi64(u, c.sbit);
	u = _mm_slli_epi64(u, 52);
	u = _mm_or_si128(u, iax);
	__m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
	y = _mm_mul_pd(y, mC2);
	y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
	y = _mm_mul_pd(y, _mm_castsi128_pd(u));
	return y;
#endif
}
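/*
	usage sketch for expd_v (illustrative only): the vector loops below use
	aligned loads/stores (_mm_load_pd / _mm256_load_pd), so the array should be
	16-byte (SSE2) or 32-byte (AVX2) aligned, e.g.

		MIE_ALIGN(32) double a[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
		fmath::expd_v(a, 8); // a[i] <- exp(a[i])
*/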
/*
	px : pointer to array of double
	n : size of array
*/
inline void expd_v(double *px, size_t n)
{
	using namespace local;
	const ExpdVar<>& c = C<>::expdVar;
	const double b = double(3ULL << 51);
#ifdef __AVX2__
	size_t r = n & 3;
	n &= ~3;
	const __m256d mC1 = _mm256_set1_pd(c.C1[0]);
	const __m256d mC2 = _mm256_set1_pd(c.C2[0]);
	const __m256d mC3 = _mm256_set1_pd(c.C3[0]);
	const __m256d ma = _mm256_set1_pd(c.a);
	const __m256d mra = _mm256_set1_pd(c.ra);
	const __m256i madj = _mm256_set1_epi64x(c.adj);
	const __m256i maskSbit = _mm256_set1_epi64x(mask(c.sbit));
	const __m256d expMax = _mm256_set1_pd(709.78272569338397);
	const __m256d expMin = _mm256_set1_pd(-708.39641853226408);
	for (size_t i = 0; i < n; i += 4) {
		__m256d x = _mm256_load_pd(px);
		x = _mm256_min_pd(x, expMax);
		x = _mm256_max_pd(x, expMin);

		__m256d d = _mm256_mul_pd(x, ma);
		d = _mm256_add_pd(d, _mm256_set1_pd(b));
		__m256i adr = _mm256_and_si256(_mm256_castpd_si256(d), maskSbit);
		__m256i iax = _mm256_i64gather_epi64((const long long*)c.tbl, adr, 8);
		__m256d t = _mm256_sub_pd(_mm256_mul_pd(_mm256_sub_pd(d, _mm256_set1_pd(b)), mra), x);
		__m256i u = _mm256_castpd_si256(d);
		u = _mm256_add_epi64(u, madj);
		u = _mm256_srli_epi64(u, c.sbit);
		u = _mm256_slli_epi64(u, 52);
		u = _mm256_or_si256(u, iax);
		__m256d y = _mm256_mul_pd(_mm256_sub_pd(mC3, t), _mm256_mul_pd(t, t));
		y = _mm256_mul_pd(y, mC2);
		y = _mm256_add_pd(_mm256_sub_pd(y, t), mC1);
		_mm256_store_pd(px, _mm256_mul_pd(y, _mm256_castsi256_pd(u)));
		px += 4;
	}
#else
	size_t r = n & 1;
	n &= ~1;
	const __m128d mC1 = _mm_set1_pd(c.C1[0]);
	const __m128d mC2 = _mm_set1_pd(c.C2[0]);
	const __m128d mC3 = _mm_set1_pd(c.C3[0]);
	const __m128d ma = _mm_set1_pd(c.a);
	const __m128d mra = _mm_set1_pd(c.ra);
#if defined(__x86_64__) || defined(_WIN64)
	const __m128i madj = _mm_set1_epi64x(c.adj);
#else
	const __m128i madj = _mm_set_epi32(0, c.adj, 0, c.adj);
#endif
	const __m128d expMax = _mm_set1_pd(709.78272569338397);
	const __m128d expMin = _mm_set1_pd(-708.39641853226408);
	for (size_t i = 0; i < n; i += 2) {
		__m128d x = _mm_load_pd(px);
		x = _mm_min_pd(x, expMax);
		x = _mm_max_pd(x, expMin);

		__m128d d = _mm_mul_pd(x, ma);
		d = _mm_add_pd(d, _mm_set1_pd(b));
		int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
		int adr1 = _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit);
		__m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr0]));
		__m128i iax = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr1]));
		iax = _mm_unpacklo_epi64(iaxL, iax);
		__m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
		__m128i u = _mm_castpd_si128(d);
		u = _mm_add_epi64(u, madj);
		u = _mm_srli_epi64(u, c.sbit);
		u = _mm_slli_epi64(u, 52);
		u = _mm_or_si128(u, iax);
		__m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
		y = _mm_mul_pd(y, mC2);
		y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
		_mm_store_pd(px, _mm_mul_pd(y, _mm_castsi128_pd(u)));
		px += 2;
	}
#endif
	for (size_t i = 0; i < r; i++) {
		px[i] = expd(px[i]);
	}
}
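/*
	exp_ps / exp_ps256 (explanatory comment, not in the original): the same
	table + (1 + t) scheme as the scalar exp above, applied to 4 packed floats
	(SSE2) or 8 packed floats (AVX2); out-of-range lanes are clamped to
	[minX, maxX] before the table lookup.
*/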
#ifdef FMATH_USE_XBYAK
inline __m128 exp_psC(__m128 x)
#else
inline __m128 exp_ps(__m128 x)
#endif
{
	using namespace local;
	const ExpVar<>& expVar = C<>::expVar;

	__m128i limit = _mm_castps_si128(_mm_and_ps(x, *cast_to<__m128>(expVar.i7fffffff)));
	int over = _mm_movemask_epi8(_mm_cmpgt_epi32(limit, *cast_to<__m128i>(expVar.maxX)));
	if (over) {
		x = _mm_min_ps(x, _mm_load_ps(expVar.maxX));
		x = _mm_max_ps(x, _mm_load_ps(expVar.minX));
	}

	__m128i r = _mm_cvtps_epi32(_mm_mul_ps(x, *cast_to<__m128>(expVar.a)));
	__m128 t = _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(r), *cast_to<__m128>(expVar.b)));
	t = _mm_add_ps(t, *cast_to<__m128>(expVar.f1));

	__m128i v4 = _mm_and_si128(r, *cast_to<__m128i>(expVar.mask_s));
	__m128i u4 = _mm_add_epi32(r, *cast_to<__m128i>(expVar.i127s));
	u4 = _mm_srli_epi32(u4, expVar.s);
	u4 = _mm_slli_epi32(u4, 23);

#ifdef __AVX2__ // fast?
	__m128i ti = _mm_i32gather_epi32((const int*)expVar.tbl, v4, 4);
	__m128 t0 = _mm_castsi128_ps(ti);
#else
	unsigned int v0, v1, v2, v3;
	v0 = _mm_cvtsi128_si32(v4);
	v1 = _mm_extract_epi16(v4, 2);
	v2 = _mm_extract_epi16(v4, 4);
	v3 = _mm_extract_epi16(v4, 6);
#if 1
	__m128 t0, t1, t2, t3;
	t0 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v0]));
	t1 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v1]));
	t2 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v2]));
	t3 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v3]));

	t1 = _mm_movelh_ps(t1, t3);
	t1 = _mm_castsi128_ps(_mm_slli_epi64(_mm_castps_si128(t1), 32));
	t0 = _mm_movelh_ps(t0, t2);
	t0 = _mm_castsi128_ps(_mm_srli_epi64(_mm_castps_si128(t0), 32));
	t0 = _mm_or_ps(t0, t1);
#else
	__m128i ti = _mm_castps_si128(_mm_load_ss((const float*)&expVar.tbl[v0]));
	ti = _mm_insert_epi32(ti, expVar.tbl[v1], 1);
	ti = _mm_insert_epi32(ti, expVar.tbl[v2], 2);
	ti = _mm_insert_epi32(ti, expVar.tbl[v3], 3);
	__m128 t0 = _mm_castsi128_ps(ti);
#endif
#endif
	t0 = _mm_or_ps(t0, _mm_castsi128_ps(u4));

	t = _mm_mul_ps(t, t0);

	return t;
}

#ifdef __AVX2__
inline __m256 exp_ps256(__m256 x)
{
	using namespace local;
	const ExpVar<>& expVar = C<>::expVar;

	__m256i limit = _mm256_castps_si256(_mm256_and_ps(x, *reinterpret_cast<const __m256*>(expVar.i7fffffff)));
	int over = _mm256_movemask_epi8(_mm256_cmpgt_epi32(limit, *reinterpret_cast<const __m256i*>(expVar.maxX)));
	if (over) {
		x = _mm256_min_ps(x, _mm256_load_ps(expVar.maxX));
		x = _mm256_max_ps(x, _mm256_load_ps(expVar.minX));
	}

	__m256i r = _mm256_cvtps_epi32(_mm256_mul_ps(x, *reinterpret_cast<const __m256*>(expVar.a)));
	__m256 t = _mm256_sub_ps(x, _mm256_mul_ps(_mm256_cvtepi32_ps(r), *reinterpret_cast<const __m256*>(expVar.b)));
	t = _mm256_add_ps(t, *reinterpret_cast<const __m256*>(expVar.f1));

	__m256i v8 = _mm256_and_si256(r, *reinterpret_cast<const __m256i*>(expVar.mask_s));
	__m256i u8 = _mm256_add_epi32(r, *reinterpret_cast<const __m256i*>(expVar.i127s));
	u8 = _mm256_srli_epi32(u8, expVar.s);
	u8 = _mm256_slli_epi32(u8, 23);

#if 1
	__m256i ti = _mm256_i32gather_epi32((const int*)expVar.tbl, v8, 4);
#else
	unsigned int v0, v1, v2, v3, v4, v5, v6, v7;
	v0 = _mm256_extract_epi16(v8, 0);
	v1 = _mm256_extract_epi16(v8, 2);
	v2 = _mm256_extract_epi16(v8, 4);
	v3 = _mm256_extract_epi16(v8, 6);
	v4 = _mm256_extract_epi16(v8, 8);
	v5 = _mm256_extract_epi16(v8, 10);
	v6 = _mm256_extract_epi16(v8, 12);
	v7 = _mm256_extract_epi16(v8, 14);
	__m256i ti = _mm256_setzero_si256();
	ti = _mm256_insert_epi32(ti, expVar.tbl[v0], 0);
	ti = _mm256_insert_epi32(ti, expVar.tbl[v1], 1);
	ti = _mm256_insert_epi32(ti, expVar.tbl[v2], 2);
	ti = _mm256_insert_epi32(ti, expVar.tbl[v3], 3);
	ti = _mm256_insert_epi32(ti, expVar.tbl[v4], 4);
	ti = _mm256_insert_epi32(ti, expVar.tbl[v5], 5);
	ti = _mm256_insert_epi32(ti, expVar.tbl[v6], 6);
	ti = _mm256_insert_epi32(ti, expVar.tbl[v7], 7);
#endif
	__m256 t0 = _mm256_castsi256_ps(ti);
	t0 = _mm256_or_ps(t0, _mm256_castsi256_ps(u8));

	t = _mm256_mul_ps(t, t0);

	return t;
}
#endif

inline float log(float x)
{
	using namespace local;
	const LogVar<>& logVar = C<>::logVar;
	const size_t logLen = logVar.LEN;

	fi fi;
	fi.f = x;
	int a = fi.i & (mask(8) << 23);
	unsigned int b1 = fi.i & (mask(logLen) << (23 - logLen));
	unsigned int b2 = fi.i & mask(23 - logLen);
	int idx = b1 >> (23 - logLen);
	float f = float(a - (127 << 23)) * logVar.c_log2 + logVar.tbl[idx].app + float(b2) * logVar.tbl[idx].rev;
	return f;
}

inline __m128 log_ps(__m128 x)
{
	using namespace local;
	const LogVar<>& logVar = C<>::logVar;

	__m128i xi = _mm_castps_si128(x);
	__m128i idx = _mm_srli_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m2)), (23 - logVar.LEN));
	__m128 a = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m1)), *cast_to<__m128i>(logVar.m5)));
	__m128 b2 = _mm_cvtepi32_ps(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m3)));

	a = _mm_mul_ps(a, *cast_to<__m128>(logVar.m4)); // c_log2

	unsigned int i0 = _mm_cvtsi128_si32(idx);
#if 1
	unsigned int i1 = _mm_extract_epi16(idx, 2);
	unsigned int i2 = _mm_extract_epi16(idx, 4);
	unsigned int i3 = _mm_extract_epi16(idx, 6);
#else
	idx = _mm_srli_si128(idx, 4);
	unsigned int i1 = _mm_cvtsi128_si32(idx);

	idx = _mm_srli_si128(idx, 4);
	unsigned int i2 = _mm_cvtsi128_si32(idx);

	idx = _mm_srli_si128(idx, 4);
	unsigned int i3 = _mm_cvtsi128_si32(idx);
#endif

	__m128 app, rev;
	__m128i L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i0].app));
	__m128i H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i1].app));
	__m128 t = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
	L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i2].app));
	H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i3].app));
	rev = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
	app = _mm_shuffle_ps(t, rev, MIE_PACK(2, 0, 2, 0));
	rev = _mm_shuffle_ps(t, rev, MIE_PACK(3, 1, 3, 1));

	a = _mm_add_ps(a, app);
	rev = _mm_mul_ps(b2, rev);
	return _mm_add_ps(a, rev);
}

#ifndef __CYGWIN__ // cygwin defines log2() in global namespace!
// log2(x) = log(x) / log(2)
inline float log2(float x) { return fmath::log(x) * 1.442695f; }
#endif

/*
	for given y > 0
	get f_y(x) := pow(x, y) for x >= 0
*/
class PowGenerator {
	enum {
		N = 11
	};
	float tbl0_[256];
	struct {
		float app;
		float rev;
	} tbl1_[1 << N];
public:
	PowGenerator(float y)
	{
		for (int i = 0; i < 256; i++) {
			tbl0_[i] = ::powf(2, (i - 127) * y);
		}
		const double e = 1 / double(1 << 24);
		const double h = 1 / double(1 << N);
		const size_t n = 1U << N;
		for (size_t i = 0; i < n; i++) {
			double x = 1 + double(i) / n;
			double a = ::pow(x, (double)y);
			tbl1_[i].app = (float)a;
			double b = ::pow(x + h - e, (double)y);
			tbl1_[i].rev = (float)((b - a) / (h - e) / (1 << 23));
		}
	}
	float get(float x) const
	{
		using namespace local;
		fi fi;
		fi.f = x;
		int a = (fi.i >> 23) & mask(8);
		unsigned int b = fi.i & mask(23);
		unsigned int b1 = b & (mask(N) << (23 - N));
		unsigned int b2 = b & mask(23 - N);
		float f;
		int idx = b1 >> (23 - N);
		f = tbl0_[a] * (tbl1_[idx].app + float(b2) * tbl1_[idx].rev);
		return f;
	}
};
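/*
	usage sketch for PowGenerator (illustrative only): build the tables once for
	a fixed exponent y, then evaluate pow(x, y) cheaply for many x >= 0, e.g.

		fmath::PowGenerator pg(1.5f);
		float z = pg.get(2.0f); // ~ powf(2.0f, 1.5f)
*/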
// for Xbyak version
#ifdef FMATH_USE_XBYAK
float (*const exp)(float) = local::C<>::getInstance().exp_;
__m128 (*const exp_ps)(__m128) = local::C<>::getInstance().exp_ps_;
#endif

// exp2(x) = pow(2, x)
inline float exp2(float x) { return fmath::exp(x * 0.6931472f); }

/*
	this function may be optimized in the future
*/
inline __m128d log_pd(__m128d x)
{
	double d[2];
	memcpy(d, &x, sizeof(d));
	d[0] = ::log(d[0]);
	d[1] = ::log(d[1]);
	__m128d m;
	memcpy(&m, d, sizeof(m));
	return m;
}

inline __m128 pow_ps(__m128 x, __m128 y)
{
	return exp_ps(_mm_mul_ps(y, log_ps(x)));
}

inline __m128d pow_pd(__m128d x, __m128d y)
{
	return exp_pd(_mm_mul_pd(y, log_pd(x)));
}

} // fmath