Comments (14)
I thought that this could be easily fixed by tweaking the coefficients, but no success at this time. I will continue trying, but this could take time.
from sleef.
I tried too but couldn't get any better...
BTW I have benchmarked sleef's exp function to openlibm/musm version
and the msum version is faster than sleef by usually at around 6-11%
the sleef version is about 2x slower for small arguments since there is no explicit branch here
other cases it's usually about 0.92 slower
See:
julia> @benchmark Base.exp(0.5)
BenchmarkTools.Trial:
--------------
minimum time: 9.475 ns (0.00% GC)
median time: 10.264 ns (0.00% GC)
mean time: 11.109 ns (0.00% GC)
maximum time: 148.034 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Sleef.exp(0.5)
BenchmarkTools.Trial:
--------------
minimum time: 12.632 ns (0.00% GC)
median time: 14.211 ns (0.00% GC)
mean time: 14.394 ns (0.00% GC)
maximum time: 52.108 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Base.exp(100.0)
BenchmarkTools.Trial:
--------------
minimum time: 11.842 ns (0.00% GC)
median time: 12.238 ns (0.00% GC)
mean time: 13.429 ns (0.00% GC)
maximum time: 187.510 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Sleef.exp(100.0)
BenchmarkTools.Trial:
--------------
minimum time: 12.632 ns (0.00% GC)
median time: 13.027 ns (0.00% GC)
mean time: 14.074 ns (0.00% GC)
maximum time: 67.504 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Base.exp(10.0)
BenchmarkTools.Trial:
--------------
minimum time: 11.842 ns (0.00% GC)
median time: 12.238 ns (0.00% GC)
mean time: 13.005 ns (0.00% GC)
maximum time: 60.003 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Sleef.exp(10.0)
BenchmarkTools.Trial:
--------------
minimum time: 12.632 ns (0.00% GC)
median time: 14.211 ns (0.00% GC)
mean time: 14.620 ns (0.00% GC)
maximum time: 115.270 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Base.exp(0.001)
BenchmarkTools.Trial:
--------------
minimum time: 7.105 ns (0.00% GC)
median time: 7.501 ns (0.00% GC)
mean time: 8.228 ns (0.00% GC)
maximum time: 72.635 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Sleef.exp(0.001)
--------------
minimum time: 12.632 ns (0.00% GC)
median time: 13.027 ns (0.00% GC)
mean time: 14.288 ns (0.00% GC)
maximum time: 172.904 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
from sleef.
For computation speed, I am only checking the AVX2 version. It's two times as fast as the pure C version.
from sleef.
This is he native code I am getting for the exp function from sleef
pushq %rbp
movq %rsp, %rbp
movabsq $252195272, %rax # imm = 0xF0831C8
vucomisd (%rax), %xmm0
ja L358
vxorpd %xmm1, %xmm1, %xmm1
movabsq $252195280, %rax # imm = 0xF0831D0
vmovsd (%rax), %xmm2 # xmm2 = mem[0],zero
vucomisd %xmm0, %xmm2
ja L372
movabsq $252195288, %rax # imm = 0xF0831D8
vmulsd (%rax), %xmm0, %xmm1
vroundsd $4, %xmm1, %xmm1, %xmm1
vcvttsd2si %xmm1, %rax
vxorps %xmm1, %xmm1, %xmm1
vcvtsi2sdq %rax, %xmm1, %xmm1
movabsq $252195296, %rcx # imm = 0xF0831E0
vfmadd231sd (%rcx), %xmm1, %xmm0
movabsq $252195304, %rcx # imm = 0xF0831E8
vfmadd231sd (%rcx), %xmm1, %xmm0
movabsq $252195312, %rcx # imm = 0xF0831F0
vmovsd (%rcx), %xmm1 # xmm1 = mem[0],zero
movabsq $252195320, %rcx # imm = 0xF0831F8
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195328, %rcx # imm = 0xF083200
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195336, %rcx # imm = 0xF083208
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195344, %rcx # imm = 0xF083210
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195352, %rcx # imm = 0xF083218
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195360, %rcx # imm = 0xF083220
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195368, %rcx # imm = 0xF083228
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195376, %rcx # imm = 0xF083230
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195384, %rcx # imm = 0xF083238
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195392, %rcx # imm = 0xF083240
vfmadd213sd (%rcx), %xmm0, %xmm1
vmulsd %xmm0, %xmm0, %xmm2
vmulsd %xmm1, %xmm2, %xmm1
vaddsd %xmm1, %xmm0, %xmm0
movabsq $252195400, %rcx # imm = 0xF083248
vaddsd (%rcx), %xmm0, %xmm0
movq %rax, %rcx
shrq %rcx
subl %ecx, %eax
shlq $52, %rcx
movabsq $4607182418800017408, %rdx # imm = 0x3FF0000000000000
addq %rdx, %rcx
vmovq %rcx, %xmm1
vmulsd %xmm0, %xmm1, %xmm0
shlq $52, %rax
addq %rdx, %rax
vmovq %rax, %xmm1
vmulsd %xmm0, %xmm1, %xmm0
popq %rbp
retq
L358:
movabsq $252195264, %rax # imm = 0xF0831C0
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
L372:
vmovapd %xmm1, %xmm0
popq %rbp
retq
nopw (%rax,%rax)
note the vectorized instructions
from sleef.
here is the msum version (also note the vectorized instructions)
pushq %rbp
movq %rsp, %rbp
vmovq %xmm0, %rax
movb $63, %cl
bzhiq %rcx, %rax, %rdx
movq %rax, %r8
shrq $63, %r8
movabsq $4649454530587146736, %rcx # imm = 0x40862E42FEFA39F0
cmpq %rcx, %rdx
jb L106
movq %rdx, %rcx
shrq $52, %rcx
cmpq $2047, %rcx # imm = 0x7FF
jae L286
movabsq $252190152, %rax # imm = 0xF081DC8
vucomisd (%rax), %xmm0
ja L417
vxorpd %xmm1, %xmm1, %xmm1
movabsq $252190160, %rax # imm = 0xF081DD0
vmovsd (%rax), %xmm2 # xmm2 = mem[0],zero
vucomisd %xmm0, %xmm2
ja L506
L106:
movabsq $252190176, %rax # imm = 0xF081DE0
vucomisd (%rax), %xmm0
jne L128
jnp L324
L128:
movabsq $4599914934686071280, %rcx # imm = 0x3FD62E42FEFA39F0
cmpq %rcx, %rdx
jae L343
shrq $52, %rdx
cmpq $994, %rdx # imm = 0x3E2
jbe L444
vmulsd %xmm0, %xmm0, %xmm1
movabsq $252190224, %rcx # imm = 0xF081E10
vmovsd dcabs164_(%rcx), %xmm2 # xmm2 = mem[0],zero
movabsq $252190232, %rcx # imm = 0xF081E18
vfmadd213sd (%rcx), %xmm1, %xmm2
movabsq $252190240, %rcx # imm = 0xF081E20
vfmadd213sd (%rcx), %xmm1, %xmm2
movabsq $252190248, %rcx # imm = 0xF081E28
vfmadd213sd (%rcx), %xmm1, %xmm2
movabsq $252190256, %rcx # imm = 0xF081E30
vfmadd213sd (%rcx), %xmm1, %xmm2
vmulsd %xmm2, %xmm1, %xmm1
vsubsd %xmm1, %xmm0, %xmm1
vmulsd %xmm0, %xmm1, %xmm2
movabsq $252190288, %rcx # imm = 0xF081E50
vaddsd (%rcx), %xmm1, %xmm1
vdivsd %xmm1, %xmm2, %xmm1
vsubsd %xmm0, %xmm1, %xmm0
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
vsubsd %xmm0, %xmm1, %xmm0
popq %rbp
retq
L286:
movabsq $4503599627370495, %rcx # imm = 0xFFFFFFFFFFFFF
testq %rcx, %rax
je L433
movabsq $252190136, %rax # imm = 0xF081DB8
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
jmp L506
L324:
movabsq $252190168, %rax # imm = 0xF081DD8
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
jmp L506
L343:
movabsq $4607361305248770930, %rcx # imm = 0x3FF0A2B23F3BAB72
cmpq %rcx, %rdx
jbe L450
movabsq $252190216, %rcx # imm = 0xF081E08
vmulsd (%rcx), %xmm0, %xmm1
vroundsd $4, %xmm1, %xmm1, %xmm1
vcvttsd2si %xmm1, %rcx
movabsq $252190200, %rdx # imm = 0xF081DF8
vfmadd231sd (%rdx), %xmm1, %xmm0
movabsq $252190208, %rdx # imm = 0xF081E00
vmulsd (%rdx), %xmm1, %xmm1
jmp L545
L417:
movabsq $252190144, %rax # imm = 0xF081DC0
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
jmp L506
L433:
testq %r8, %r8
je L492
vxorpd %xmm1, %xmm1, %xmm1
jmp L506
L444:
vaddsd (%rax), %xmm0, %xmm0
popq %rbp
retq
L450:
testq %r8, %r8
je L512
movabsq $252190184, %rcx # imm = 0xF081DE8
vaddsd (%rcx), %xmm0, %xmm0
movq $-1, %rcx
movabsq $252190192, %rdx # imm = 0xF081DF0
vmovsd (%rdx), %xmm1 # xmm1 = mem[0],zero
jmp L545
L492:
movabsq $252190144, %rax # imm = 0xF081DC0
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
L506:
vmovapd %xmm1, %xmm0
popq %rbp
retq
L512:
movabsq $252190200, %rcx # imm = 0xF081DF8
vaddsd (%rcx), %xmm0, %xmm0
movabsq $252190208, %rcx # imm = 0xF081E00
vmovsd (%rcx), %xmm1 # xmm1 = mem[0],zero
movl $1, %ecx
L545:
vsubsd %xmm1, %xmm0, %xmm2
vmulsd %xmm2, %xmm2, %xmm3
movabsq $252190224, %rdx # imm = 0xF081E10
vmovsd (%rdx), %xmm4 # xmm4 = mem[0],zero
movabsq $252190232, %rdx # imm = 0xF081E18
vfmadd213sd (%rdx), %xmm3, %xmm4
movabsq $252190240, %rdx # imm = 0xF081E20
vfmadd213sd (%rdx), %xmm3, %xmm4
movabsq $252190248, %rdx # imm = 0xF081E28
vfmadd213sd (%rdx), %xmm3, %xmm4
movabsq $252190256, %rdx # imm = 0xF081E30
vfmadd213sd (%rdx), %xmm3, %xmm4
vmulsd %xmm4, %xmm3, %xmm3
vsubsd %xmm3, %xmm2, %xmm3
vmulsd %xmm3, %xmm2, %xmm2
movabsq $252190264, %rdx # imm = 0xF081E38
vmovsd (%rdx), %xmm4 # xmm4 = mem[0],zero
vsubsd %xmm3, %xmm4, %xmm3
vdivsd %xmm3, %xmm2, %xmm2
vsubsd %xmm2, %xmm1, %xmm1
vsubsd %xmm0, %xmm1, %xmm0
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
vsubsd %xmm0, %xmm1, %xmm0
cmpq $-51, %rcx
jge L725
shlq $52, %rcx
movabsq $4845873199050653696, %rax # imm = 0x4340000000000000
addq %rcx, %rax
vmovq %rax, %xmm1
vmulsd %xmm0, %xmm1, %xmm0
movabsq $252190280, %rax # imm = 0xF081E48
vmulsd (%rax), %xmm0, %xmm0
popq %rbp
retq
L725:
cmpq $1024, %rcx # imm = 0x400
jne L754
vaddsd %xmm0, %xmm0, %xmm0
movabsq $252190272, %rax # imm = 0xF081E40
vmulsd (%rax), %xmm0, %xmm0
popq %rbp
retq
L754:
shlq $52, %rcx
movabsq $4607182418800017408, %rax # imm = 0x3FF0000000000000
addq %rcx, %rax
vmovq %rax, %xmm1
vmulsd %xmm0, %xmm1, %xmm0
popq %rbp
retq
nop
from sleef.
Please compare with the AVX2 implementation. The pure C version is not much optimized. The primary reason I am providing the pure C implementation is for better understanding of the algorithm.
from sleef.
For the value of exp(1.0), it is hard due to the following reason.
exp can be calculated with 1 ulp accuracy without DD calculation, and this is because the last three coefficients are 1, 1, and 0.5. However, if I try to move the calculated value of exp(1.0) to the correctly rounded value, I need to change those three coefficients slightly. This worsens the accuracy, and it seems hard to achieve 1 ulp accuracy in this case.
from sleef.
fma only.
EXPORT CONST double xexp(double d) {
int q = (int)rintk(d * R_LN2);
double s, u;
s = mla(q, -L2U, d);
s = mla(q, -L2L, s);
u = +0.2081276378237164457e-8;
u = fma(u, s, +0.2511210703042288022e-7);
u = fma(u, s, +0.2755762628169491192e-6);
u = fma(u, s, +0.2755723402025388239e-5);
u = fma(u, s, +0.2480158687479686264e-4);
u = fma(u, s, +0.1984126989855865850e-3);
u = fma(u, s, +0.1388888888914497797e-2);
u = fma(u, s, +0.8333333333314938210e-2);
u = fma(u, s, +0.4166666666666602598e-1);
u = fma(u, s, +0.1666666666666669072e+0);
u = fma(u, s, +0.5000000000000000000e+0);
u = fma(u, s, +0.1000000000000000000e+1);
u = fma(u, s, +0.1000000000000000000e+1);
u = ldexp2k(u, q);
if (d > 709.78271114955742909217217426) u = INFINITY;
if (d < -1000) u = 0;
return u;
}
from sleef.
nice, will we see this in the next update?
from sleef.
It is already incorporated in the SIMD version. I'm still thinking if it is better to add it to the pure C version since I will have problems in testing it.
from sleef.
is this fixed in the pure C version?
from sleef.
It is fixed if you use purecfma version which is not yet available in the released version.
I have a plan to remove sleefdp.c and sleefsp.c, and replace the scalar functions with purec and purecfma functions.
from sleef.
I think this can be closed now
from sleef.
Yes
from sleef.
Related Issues (20)
- FP64 tan_u10 returns wrong sign for input `-0x1p-1074`
- Undefined reference `GOMP_critical_start' and `omp_init_lock' in dftcommon.c
- FP32 `hypotf_u05` and FP64 `hypot_u05` are inaccurate near underflow HOT 1
- Tests fail to compile with mpfr 4.2.0 HOT 2
- FP32 `log1pf_u10` and FP64 `log1p_u10` are infinitely inaccurate for large inputs
- Segfault when building with Apple Clang 15 HOT 12
- Unable to disable optional dependencies HOT 5
- Testvecabi fails to build on some x86 arch with `-march=native`
- unknown type name ‘__float128’ on loongarch64
- Tests fail to build with inline headers on for armhf
- Restart/Fix CI tests HOT 4
- Failures on various archs for gcc 12 HOT 2
- Failures for s390x VXE2 with qemu HOT 2
- Some tests fail on AArch64 with LLVM 17 HOT 1
- Deprecated usage of MD5 API
- Fix RVV inline header generation HOT 1
- Test more OS-es in Github Actions
- Add GNU make as potential generator in CI
- Please, create tag release HOT 3
- dft test gets stuck during initialisation when hardware vector length is very long HOT 2
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from sleef.