JuliaSIMD / LoopVectorization.jl
Macro(s) for vectorizing loops.
License: MIT License
Possibly related to #15, following are a couple of compilation errors.
_
_ _ _(_)_ | Documentation: https://docs.julialang.org
(_) | (_) (_) |
_ _ _| |_ __ _ | Type "?" for help, "]?" for Pkg help.
| | | | | | |/ _` | |
| | |_| | | | (_| | | Version 1.3.1 (2019-12-30)
_/ |\__'_|_|_|\__'_| | Official https://julialang.org/ release
|__/ |
julia> using LoopVectorization
julia> using BenchmarkTools
julia> function toy1!(G, B,κ)
d = size(G,1)
@inbounds for d1=1:d
G[d1,κ] = B[1,d1]*B[1,κ]
for d2=2:d
G[d1,κ] += B[d2,d1]*B[d2,κ]
end
end
end
toy1! (generic function with 1 method)
julia> function toy2!(G, B,κ)
d = size(G,1)
@avx for d1=1:d
G[d1,κ] = B[1,d1]*B[1,κ]
for d2=2:d
G[d1,κ] += B[d2,d1]*B[d2,κ]
end
end
end
toy2! (generic function with 1 method)
julia> function toy3!(G, B,κ)
d = size(G,1)
z = zero(eltype(G))
@avx for d1=1:d
G[d1,κ] = z
for d2=1:d
G[d1,κ] += B[d2,d1]*B[d2,κ]
end
end
end
toy3! (generic function with 1 method)
julia> N = 8
8
julia> B = randn(N, N);
julia> G1 = zeros(N, N).*NaN;
julia> G2 = similar(G1);
julia> G3 = similar(G1);
julia> toy1!(G1,B,1)
julia> toy2!(G2,B,1)
ERROR: UndefVarError: ####temporary#425_ not defined
Stacktrace:
[1] toy2!(::Array{Float64,2}, ::Array{Float64,2}, ::Int64) at ./gcutils.jl:91
[2] top-level scope at REPL[12]:1
julia> toy3!(G3,B,1)
ERROR: UndefVarError: ##G_0 not defined
Stacktrace:
[1] macro expansion at ./gcutils.jl:91 [inlined]
[2] toy3!(::Array{Float64,2}, ::Array{Float64,2}, ::Int64) at ./REPL[5]:4
[3] top-level scope at REPL[13]:1
Minor modifications of the mygemm
functions from your README seem to work great for inputs that are all Float64 or all Int64
# Reference triple-loop matrix multiply C = A * B (no @avx), as quoted
# from the package README. Overwrites `C` in place; the accumulator type
# follows eltype(C). Returns nothing.
function mygemm!(C, A, B)
    @inbounds for i ∈ 1:size(A, 1), j ∈ 1:size(B, 2)
        acc = zero(eltype(C))
        # @fastmath on the inner reduction, as in the original snippet.
        @fastmath for k ∈ 1:size(A, 2)
            acc += A[i, k] * B[k, j]
        end
        C[i, j] = acc
    end
end
# @avx version of the README gemm: identical loop nest with the macro added.
# Reported failure: with A, B ::Matrix{Int64} and C ::Matrix{Float64} this
# throws MethodError: no method matching +(::SVec{4,Int64}, ::SVec{4,Float64}),
# i.e. mixed-eltype accumulation is not promoted (see transcript below).
function mygemmavx!(C, A, B)
@avx for i ∈ 1:size(A,1), j ∈ 1:size(B,2)
Cᵢⱼ = zero(eltype(C))
for k ∈ 1:size(A,2)
Cᵢⱼ += A[i,k] * B[k,j]
end
C[i,j] = Cᵢⱼ
end
end
If however I make A & B Int64 and C a Float64, then the macro seems to fail
N = 20;
A = rand(0:1000,N, N);
B = rand(0:1000,N, N);
C1 = Matrix{Float64}(undef, N, N);
C2 = Matrix{Float64}(undef, N, N);
@btime mygemm!($C1, $A, $B)
@btime mygemmavx!($C2, $A, $B)
The error produced is
julia> @btime mygemmavx!($C2, $A, $B)
ERROR: MethodError: no method matching +(::VectorizationBase.SVec{4,Int64}, ::VectorizationBase.SVec{4,Float64})
...
My understanding from your comment on Discourse is that this should work.
FYI I'm using Julia 1.3.1 on MacOS.
Thank you a lot for fast responses. Here is one more issue. With latest master
N,M = 4,6
A = zeros(N,M)
B = reshape(cumsum(ones(3N)),N,:)
# ok
@avx for i=1:size(A,1), j=1:size(B,2)
A[i,j] = exp(B[i,j])
end
gives
4×6 Array{Float64,2}:
2.71828 148.413 8103.08 0.0 0.0 0.0
7.38906 403.429 22026.5 0.0 0.0 0.0
20.0855 1096.63 59874.1 0.0 0.0 0.0
54.5982 2980.96 1.62755e5 0.0 0.0 0.0
while
N,M = 4,6
A = zeros(N,M)
B = reshape(cumsum(ones(3N)),N,:)
# not ok
@avx for i=1:size(A,1), j=1:size(B,2)
A[i,j+1] = exp(B[i,j])
end
gives
4×6 Array{Float64,2}:
0.0 2.0 3.0 4.0 0.0 0.0
0.0 2.0 3.0 4.0 0.0 0.0
0.0 2.0 3.0 4.0 0.0 0.0
0.0 2.0 3.0 4.0 0.0 0.0
Could cospi
and sinpi
function be supported within @avx
?
# Baseline (no @avx): fill a length-4 vector with cospi at integer arguments.
# cumsum(ones(4)) == [1.0, 2.0, 3.0, 4.0], so the result alternates -1, 1, -1, 1.
function f1()
    z = cumsum(ones(4))
    A = zeros(4)
    for i in eachindex(A)
        A[i] = cospi(z[i])
    end
    return A
end
# Same loop as f1 above with @avx added; reported to fail with
# MethodError: no method matching cospi(::SVec{4,Float64}) — the report asks
# for cospi/sinpi support inside @avx.
function f1a()
z = cumsum(ones(4))
A = zeros(4)
@avx for i in 1:size(A)[1]
A[i] = cospi(z[i])
end
return A
end
gives
julia> f1()
4-element Array{Float64,1}:
-1.0
1.0
-1.0
1.0
julia> f1a()
ERROR: MethodError: no method matching cospi(::VectorizationBase.SVec{4,Float64})
Closest candidates are:
cospi(::T) where T<:AbstractFloat at special/trig.jl:815
cospi(::Integer) at special/trig.jl:864
cospi(::T) where T<:Union{Integer, Rational} at special/trig.jl:841
...
Stacktrace:
[1] macro expansion at C:\Users\jr\.julia\packages\LoopVectorization\9Qgzg\src\reconstruct_loopset.jl:232 [inlined]
[2] _avx_! at C:\Users\jr\.julia\packages\LoopVectorization\9Qgzg\src\reconstruct_loopset.jl:232 [inlined]
[3] macro expansion at .\gcutils.jl:91 [inlined]
[4] f1a() at .\REPL[35]:4
[5] top-level scope at REPL[37]:1
Sometimes sinpi
can be faster, I wonder if this applies to @avx
using BenchmarkTools
a=10e6*rand(10_000_000);
b=π*a;
@btime sinpi.($a)
@btime sin.($b);
gives
167.324 ms (2 allocations: 76.29 MiB)
322.987 ms (2 allocations: 76.29 MiB)
I get the following error when the loop index is multiplied with another variable:
results = Vector{Float64}(undef, 2500)
# MWE: the loop index `i` is multiplied by a captured scalar inside @avx;
# reported to throw `UndefVarError: i not defined` (transcript below).
function calc_sins!(res)
code_phase_delta = 0.01
@avx for i = 1:2500
res[i] = sin(i * code_phase_delta)
end
end
julia> calc_sins!(results)
ERROR: UndefVarError: i not defined
Stacktrace:
[1] macro expansion at ./gcutils.jl:91 [inlined]
[2] calc_sins!(::Array{Float64,1}) at ./REPL[4]:3
[3] top-level scope at REPL[5]:1
[4] run_backend(::REPL.REPLBackend) at /home/zorn/.julia/packages/Revise/S7mrl/src/Revise.jl:1057
If it is saved in a different variable, the following error occurs when creating the function:
# Variant of calc_sins! that stores i * code_phase_delta in a local first;
# reported to fail at macro-expansion time with
# MethodError: no method matching add_operation!(...) (see below).
function calc_sins1!(res)
code_phase_delta = 0.01
@avx for i = 1:2500
phase = i * code_phase_delta
res[i] = sin(phase)
end
end
ERROR: LoadError: MethodError: no method matching add_operation!(::LoopVectorization.LoopSet, ::Symbol, ::Symbol, ::Int64)
Closest candidates are:
add_operation!(::LoopVectorization.LoopSet, ::Symbol, ::Expr, ::Int64) at /home/zorn/.julia/packages/LoopVectorization/dqtPD/src/graphs.jl:428
add_operation!(::LoopVectorization.LoopSet, ::Symbol, ::Expr, ::LoopVectorization.ArrayReferenceMetaPosition, ::Int64) at /home/zorn/.julia/packages/LoopVectorization/dqtPD/src/graphs.jl:452
add_operation!(::LoopVectorization.LoopSet, ::Symbol, ::Expr, ::LoopVectorization.ArrayReferenceMetaPosition) at /home/zorn/.julia/packages/LoopVectorization/dqtPD/src/graphs.jl:452
I get the following error
ERROR: MethodError: no method matching calc_A(::VectorizationBase.SVec{16,Int16})
where calc_A
is a user defined function. Do I need to implement calc_A(::VectorizationBase.SVec{16,Int16})
myself? Why isn't there an error for the other user defined functions? My first guess was because they are inlined, but calc_A
should be inlined, too.
Here is the code for that:
# Fixed-point polynomial term for the Q7 sine/cosine approximation below.
# All arithmetic stays in Int16 (products wrap, as the caller intends).
# NOTE(review): the ::Int16 annotation is what broke @avx dispatch on
# SVec{16,Int16} in this report; dropping it was the reporter's workaround.
@inline function calc_A(x::Int16)
    p = 15; r = 1; A = Int16(23170); C = Int16(-425)
    n = 7
    a = 7
    xsq = (x * x) >> n               # x² rescaled to Q(n)
    half = one(x) << (p - a - 1)     # rounding bias
    return (A + half + (xsq * C) >> r) >> (p - a)
end
# Companion fixed-point polynomial term to calc_A (Q7 sine/cosine helper).
# Arithmetic stays in Int16. The intermediate parenthesization below matches
# Julia's precedence in the original one-expression form (shifts bind tighter
# than * and +).
@inline function calc_B(x::Int16)
    p = 14; r = 3; B = Int16(-17790); D = Int16(351)
    n = 7
    a = 7
    xsq = (x * x) >> n               # x² rescaled to Q(n)
    half = one(x) << (p - a - 1)     # rounding bias
    t = (B + (xsq * D) >> r) >> n
    return (half + x * t) >> (p - a)
end
# Sign (±1 via the sibling `mysign`) of bit n of `x`, obtained by shifting
# that bit up into the sign position.
@inline function get_first_bit_sign(x)
    n = 7
    shifted = x << (8 * sizeof(x) - n - 1)
    return mysign(shifted)
end
# Sign (±1 via the sibling `mysign`) of bit n-1 of `x`, obtained by shifting
# that bit up into the sign position.
@inline function get_second_bit_sign(x)
    n = 7
    shifted = x << (8 * sizeof(x) - n - 2)
    return mysign(shifted)
end
# Extract the low n bits of `x` and recentre them around zero, giving a
# value in -2^(n-1) : 2^(n-1)-1 (the quarter-period phase).
@inline function get_quarter_angle(x)
    n = 7
    low = x & (one(x) << n - one(x))   # mask off bits below n
    return low - one(x) << (n - 1)     # shift the range down by 2^(n-1)
end
@inline mysign(x) = 2 * signbit(x) - 1
# Fixed-point sine/cosine over a phase vector using the Q7 helpers above
# (get_*_bit_sign, get_quarter_angle, calc_A, calc_B).
# Reported: MethodError: no method matching calc_A(::SVec{16,Int16}) because
# calc_A/calc_B constrain their argument to Int16; the reporter found that
# dropping the ::Int16 annotation makes it work under @avx.
function gen_sincos!(sins, coss, phases)
@avx for i = 1:2500
first_bit_sign = get_first_bit_sign(phases[i])
second_bit_sign = get_second_bit_sign(phases[i])
quarter_angle = get_quarter_angle(phases[i])
A = calc_A(quarter_angle)
B = calc_B(quarter_angle)
coss[i] = second_bit_sign * (first_bit_sign * A + B)
sins[i] = second_bit_sign * (A - first_bit_sign * B)
end
end
sins = Vector{Int16}(undef, 2500)
coss = Vector{Int16}(undef, 2500)
phases = Vector{Int16}(undef, 2500)
gen_sincos!(sins, coss, phases)
EDIT: Alright I found the solution myself: If I write function calc_A(x)
instead of function calc_A(x::Int16)
, it will work. So should I omit the declaration of x
? What if I have a different function for x::Int32
?
This is a powerful and welcome tool. I would like to apply it to more packages. I have run into a limitation on supported types that seems more narrow than necessary.
running e.g. myselfdotavx(xs::T)
where T is a Float64 compatible primitive type (or most other numeric bitstype, including Int64 compatibles) except for T = Union{Float64, Float32, Int64, Int32} results in a KeyError
.
ERROR: KeyError: key XFloat32 not found
Stacktrace:
[1] getindex at .\dict.jl:477 [inlined]
[2] llvmtype(::Type{T} where T) at C:\Users\jas\.julia\packages\VectorizationBase\rPD9W\src\vectorizable.jl:22
[3] #s13#2(::Any, ::Any, ::Any, ::Any) at C:\Users\jas\.julia\packages\VectorizationBase\rPD9W\src\VectorizationBase.jl:50
[4] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any,N} where N) at .\boot.jl:526
[5] vzero at C:\Users\jas\.julia\packages\VectorizationBase\rPD9W\src\VectorizationBase.jl:74 [inlined]
[6] macro expansion at C:\Users\jas\.julia\packages\LoopVectorization\8wcli\src\reconstruct_loopset.jl:248 [inlined]
[7] _avx_! at C:\Users\jas\.julia\packages\LoopVectorization\8wcli\src\reconstruct_loopset.jl:248 [inlined]
[8] myselfdotavx(::Array{XFloat32,1}) at .\REPL[39]:3
[9] ##core#495() at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:297
[10] ##sample#496(::BenchmarkTools.Parameters) at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:303
[11] _run(::BenchmarkTools.Benchmark{Symbol("##benchmark#494")}, ::BenchmarkTools.Parameters; verbose::Bool, pad::String, kwargs::Base.Iterators.Pairs{Symbol,Integer,NTuple{4,Symbol},NamedTuple{(:samples, :evals, :gctrial, :gcsample),Tuple{Int64,Int64,Bool,Bool}}}) at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:331
[12] (::Base.var"#inner#2"{Base.Iterators.Pairs{Symbol,Integer,NTuple{5,Symbol},NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample),Tuple{Bool,Int64,Int64,Bool,Bool}}},typeof(BenchmarkTools._run),Tuple{BenchmarkTools.Benchmark{Symbol("##benchmark#494")},BenchmarkTools.Parameters}})() at .\essentials.jl:715
[13] #invokelatest#1 at .\essentials.jl:716 [inlined]
[14] #run_result#37 at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:32 [inlined]
[15] run(::BenchmarkTools.Benchmark{Symbol("##benchmark#494")}, ::BenchmarkTools.Parameters; kwargs::Base.Iterators.Pairs{Symbol,Integer,NTuple{5,Symbol},NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample),Tuple{Bool,Int64,Int64,Bool,Bool}}}) at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:46
[16] #warmup#42 at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:79 [inlined]
[17] warmup(::BenchmarkTools.Benchmark{Symbol("##benchmark#494")}) at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:79
[18] top-level scope at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:390
or Using Int128 results in
ERROR: error compiling myselfdotavx: Failed to parse LLVM Assembly:
julia: llvmcall:3:5: error: value doesn't match function result type '[2 x i128]'
ret <2 x i128> zeroinitializer
^
This is just me dreaming about the future, but wouldn't it be neat if one could write stuff like
@vectorize map(f, x)
@vectorize f.(x)
? 😃
With LoopVectorization v0.6.9
N,M = 4,6
A = zeros(N,M)
B = reshape(cumsum(ones(3N)),N,:)
# ok
@avx for i=1:size(A,1), j=1:size(B,2)
A[i,j+1+1] = B[i,j]
end
# not ok
@avx for i=1:size(A,1), j=1:size(B,2)
A[i,j+2] = B[i,j]
end
gives
ERROR: MethodError: no method matching sizeequivalentint(::Type{Float64}, ::Int64)
Closest candidates are:
sizeequivalentint(::Type{Int64}, ::Int64) at C:\Users\jr\.julia\packages\LoopVectorization\HwB3A\src\lowering.jl:363
sizeequivalentint(::Type{Int32}, ::Int64) at C:\Users\jr\.julia\packages\LoopVectorization\HwB3A\src\lowering.jl:366
sizeequivalentint(::Type{Int16}, ::Int64) at C:\Users\jr\.julia\packages\LoopVectorization\HwB3A\src\lowering.jl:369
...
Stacktrace:
[1] macro expansion at C:\Users\jr\.julia\packages\LoopVectorization\HwB3A\src\reconstruct_loopset.jl:272 [inlined]
[2] _avx_!(::Val{(0, 0)}, ::Type{Tuple{:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000002, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x01),:LoopVectorization,Symbol("##253"),LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x02),:LoopVectorization,:+,LoopVectorization.OperationStruct(0x0000000000000002, 0x0000000000000000, 0x0000000000000000, 0x0000000000000102, LoopVectorization.compute, 0x00, 0x03),:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x02, 0x04),:LoopVectorization,:identity,LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000004, LoopVectorization.compute, 0x00, 0x05),:LoopVectorization,:setindex!,LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000305, LoopVectorization.memstore, 0x03, 0x06)}}, ::Type{Tuple{LoopVectorization.ArrayRefStruct(0x0000000000000001, 0x0000000000000002),LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000102),LoopVectorization.ArrayRefStruct(0x0000000000000102, 0x0000000000000103)}}, ::Type{Tuple{0,Tuple{},Tuple{},Tuple{(2, 2)},Tuple{},Tuple{},Tuple{}}}, ::Tuple{VectorizationBase.StaticLowerUnitRange{0},VectorizationBase.StaticLowerUnitRange{0}}, ::LoopVectorization.LoopValue, ::VectorizationBase.PackedStridedPointer{Float64,1}, ::VectorizationBase.PackedStridedPointer{Float64,1}) at C:\Users\jr\.julia\packages\LoopVectorization\HwB3A\src\reconstruct_loopset.jl:272
[3] top-level scope at REPL[163]:1
I'm not sure whether this is expected to work, but right now broadcasting over a transposed matrix gives surprising results:
julia> A = randn(2,3); A[2] = 10; At = transpose(A)
3×2 LinearAlgebra.Transpose{Float64,Array{Float64,2}}:
-0.121486 10.0
-1.55957 0.691854
-0.0858253 -0.918145
julia> exp.(At)
3×2 Array{Float64,2}:
0.885603 22026.5
0.210227 1.99742
0.917755 0.399259
julia> @avx exp.(At)
3×2 Array{Float64,2}:
0.885603 0.210227
22026.5 1.99742
0.210227 0.917755
I hope I'm not bothering you ;)
MWE:
# MWE: Int32 fixed-point phase generation with an Int16 output under @avx.
# Reported: MethodError: no method matching
# promote_vtype(::Type{VectorizationBase._MM{8}}, ::Type{Int32}) (see below).
function gen_phases(phases::Vector{Int16}, freq, start_phase)
fp = 21
delta = floor(Int32, freq * 1 << (fp + 9))
idxs = Int32(1):Int32(length(phases))
fixed_point_start_phase = floor(Int32, start_phase * 1 << (fp + 9))
@avx for i = 1:length(phases)
phases[i] = Int16((idxs[i] * delta + fixed_point_start_phase) >> fp)
end
end
phases = Vector{Int16}(undef, 1000)
julia> gen_phases(phases, 0.002, 0.1)
ERROR: MethodError: no method matching promote_vtype(::Type{VectorizationBase._MM{8}}, ::Type{Int32})
Here is a MWE:
# Elementwise product c[i] = b[i] * a[i] over the window
# start_sample : start_sample + num_samples - 1 (baseline for the @avx variant).
function test_for_with_different_index!(a, b, c, start_sample, num_samples)
    stop = start_sample + num_samples - 1
    @inbounds for i = start_sample:stop
        c[i] = b[i] * a[i]
    end
end
# @avx version of the function above: loop starts at start_sample, not 1.
# Reported: for Int16 inputs the last three entries of the result are wrong
# (test transcript below); correct with floats.
function test_for_with_different_index_avx!(a, b, c, start_sample, num_samples)
@avx for i = start_sample:num_samples + start_sample - 1
c[i] = b[i] * a[i]
end
end
# Setup and comparison for the two window-product functions above.
# Fix: the original snippet called similar(a_re), but `a_re` is never
# defined here — only `a` and `b` exist, so similar(a) is clearly intended.
a = rand(Int16, 1000)
b = rand(Int16, 1000)
c = similar(a)
c_avx = similar(a)
test_for_with_different_index!(a, b, c, 30, 800)
test_for_with_different_index_avx!(a, b, c_avx, 30, 800)
@test c[30:829] == c_avx[30:829]
Test Failed at /home/zorn/Documents/Code/Testing/test_avx.jl:44
Expression: c[30:829] == c_avx[30:829]
Evaluated: Int16[-17628, -17250, -31956, 29328, 2240, 8696, -8285, 1924, 16032, 9148 … 25940, -4788, 23094, 18864, -21520, 6014, -7572, -18266, -15094, 28000] == Int16[-17628, -17250, -31956, 29328, 2240, 8696, -8285, 1924, 16032, 9148 … 25940, -4788, 23094, 18864, -21520, 6014, -7572, 32544, 0, 16048]
Only the last three entries are wrong.
With floats the result is correct, though.
I have a simple algorithm that I want to vectorize. Works with @simd
but for @avx
throws an error that x is not defined.
# Attempt to run findfirst with a closure doing regex matching inside @avx;
# reported to throw "x is not defined", while the same loop works with @simd.
# NOTE(review): regex/closure work has no SIMD lowering — kept verbatim as
# the reproduction from the report.
function check_simd(regex_list, input)
input_len=length(input)
icon_indices = Vector{Int64}(undef, input_len)
@avx for j=1:input_len
icon_indices[j] = findfirst(x-> match(x, input[j]) !== nothing, regex_list)
end
return icon_indices
end
MWE here: https://gist.github.com/aminya/4b6e6db1147e63727cffc781633a8e0b
I did a test and #25 is still an issue in version 0.3.11.
Currently ∗
(\ast
) is a lazy multiplication operator, but in many fonts the difference between it and *
is not readily apparent unless they are side by side. I'd recommend using a unicode modifier on top of *
instead. Perhaps *̂
(*\hat
).
I was experimenting with the loop unroll and stumbled upon a bug. The result of a calculation should be the same regardless of the unroll
factor:
# Dot product with an explicit unroll=2 hint. Compared against test_avx3
# below; the two were reported to return different values
# (258.05835f0 vs 172.16222f0, transcript below), which should not happen.
function test_avx2(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
z = zero(T)
@avx unroll=2 for i ∈ 1:length(x)
z += x[i]*y[i]
end
return z
end
# Same dot product with unroll=3. Per the report, @code_native showed this
# emits the same 2x-unrolled loop as test_avx2, yet the returned value
# differs — the reported bug.
function test_avx3(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
z = zero(T)
@avx unroll=3 for i ∈ 1:length(x)
z += x[i]*y[i]
end
return z
end
# test
T = Float32
N = 1024
x = rand(T,N)
y = rand(T,N)
@test test_avx2(x, y) ≈ test_avx3(x, y)
Test Failed at In[34]:24
Expression: test_avx2(x, y) ≈ test_avx3(x, y)
Evaluated: 258.05835f0 ≈ 172.16222f0
There was an error during testing
Closer inspection of the source code reveals that while @code_native debuginfo=:none test_avx2(x, y)
gives me the expected unrolling in the main loop.
L32:
vmovups (%rcx,%rdx,4), %ymm2
vmovups 32(%rcx,%rdx,4), %ymm3
vfmadd231ps (%rax,%rdx,4), %ymm2, %ymm0
vfmadd231ps 32(%rax,%rdx,4), %ymm3, %ymm1
addq $16, %rdx
cmpq %rsi, %rdx
jl L32
This does not hold true for test_avx3
which yields the same code snippet as test_avx2
.
With 0.4.1 (Julia 1.5 dev)
AssertionError: Ni == N + 1
Stacktrace:
[1] packed_indexpr(::Core.SimpleVector, ::Int64, ::Type{Float64}) at C:\Users\PKrysl\.julia\packages\SIMDPirates\Gw1Aa\src\memory.jl:996
[2] packed_strided_ptr_index(::Core.SimpleVector, ::Int64, ::Type{Float64}) at C:\Users\PKrysl\.julia\packages\SIMDPirates\Gw1Aa\src\memory.jl:1017
[3] #s28#117(::Any, ::Any, ::Any, ::Any, ::Any, ::Any, ::Any) at C:\Users\PKrysl\.julia\packages\SIMDPirates\Gw1Aa\src\memory.jl:1031
[4] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any,N} where N) at .\boot.jl:526
[5] macro expansion at C:\Users\PKrysl\.julia\packages\LoopVectorization\wsOMH\src\reconstruct_loopset.jl:226 [inlined]
[6] _avx_! at C:\Users\PKrysl\.julia\packages\LoopVectorization\wsOMH\src\reconstruct_loopset.jl:226 [inlined]
[7] mulCAtB!(::Array{Float64,2}, ::Array{Float64,1}, ::Array{Float64,2}) at C:\Users\PKrysl\Documents\work\FinEtoolsTestAll.jl\tests\FinEtools.jl\src\MatrixUtilityModule.jl:380
With
# Sum of squares under @avx. Works for Float64 input; for Float32 it was
# reported to throw MethodError:
# +(::SVec{4,Float32}, ::SVec{4,Float64}) — the accumulator is widened.
function jselfdotavx(a)
s = zero(eltype(a))
@inbounds @avx for i ∈ eachindex(a)
s += a[i] * a[i]
end
s
end
This works:
jselfdotavx(rand(Float64, 1024))
but with Float32
jselfdotavx(rand(Float32, 1024))
I get
ERROR: MethodError: no method matching +(::VectorizationBase.SVec{4,Float32}, ::VectorizationBase.SVec{4,Float64})
[...]
I created a package for benchmarking functions called AcuteBenchmark.
https://github.com/aminya/AcuteBenchmark.jl
I particularly made it for IntelVectorMath (VML)
If you want we can switch the benchmarking system to AcuteBenchmark. It is very easy to use. It automatically generates random vectors based on the limits and the size given and then benchmarks and plots the result.
Other than the Acutebenchmark doc, there is a fully working example available here: https://github.com/JuliaMath/VML.jl/blob/AcuteBenchmark/benchmark/benchmark.jl
This function doesn't give the correct answer (but works when @avx
is removed):
# C = AᵀB with 2x2 register blocking when both output dims are even,
# @avx applied only to the inner k-reduction. Reported to give the wrong
# answer with @avx and the correct one without — kept verbatim as the
# reproduction. The else-branch is a plain scalar fallback for odd dims.
function mulCAtB!(C, A, B)
M, N = size(C); K = size(B,1)
@assert size(C, 1) == size(A, 2)
@assert size(C, 2) == size(B, 2)
@assert size(A, 1) == size(B, 1)
if mod(M, 2) == 0 && mod(N, 2) == 0
# Even case: process 2x2 output tiles (m, m+1) x (n, n+1).
for m ∈ 1:2:M
m1 = m + 1
for n ∈ 1:2:N
n1 = n + 1
C11, C21, C12, C22 = 0.0, 0.0, 0.0, 0.0
@avx for k ∈ 1:K
C11 += A[k,m] * B[k,n]
C21 += A[k,m1] * B[k,n]
C12 += A[k,m] * B[k,n1]
C22 += A[k,m1] * B[k,n1]
end
C[m,n] = C11
C[m1,n] = C21
C[m,n1] = C12
C[m1,n1] = C22
end
end
else
# Fallback: untiled scalar loops (no @avx).
@inbounds for n ∈ 1:N, m ∈ 1:M
Cmn = 0.0
@inbounds for k ∈ 1:K
Cmn += A[k,m] * B[k,n]
end
C[m,n] = Cmn
end
end
return C
end
I was playing around a bit and noticed some bugs. The first two snippets are attempts at MWE of the final snippet which is the desired calculation
# MWE 1: @avx loop carrying the scalar recurrence x = x + v*dt (v stays 0.0
# here, so every position should equal x/x0 == 1.0). Reported to throw
# UndefVarError under @avx but to run fine without the macro.
function simulate(x0::Float64; T = 10.0, dt = 0.0001, vt = 1.0)
times = 0.0:dt:T
positions = zeros(length(times))
v = 0.0
a = 0.0
x = x0
@avx for ii in eachindex(times)
x = x + v * dt
positions[ii] = x/x0
end
times, positions
end
simulate(10.0) # UndefVarError, but runs without @avx
# MWE 2: same as above plus a read `t = times[ii]` from the range;
# reported to throw a StackOverflowError (see comment after the call).
function simulate(x0::Float64; T = 10.0, dt = 0.0001, vt = 1.0)
times = 0.0:dt:T
positions = zeros(length(times))
v = 0.0
a = 0.0
x = x0
@avx for ii in eachindex(times)
t = times[ii]
x = x + v * dt
positions[ii] = x/x0
end
times, positions
end
simulate(10.0) # StackOverflowError
The final and desired code is the following
@inline friction(v::Float64, vt::Float64) = v > vt ? -3v : -3vt*sign(v)
# Final desired simulation: damped oscillator integrated with explicit Euler
# inside @avx (loop-carried v and x).
function simulate(x0::Float64; T = 10.0, dt = 0.0001, vt = 1.0)
times = 0.0:dt:T
positions = zeros(length(times))
v = 0.0
a = 0.0
x = x0
@avx for ii in eachindex(times)
t = times[ii]
a = friction(v, vt) - 100.0*x
# NOTE(review): this second assignment overwrites the friction term —
# possibly leftover debugging in the report; kept verbatim.
a = - 100.0*x
v = v + a * dt
x = x + v * dt
positions[ii] = x/x0
end
times, positions
end
simulate(10.0)
Original code from discourse thread
There seems to be no method for << nor >>.
MWE:
# MWE: bit shift inside @avx. Reported: MethodError: no method matching
# <<(::VectorizationBase._MM{8}, ::Int32) — no << / >> lowering (see below).
function test_bit_shift()
accu = Int32(0)
counter = Int32(1):Int32(64)
@avx for i = 1:64
accu += counter[i] << 1
end
end
julia> test_bit_shift()
ERROR: MethodError: no method matching <<(::VectorizationBase._MM{8}, ::Int32)
Closest candidates are:
<<(::Integer, ::Integer) at operators.jl:590
<<(::VectorizationBase.Static{N}, ::Any) where N at /home/zorn/.julia/packages/VectorizationBase/Myhw6/src/static.jl:53
...
Stacktrace:
[1] macro expansion at /home/zorn/.julia/packages/LoopVectorization/2jh99/src/reconstruct_loopset.jl:260 [inlined]
[2] _avx_! at /home/zorn/.julia/packages/LoopVectorization/2jh99/src/reconstruct_loopset.jl:260 [inlined]
First of all thank you for this package!
The following code works fine for version 0.3.0.
# Clenshaw recurrence evaluating a Chebyshev series with coefficients
# `coeff` at the point `x` (coeff[1] is the constant term).
function clenshaw(x, coeff)
    b1 = zero(x)   # `ret` in the original snippet
    b2 = zero(x)   # `tmp` in the original snippet
    for k in length(coeff):-1:2
        # Simultaneous update reproduces the compute-then-swap of the original.
        b1, b2 = b2, muladd(x, 2b2, coeff[k] - b1)
    end
    return muladd(x, b2, coeff[1] - b1)
end
# Map the user-defined `clenshaw` over x under @avx, with `coeff` as a
# non-loop argument. Reported: worked on v0.3.0 but throws
# UndefVarError: clenshaw not defined on v0.3.1 and v0.4.0 (traces below).
function clenshaw_avx!(ret,x,coeff)
@avx for j in 1:length(ret)
ret[j] = clenshaw(x[j],coeff)
end
end
c = rand(100); x = rand(10^6); y = similar(x);
clenshaw_avx!(y,x,c)
Starting with 0.3.1 the following error occurs:
ERROR: UndefVarError: clenshaw not defined
Stacktrace:
[1] macro expansion at .\gcutils.jl:91 [inlined]
[2] clenshaw_avx!(::Array{Float64,1}, ::Array{Float64,1}, ::Array{Float64,1}) at .\REPL[10]:2
[3] top-level scope at REPL[13]:1
And for version 0.4.0 the error message is:
ERROR: UndefVarError: clenshaw not defined
Stacktrace:
[1] macro expansion at /home/fabian/.julia/packages/LoopVectorization/SSpaP/src/reconstruct_loopset.jl:214 [inlined]
[2] _avx_!(::Val{(1, 0, 0)}, ::Type{Tuple{:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x01),:LoopVectorization,Symbol("##422"),LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x02),:LoopVectorization,:clenshaw,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000102, LoopVectorization.compute, 0x00, 0x03),:LoopVectorization,:setindex!,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000003, LoopVectorization.memstore, 0x02, 0x04)}}, ::Type{Tuple{LoopVectorization.ArrayRefStruct(0x0000000000000001, 0x0000000000000001),LoopVectorization.ArrayRefStruct(0x0000000000000001, 0x0000000000000001)}}, ::Type{Tuple{0,Tuple{},Tuple{2},Tuple{},Tuple{},Tuple{},Tuple{}}}, ::Tuple{VectorizationBase.StaticLowerUnitRange{0}}, ::VectorizationBase.PackedStridedPointer{Float64,0}, ::VectorizationBase.PackedStridedPointer{Float64,0}, ::Array{Float64,1}, ::Nothing) at /home/fabian/.julia/packages/LoopVectorization/SSpaP/src/reconstruct_loopset.jl:214
[3] macro expansion at ./gcutils.jl:91 [inlined]
[4] clenshaw_avx!(::Array{Float64,1}, ::Array{Float64,1}, ::Array{Float64,1}) at ./REPL[3]:3
[5] top-level scope at REPL[5]:1
Julia 1.5 dev
WARNING: Method definition alloca(Int32) in module SIMDPirates at C:\Users\appveyor\.julia\packages\SIMDPirates\obrXX\src\memory.jl:30 overwritten at C:\Users\appveyor\.julia\packages\SIMDPirates\obrXX\src\memory.jl:48.
WARNING: Method definition alloca(Int32, Type{T}) where {T} in module SIMDPirates at C:\Users\appveyor\.julia\packages\SIMDPirates\obrXX\src\memory.jl:30 overwritten at C:\Users\appveyor\.julia\packages\SIMDPirates\obrXX\src\memory.jl:48.
WARNING: Method definition alloca(Int32, Type{T}, Base.Val{Align}) where {T, Align} in module SIMDPirates at C:\Users\appveyor\.julia\packages\SIMDPirates\obrXX\src\memory.jl:30 overwritten at C:\Users\appveyor\.julia\packages\SIMDPirates\obrXX\src\memory.jl:48.
ERROR: LoadError: LoadError: LoadError: LoadError: LoadError: DomainError with -2.3611832414348226e21:
sqrt will only return a complex result if called with a complex argument. Try sqrt(Complex(x)).
Stacktrace:
[1] throw_complex_domainerror(::Symbol, ::Float64) at .\math.jl:31
[2] sqrt at .\math.jl:492 [inlined]
[3] solve_tilesize(::SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int32}},Int32},true}, ::SubArray{Int32,1,Array{Int32,2},Tuple{Base.Slice{Base.OneTo{Int32}},Int32},true}) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\determinestrategy.jl:172
[4] solve_tilesize(::SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int32}},Int32},true}, ::SubArray{Int32,1,Array{Int32,2},Tuple{Base.Slice{Base.OneTo{Int32}},Int32},true}, ::Int32, ::Int32) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\determinestrategy.jl:221
[5] solve_tilesize(::LoopVectorization.LoopSet, ::Symbol, ::Symbol, ::SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int32}},Int32},true}, ::SubArray{Int32,1,Array{Int32,2},Tuple{Base.Slice{Base.OneTo{Int32}},Int32},true}) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\determinestrategy.jl:245
[6] evaluate_cost_tile(::LoopVectorization.LoopSet, ::Array{Symbol,1}) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\determinestrategy.jl:317
[7] choose_tile at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\determinestrategy.jl:387 [inlined]
[8] choose_order(::LoopVectorization.LoopSet) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\determinestrategy.jl:401
[9] lower(::LoopVectorization.LoopSet) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\lowering.jl:807
[10] @avx(::LineNumberNode, ::Module, ::Any) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\constructors.jl:53
[11] #macroexpand#32 at .\expr.jl:92 [inlined]
[12] macroexpand at .\expr.jl:91 [inlined]
[13] docm(::LineNumberNode, ::Module, ::Any, ::Any, ::Bool) at .\docs\Docs.jl:509 (repeats 2 times)
[14] @doc(::LineNumberNode, ::Module, ::String, ::Vararg{Any,N} where N) at .\boot.jl:451
[15] include at .\sysimg.jl:29 [inlined]
[16] include(::String) at C:\projects\finetools-jl\src\FinEtools.jl:7
[17] top-level scope at none:0
[18] include at .\sysimg.jl:29 [inlined]
[19] include(::String) at C:\projects\finetools-jl\src\FinEtools.jl:7
[20] top-level scope at none:0
[21] include at .\boot.jl:317 [inlined]
[22] include_relative(::Module, ::String) at .\loading.jl:1044
[23] include(::Module, ::String) at .\sysimg.jl:29
[24] top-level scope at none:2
[25] eval at .\boot.jl:319 [inlined]
[26] eval(::Expr) at .\client.jl:393
[27] top-level scope at .\none:3
in expression starting at C:\projects\finetools-jl\src\MatrixUtilityModule.jl:380
Here are two variants of a function that I cannot figure how one can apply @avx
in front of for
# Variant 1 from the sincos question: broadcast the (sin, cos) tuple into a
# two-element row slice. θ holds odd multiples of π/2; every entry of the
# N×2 output buffer is overwritten in the loop.
function f1(N)
    θ = π * cumsum(ones(N)) / 2
    out = similar(θ, length(θ), 2)
    for i = 1:length(θ)
        out[i, 1:2] .= sincos(θ[i])
    end
    return out
end
# Variant 2 from the sincos question: assign the (sin, cos) tuple by
# destructuring into the two columns instead of broadcasting into a slice.
function f2(N)
    θ = π * cumsum(ones(N)) / 2
    out = similar(θ, length(θ), 2)
    for i = 1:length(θ)
        out[i, 1], out[i, 2] = sincos(θ[i])
    end
    return out
end
If I refer to https://software.intel.com/en-us/mkl-vmperfdata-sincos and https://software.intel.com/en-us/mkl-vmperfdata-sin , those pages suggest that not a big gain is available if sin
and cos
are to be evaluated using sincos
than separately. But I would like to test it myself.
Here is a MWE:
# MWE: loop-carried integer accumulation (phase_fp += delta_fp) plus a
# right shift under @avx. Reported: InexactError: trunc(Int64, Inf) raised
# during the macro's unroll-strategy planning (trace below).
function calc!(x)
delta_fp = 2
phase_fp = Int32(0)
@avx for i = 1:length(x)
phase_fp = phase_fp + delta_fp
phase_int = phase_fp >> 17
x[i] = phase_int
end
end
x = Vector{Int32}(undef, 100)
calc!(x)
ERROR: InexactError: trunc(Int64, Inf)
Stacktrace:
[1] trunc at ./float.jl:702 [inlined]
[2] round at ./float.jl:367 [inlined]
[3] unroll_no_reductions(::LoopVectorization.LoopSet, ::Array{Symbol,1}, ::Symbol, ::Int64, ::Int64) at /home/zorn/.julia/packages/LoopVectorization/9Qgzg/src/determinestrategy.jl:151
[4] determine_unroll_factor(::LoopVectorization.LoopSet, ::Array{Symbol,1}, ::Symbol, ::Symbol) at /home/zorn/.julia/packages/LoopVectorization/9Qgzg/src/determinestrategy.jl:170
[5] choose_order(::LoopVectorization.LoopSet) at /home/zorn/.julia/packages/LoopVectorization/9Qgzg/src/determinestrategy.jl:550
[6] lower(::LoopVectorization.LoopSet, ::Int8) at /home/zorn/.julia/packages/LoopVectorization/9Qgzg/src/lowering.jl:497
[7] avx_body(::Tuple{Int8,Int8,Int8}, ::Array{LoopVectorization.Instruction,1}, ::Array{LoopVectorization.OperationStruct,1}, ::Array{LoopVectorization.ArrayRefStruct,1}, ::Core.SimpleVector, ::Core.SimpleVector, ::NTuple{4,DataType}) at /home/zorn/.julia/packages/LoopVectorization/9Qgzg/src/reconstruct_loopset.jl:225
MWE:
@inline mysign(x) = ifelse(x >= zero(x), one(x), -one(x))
# Sum of the sibling `mysign` over all elements of `a`
# (baseline for the @avx variant below).
function accumulate_mysign(a)
    total = zero(eltype(a))
    @inbounds for idx = 1:length(a)
        total += mysign(a[idx])
    end
    return total
end
# @avx version of accumulate_mysign. Reported: TypeError: non-boolean
# (UInt16) used in boolean context — the vectorized comparison inside
# mysign yields a mask that Base.ifelse rejects (trace below).
function accumulate_mysign_avx(a)
accu = zero(eltype(a))
@avx for i = 1:length(a)
accu += mysign(a[i])
end
accu
end
a = rand(Int16, 32)
julia> accumulate_mysign(a)
-4
julia> accumulate_mysign_avx(a)
ERROR: TypeError: non-boolean (UInt16) used in boolean context
Stacktrace:
[1] mysign at ./none:1 [inlined]
[2] macro expansion at ./gcutils.jl:91 [inlined]
[3] accumulate_mysign_avx(::Array{Int16,1}) at ./none:3
[4] top-level scope at none:0
MWE:
# MWE: Int16 conversion of an Int32 expression inside @avx. Reported:
# MethodError: no method matching Int16(::SVec{8,Int32}); the report notes
# the same applies to Int64.
function gen_phases(phases::Vector{Int16}, freq)
fp = 21
delta = floor(Int32, freq * 1 << (fp + 9))
idxs = Int32(1):Int32(length(phases))
@avx for i = 1:length(phases)
phases[i] = Int16((idxs[i] * delta) >> fp)
end
end
phases = Vector{Int16}(undef, 1000);
julia> gen_phases(phases, 0.002)
ERROR: MethodError: no method matching Int16(::VectorizationBase.SVec{8,Int32})
The same applies to Int64.
As predicted, vmap
gives me a tremendous boost in performance =) I'm hitting an error when mapping over static arrays though
julia> vmap(exp, randn(8));
julia> vmap(exp, @SVector randn(8));
ERROR: conversion to pointer not defined for SArray{Tuple{8},Float64,1,8}
Now if I run the following I get an error:
a = collect(1:100)
items = rand(20)
indices = collect(1:20)
@avx a[indices] .= items
ERROR: MethodError: no method matching vmaterialize!(::SubArray{Int64,1,Array{Int64,1},Tuple{Array{Int64,1}},false}, ::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1},Nothing,typeof(identity),Tuple{Array{Int64,1}}}, ::Val{:Main})Closest candidates are:
vmaterialize!(::Union{LinearAlgebra.Adjoint{T,A}, LinearAlgebra.Transpose{T,A}}, ::BC, ::Val{Mod}) where {T<:Union{Float32, Float64}, N, A<:AbstractArray{T,N}, BC<:Base.Broadcast.Broadcasted, Mod} at C:\Users\yahyaaba\.julia\packages\LoopVectorization\iNfCA\src\broadcast.jl:218
vmaterialize!(::AbstractArray{T,N}, ::BC, ::Val{Mod}) where {T<:Union{Float32, Float64}, N, BC<:Base.Broadcast.Broadcasted, Mod} at C:\Users\yahyaaba\.julia\packages\LoopVectorization\iNfCA\src\broadcast.jl:194
That expression is equal to the following that works
indiceslen = length(indices)
@avx for k = 1:indiceslen
a[indices[k]] = items[k]
end
With LoopVectorization v0.5.1
using LoopVectorization
function f1()
z = zeros(8)
A = ones(8)
for i in eachindex(A)
A[i] = z[i]
end
return A
end
function f1a()
z = zeros(8)
A = ones(8)
@avx for i in eachindex(A)
A[i] = z[i]
end
return A
end
function f2()
z = zeros(8)
A = ones(8)
for i in eachindex(A)
A[i] = sin(z[i])
end
return A
end
function f2a()
z = zeros(8)
A = ones(8)
@avx for i in eachindex(A)
A[i] = sin(z[i])
end
return A
end
julia> f1() ≈ f2() ≈ f2a() ≈ f1a()
false
julia> f1() ≈ f2() ≈ f2a()
true
julia> f1a()
8-element Array{Float64,1}:
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
Output should be a vector of zeros.
Is it possible to vectorize the filter
algorithm of Julia?
https://github.com/JuliaLang/julia/blob/3720edfc06ee8687b54ee7fbb908c9f03f4fc557/base/array.jl#L2368
Does LoopVectorization
support complex vectors? E.g.
using LoopVectorization
function test_avm!(x::Vector, y::Vector, beta)
@avx for n=1:length(x)
x[n] += conj(beta)*y[n]
end
end
# set up
T = ComplexF32
N = 1024
x = zeros(T,N)
y = rand(T,N)
β = rand(T)
# end set up
test_avm!(x,y,β)
gives me the following error
KeyError: key Complex{Float32} not found
Stacktrace:
[1] getindex at ./dict.jl:477 [inlined]
[2] llvmtype(::Type) at /home/moeddel/.julia/packages/VectorizationBase/hBLD0/src/vectorizable.jl:21
[3] #s24#123 at /home/moeddel/.julia/packages/SIMDPirates/aKvPC/src/memory.jl:94 [inlined]
[4] #s24#123(::Any, ::Any, ::Any, ::Any, ::Any, ::Type, ::Any, ::Type, ::Any) at ./none:0
[5] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any,N} where N) at ./boot.jl:524
[6] vload at /home/moeddel/.julia/packages/SIMDPirates/aKvPC/src/memory.jl:218 [inlined]
[7] macro expansion at ./gcutils.jl:91 [inlined]
[8] test_avm!(::Array{Complex{Float32},1}, ::Array{Complex{Float32},1}, ::Complex{Float32}) at ./In[26]:4
First of all: great package!
However, I get an error when I loop over Int32 or Int16 indices.
Here is a MWE:
function test()
acc = 0
@avx for i = Int32(1):Int32(100)
acc = acc + 1
end
acc
end
ERROR: LoadError: UndefVarError: N not defined
Stacktrace:
[1] register_single_loop!(::LoopVectorization.LoopSet, ::Expr) at /home/zorn/.julia/packages/LoopVectorization/SSpaP/src/graphs.jl:373
[2] register_loop!(::LoopVectorization.LoopSet, ::Expr) at /home/zorn/.julia/packages/LoopVectorization/SSpaP/src/graphs.jl:401
[3] add_loop!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /home/zorn/.julia/packages/LoopVectorization/SSpaP/src/graphs.jl:405 (repeats 2 times)
[4] copyto! at /home/zorn/.julia/packages/LoopVectorization/SSpaP/src/constructors.jl:6 [inlined]
[5] LoopVectorization.LoopSet(::Expr) at /home/zorn/.julia/packages/LoopVectorization/SSpaP/src/constructors.jl:46
[6] @avx(::LineNumberNode, ::Module, ::Any) at /home/zorn/.julia/packages/LoopVectorization/SSpaP/src/constructors.jl:95
That's LoopVectorization v0.4.0, but it also threw an error in v0.3.11
MWE
julia> using LoopVectorization
julia> x = rand(4);
julia> @avx tanh.(x)
ERROR: MethodError: no method matching vifelse(::UInt8, ::SLEEFPirates.Double{VectorizationBase.SVec{4,Float64}}, ::SLEEFPirates.Double{VectorizationBase.SVec{4,Float64}})
Closest candidates are:
vifelse(::Bool, ::Any, ::Any) at /home/scheme/.julia/packages/SIMDPirates/h2DSE/src/conditionals.jl:96
vifelse(::Unsigned, ::Tuple{Vararg{VecElement{T},W}}, ::Union{Int64, T}) where {W, T} at /home/scheme/.julia/packages/SIMDPirates/h2DSE/src/conditionals.jl:166
vifelse(::Unsigned, ::Union{Tuple{Vararg{VecElement{T},W}}, VectorizationBase.AbstractStructVec{W,T}}, ::Union{Int64, T}) where {W, T} at /home/scheme/.julia/packages/SIMDPirates/h2DSE/src/conditionals.jl:167
...
Stacktrace:
[1] expk2 at /home/scheme/.julia/dev/SLEEFPirates/src/priv.jl:284 [inlined]
[2] tanh(::VectorizationBase.SVec{4,Float64}) at /home/scheme/.julia/dev/SLEEFPirates/src/hyp.jl:56
[3] macro expansion at /home/scheme/.julia/dev/SLEEFPirates/src/SLEEFPirates.jl:147 [inlined]
[4] macro expansion at /home/scheme/.julia/packages/LoopVectorization/HwB3A/src/broadcast.jl:205 [inlined]
[5] vmaterialize! at /home/scheme/.julia/packages/LoopVectorization/HwB3A/src/broadcast.jl:205 [inlined]
[6] vmaterialize(::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1},Nothing,typeof(tanh),Tuple{Array{Float64,1}}}, ::Val{:Main}) at /home/scheme/.julia/packages/LoopVectorization/HwB3A/src/broadcast.jl:252
[7] top-level scope at REPL[3]:1
Pkg.add("LoopVectorization") on julia 1.3.1 and LoopVectorization 0.3.5 gives (from the log):
ERROR: LoadError: "Architecture not yet supported. Please file an issue!\nYou can fix immediately by supplying the correct number of cores\nand the cache sizes (L1 data per core, L2 per core, L3).\nIf you do, please file an issue with the information or a PR\nadding it to the build script, so your architecture will be\nsupported for all future releases."
Stacktrace:
[1] top-level scope at /home/user/.julia/packages/VectorizationBase/cKn6V/deps/build.jl:15
[2] include at ./boot.jl:328 [inlined]
[3] include_relative(::Module, ::String) at ./loading.jl:1105
[4] include(::Module, ::String) at ./Base.jl:31
[5] include(::String) at ./client.jl:424
[6] top-level scope at none:5
in expression starting at /home/user/.julia/packages/VectorizationBase/cKn6V/deps/build.jl:7
The CPU is the AMD 8350 /proc/cpuinfo gives:
processor : 0
vendor_id : AuthenticAMD
cpu family : 21
model : 2
model name : AMD FX(tm)-8350 Eight-Core Processor
stepping : 0
microcode : 0x6000852
cpu MHz : 1400.000
cache size : 2048 KB
physical id : 0
siblings : 8
core id : 0
cpu cores : 4
apicid : 16
initial apicid : 0
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 popcnt aes xsave avx f16c lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs xop skinit wdt fma4 tce nodeid_msr tbm topoext perfctr_core perfctr_nb cpb hw_pstate ssbd ibpb vmmcall bmi1 arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold
bugs : fxsave_leak sysret_ss_attrs spectre_v1 spectre_v2 spec_store_bypass
bogomips : 8016.25
TLB size : 1536 4K pages
clflush size : 64
cache_alignment : 64
address sizes : 48 bits physical, 48 bits virtual
power management: ts ttp tm 100mhzsteps hwpstate cpb eff_freq_ro
It seems that the project file requires Julia v1.3. Could that bound be lowered, or are you using certain new features?
function f1a()
A = zeros(8)
@avx for i in 1:size(A)[1]
A[i] = 1.0
end
return A
end
gives
ERROR: BoundsError: attempt to access ()
at index [1]
Stacktrace:
[1] getindex(::Tuple, ::Int64) at .\tuple.jl:24
[2] sizeofeltypes(::Tuple{}, ::Int64) at C:\Users\jr\.julia\packages\LoopVectorization\9Qgzg\src\reconstruct_loopset.jl:204
[3] avx_body(::Tuple{Int8,Int8,Int8}, ::Array{LoopVectorization.Instruction,1}, ::Array{LoopVectorization.OperationStruct,1}, ::Array{LoopVectorization.ArrayRefStruct,1}, ::Core.SimpleVector, ::Core.SimpleVector, ::Tuple{}) at C:\Users\jr\.julia\packages\LoopVectorization\9Qgzg\src\reconstruct_loopset.jl:215
[4] #s41#61 at C:\Users\jr\.julia\packages\LoopVectorization\9Qgzg\src\reconstruct_loopset.jl:236 [inlined]
[5] #s41#61(::Any, ::Any, ::Any, ::Any, ::Any, ::Any, ::Type, ::Type, ::Type, ::Any, ::Any, ::Any) at .\none:0
[6] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any,N} where N) at .\boot.jl:524
[7] f1a() at .\REPL[10]:3
[8] top-level scope at REPL[11]:1
I do not know why I would write code like this, but the main problem is that the error message made it difficult to understand that assigning a constant is not supported.
Version of the package 0.4.0. Error:
LoadError: UndefVarError: z not defined
Stacktrace:
[1] macro expansion at C:\Users\PKrysl\.julia\packages\LoopVectorization\SSpaP\src\reconstruct_loopset.jl:214 [inlined]
[2] _avx_!(::Val{(1, 0, 0)}, ::Type{Tuple{:constsym,:z,LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01),:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000032, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x02),:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000031, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x02, 0x03),:zero,Symbol("##temp#471"),LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000321, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x04),:LoopVectorization,:vfmadd_fast,LoopVectorization.OperationStruct(0x0000000000000321, 0x0000000000000003, 0x0000000000000000, 0x0000000000020304, LoopVectorization.compute, 0x00, 0x04),:LoopVectorization,:reduced_add,LoopVectorization.OperationStruct(0x0000000000000021, 0x0000000000000003, 0x0000000000000000, 0x0000000000000501, LoopVectorization.compute, 0x00, 0x01),:LoopVectorization,:setindex!,LoopVectorization.OperationStruct(0x0000000000000021, 0x0000000000000000, 0x0000000000000000, 0x0000000000000006, LoopVectorization.memstore, 0x03, 0x05)}}, ::Type{Tuple{LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000302),LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000301),LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000201)}}, ::Type{Tuple{0,Tuple{},Tuple{},Tuple{},Tuple{},Tuple{4},Tuple{}}}, ::Tuple{VectorizationBase.StaticLowerUnitRange{0},VectorizationBase.StaticLowerUnitRange{0},VectorizationBase.StaticLowerUnitRange{0}}, ::VectorizationBase.PackedStridedPointer{Float64,1}, ::VectorizationBase.PackedStridedPointer{Float64,1}, ::VectorizationBase.PackedStridedPointer{Float64,1},
::Nothing) at C:\Users\PKrysl\.julia\packages\LoopVectorization\SSpaP\src\reconstruct_loopset.jl:214
[3] macro expansion at .\gcutils.jl:96 [inlined]
[4] mulCAtB!(::Array{Float64,2}, ::Array{Float64,2}, ::Array{Float64,2}) at C:\Users\PKrysl\Documents\work\FinEtoolsTestAll.jl\tests\FinEtools.jl\src\MatrixUtilityModule.jl:380
Occurs in:
function mulCAtB!(C, A, B)
M, N = size(C); K = size(B,1)
@assert size(C, 1) == size(A, 2)
@assert size(C, 2) == size(B, 2)
@assert size(A, 1) == size(B, 1)
# When the @avx macro is available, this code is faster:
z = zero(eltype(C))
@avx for n in 1:size(C,2), m in 1:size(C,1)
Cmn = z
for k in 1:size(A,1)
Cmn += A[k,m] * B[k,n]
end
C[m,n] = Cmn
end
return C
end
Thanks for all your work on this!
It would be useful for some of my code to have a negative step size in the for loop. I'd be happy if either toy2!
or toy3!
could be made to work.
using LoopVectorization
function toy1!(B,X,κ)
n = size(B,1)
@inbounds for i = κ-1:-1:1
for nx=1:n
B[nx,κ] -=X[i]*B[nx,i]
end
end
end
# the following function never exits lowering (?), or is doing
# something triggered by the @avx macro
function toy2!(B,X,κ)
n = size(B,1)
@avx for i = κ-1:-1:1
for nx=1:n
B[nx,κ] -=X[i]*B[nx,i]
end
end
end
# I'm willing to switch to this form of the same function
# which doesn't have the negative step, but it fails lowering with
# ERROR: LoadError: InexactError: trunc(Int64, Inf)
function toy3!(B,X,κ)
n = size(B,1)
@avx for ii = 1:κ-1
i = κ-ii
for nx=1:n
B[nx,κ] -=X[i]*B[nx,i]
end
end
end
N = 4
X = randn(N);
B1 = randn(N, N);
B2 = copy(B1);
B3 = copy(B1);
toy1!(B1,X,3)
toy2!(B2,X,3) # hopefully this would be faster, if we could run it :-)
toy3!(B3,X,3) # hopefully this would be faster, if we could run it :-)
How would I multiply 2 16-bit integers and accumulate to 32-bit integer? Something like _mm_madd_epi16
So far the result of the multiplication seems to be done in 16-bit:
a = Int16.([1 << 8, 1 << 9, 1 << 10, 1 << 10])
b = Int16.([1 << 7, 1 << 8, 1 << 11, 1 << 10])
function fma_avx(a, b)
res = Int32(0)
@avx for i = 1:4
res = res + a[i] * b[i]
end
res
end
julia> fma_avx(a, b)
-32768
julia> fma_avx(Int32.(a), Int32.(b))
3309568
Using LoopVectorization.jl#master, I get the following error. I'm not certain how to describe the error, so I put part of the error message in the issue name. I tried a few small variants, with the same result. Is it somehow obvious that I should not put B[i,j]
inside the k
loop? It works fine without the @avx
macro.
julia> using LoopVectorization
julia> function myGramFail!(A, B)
zeroB = zero(eltype(B))
@avx for i ∈ 1:size(A,1), j ∈ 1:size(A,2)
B[i,j] = zeroB
for k ∈ 1:size(A,2)
B[i,j] += A[i,k] * A[k,j]
end
end
end
ERROR: LoadError: KeyError: key :zeroB not found
Stacktrace:
[1] getindex at ./dict.jl:477 [inlined]
[2] getop at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:161 [inlined]
[3] add_store!(::LoopVectorization.LoopSet, ::Symbol, ::LoopVectorization.ArrayReference, ::Int64) at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:505
[4] add_store_ref! at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:512 [inlined]
[5] push!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:586
[6] add_block!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:228
[7] add_loop!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:285
[8] add_loop! at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:282 [inlined]
[9] copyto! at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/constructors.jl:6 [inlined]
[10] LoopVectorization.LoopSet(::Expr) at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/constructors.jl:46
[11] @avx(::LineNumberNode, ::Module, ::Any) at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/constructors.jl:86
in expression starting at REPL[2]:3
using LoopVectorization
function f(a)
s = 0.0
@avx for i in eachindex(a)
s += a[i]
end
s
end
julia> f([1,2,3])
6.0
julia> f(1:3)
StackOverflowError:
Stacktrace:
[1] stridedpointer(::UnitRange{Int64}) at /Users/mason/.julia/packages/VectorizationBase/yJngh/src/vectorizable.jl:389 (repeats 80000 times)
I think this should be supportable. One (dumb) way to make this work would just be
VectorizationBase.stridedpointer(r::AbstractRange) = VectorizationBase.stridedpointer(collect(r))
but I wonder if there's a way we can do it that doesn't require explicitly materializing the range. Maybe this issue should be moved to VectorizationBase, though.
Hi @chriselrod,
I have been experimenting with the structs and came across something, which seems to be a bug. If I consider the following example
using StructArrays, Test
function test_native(x, A, k::Int)
out = zero(T)
for i ∈ 1:length(x)
out += A[i,k]*x[i]
end
return out
end
function test_struct(xre::Vector{T}, xim::Vector{T}, Are::Matrix{T}, Aim::Matrix{T}, k::Int) where {T<:AbstractFloat}
zre = zero(T)
zim = zero(T)
for i ∈ 1:length(xre)
zre += xre[i]*Are[i,k] - xim[i]*Aim[i,k]
zim += xre[i]*Aim[i,k] + xim[i]*Are[i,k]
end
return Complex{T}(zre,zim)
end
function test_struct(x::StructVector{Complex{T}}, A::StructArray{Complex{T},2}, k::Int) where {T<:AbstractFloat}
return test_struct(x.re, x.im, A.re , A.im, k)
end
# set up
T = ComplexF32
M = rand(1:1024)
N = rand(1:1024)
A = StructArray(rand(T,M,N))
b = StructVector(rand(T,M))
k = rand(1:N)
# end set up
@test test_native(b, A, k) ≈ test_struct(b, A, k)
everything is fine. However, if I insert the @avx
macro
using LoopVectorization
function test_avx(xre::Vector{T}, xim::Vector{T}, Are::Matrix{T}, Aim::Matrix{T}, k::Int) where {T<:AbstractFloat}
zre = zero(T)
zim = zero(T)
@avx for i ∈ 1:length(xre)
zre += xre[i]*Are[i,k] - xim[i]*Aim[i,k]
zim += xre[i]*Aim[i,k] + xim[i]*Are[i,k]
end
return Complex{T}(zre,zim)
end
function test_avx(x::StructVector{Complex{T}}, A::StructArray{Complex{T},2}, k::Int) where {T<:AbstractFloat}
return test_avx(x.re, x.im, A.re , A.im, k)
end
@test test_native(b, A, k) ≈ test_avx(b, A, k)
The test fails
Test Failed at In[2]:17
Expression: test_native(b, A, k) ≈ test_avx(b, A, k)
Evaluated: 5.7645955f0 + 234.34486f0im ≈ -5.764599f0 + 234.34473f0im
since the real part has wrong sign. Do you have any ideas why?
If I increment an index of one of the variables, I will get the error ##op#5321 not defined
MWE:
function complex_mul_with_index_offset(a_re, a_im, b_re, b_im, c_re, c_im)
@avx for i = 1:length(a_re) - 1
c_re[i] = b_re[i] * a_re[i + 1] - b_im[i] * a_im[i + 1]
c_im[i] = b_re[i] * a_im[i + 1] + b_im[i] * a_re[i + 1]
end
end
a_re = ones(Int, 64); a_im = copy(a_re); b_re = copy(a_re); b_im = copy(a_re); c_im = copy(a_re); c_re = copy(a_re)
julia> complex_mul_with_index_offset(a_re, a_im, b_re, b_im, c_re, c_im)
ERROR: UndefVarError: ##op#5321 not defined
Stacktrace:
[1] sum_with_index_offset(::Array{Int64,1}, ::Array{Int64,1}, ::Array{Int64,1}, ::Array{Int64,1}, ::Array{Int64,1}, ::Array{Int64,1}) at /home/zorn/.julia/packages/LoopVectorization/2jh99/src/reconstruct_loopset.jl:260
This does work, however:
function complex_mul_with_index_offset1(a_re, a_im, b_re, b_im, c_re, c_im)
@avx for i = 1:length(a_re) - 1
c_re[i] = b_re[i] * a_re[i + 1] - b_im[i] * a_im[i + 1]
end
end
@chriselrod mentioned on Discourse that the fact that @avx
is a macro imposes some codegen difficulties since macros don't see type information, and mentioned generated functions as an alternative. I agree that the best solution would be to gut all the complicated parts out of @avx
and put them into @generated function avx
and then make @avx
just rewrite the input loops into appropriate data structures to be passed to the avx
function.
This would cleanly separate the two distinct functionalities which are currently conflated: syntax transformations and complicated, heavily datatype-dependent code generation. Furthermore, it'd make it much easier to expose an interface for other packages to write methods and make their types compatible with @avx
.
This won't be news to Chris, but I figured it'd be good to have an issue here for the benefit of other readers.
I'd be happy to help with the generated functions and macro rewrite if you're looking for help, though it will take me a while to go through and digest the code. If you could point out places where the syntax transformations occur, that would accelerate the process.
Hi @chriselrod, I've upgraded from @vectorize
to @avx
and am excited to use an officially registered package.
Looks like a change (maybe in SIMDPirates?) broke something for me -- the code below works unvectorized, but is broken when I add the @avx
. Seems like there are 2 issues. First, it looks like if statements in an @avx
loop don't work. Second, it appears that something goes wrong when I iterate over a matrix or a view, not a vector. @code_warntype
suggests there are problems with type inference. See unvectorized, broken vectorized, and fixed vectorized code below.
I'm using julia 1.3.1 and
[[LoopVectorization]]
deps = ["LinearAlgebra", "MacroTools", "Parameters", "SIMDPirates", "SLEEFPirates", "VectorizationBase"]
git-tree-sha1 = "a8d158a4971113443269739f3cc51ae992e95a58"
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
version = "0.3.6"
[[SIMDPirates]]
deps = ["MacroTools", "VectorizationBase"]
git-tree-sha1 = "500294a8b1001bdda2483fc6d675956798ad8764"
uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a"
version = "0.1.6"
[[SLEEFPirates]]
deps = ["SIMDPirates", "VectorizationBase"]
git-tree-sha1 = "4733445246d3d5536c7aee1bffb55ab37b88347b"
uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
version = "0.1.3"
[[VectorizationBase]]
deps = ["CpuId", "LinearAlgebra"]
git-tree-sha1 = "a2576763aa20968ffb5668e2e15d45ae8e364d05"
uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
version = "0.1.9"
using LoopVectorization
using Base: OneTo
using LinearAlgebra: stride1
using Test
add_1_dim(x::AbstractArray) = reshape(x, size(x)..., 1)
check_finite(x::AbstractArray) = all(isfinite.(x)) || throw(error("x not finite!"))
"Given k-dimensional array `x` where `n=size(x,k)`, compute multinomial logistic Pr(i ∈ 1:n | x[:, ..., :, k] )"
function softmax3!(q::AA, lse::A, tmpmax::A, x::AA, maxk=size(q, ndims(q)) ) where {T<:Real, A<:Array{T}, AA<:AbstractArray{T}}
ndims(q) == 1+ndims(lse) || throw(DimensionMismatch())
xsizes = size(x)
xsizes == size(q) || throw(DimensionMismatch("size(x) = $(size(x)) but size(q) = $(size(q))"))
nk = last(xsizes)
for i = OneTo(ndims(lse))
size(q,i) == size(lse,i) == size(tmpmax,i) || throw(DimensionMismatch("size(x) = $(size(x)), size(lse) = $(size(lse)), and size(tmpmax) = $(size(tmpmax))"))
end
0 < maxk <= nk || throw(DomainError(maxk))
1 == stride1(q) == stride1(x) || throw(error("Arrays not strided"))
isempty(x) && throw(error("x empty"))
check_finite(x)
maximum!(add_1_dim(tmpmax), x)
fill!(lse, zero(T))
xx = reshape(x, :, nk)
qq = reshape(q, :, nk)
for k in OneTo(nk)
for i in eachindex(lse)
tmp = exp(xx[i,k] - tmpmax[i])
lse[i] += tmp
k <= maxk && (qq[i,k] = tmp)
end
end
qq[:,OneTo(maxk)] ./= vec(lse)
end
function softmax3_avx_broken!(q::AA, lse::A, tmpmax::A, x::AA, maxk=size(q, ndims(q)) ) where {T<:Real, A<:Array{T}, AA<:AbstractArray{T}}
ndims(q) == 1+ndims(lse) || throw(DimensionMismatch())
xsizes = size(x)
xsizes == size(q) || throw(DimensionMismatch("size(x) = $(size(x)) but size(q) = $(size(q))"))
nk = last(xsizes)
for i = OneTo(ndims(lse))
size(q,i) == size(lse,i) == size(tmpmax,i) || throw(DimensionMismatch("size(x) = $(size(x)), size(lse) = $(size(lse)), and size(tmpmax) = $(size(tmpmax))"))
end
0 < maxk <= nk || throw(DomainError(maxk))
1 == stride1(q) == stride1(x) || throw(error("Arrays not strided"))
isempty(x) && throw(error("x empty"))
check_finite(x)
maximum!(add_1_dim(tmpmax), x)
fill!(lse, zero(T))
xx = reshape(x, :, nk)
qq = reshape(q, :, nk)
for k in OneTo(maxk)
@avx for i in eachindex(lse)
tmp = exp(xx[i,k] - tmpmax[i])
lse[i] += tmp
# FIXME - would prefer to replace 2nd loop w/ if stmt
# k <= maxk && (qq[i,k] = tmp)
qq[i,k] = tmp
end
end
for k in maxk+1:nk
@avx for i in eachindex(lse)
tmp = exp(xx[i,k] - tmpmax[i])
lse[i] += tmp
end
end
qq[:,OneTo(maxk)] ./= vec(lse)
end
function softmax3_avx_fixed!(q::AA, lse::A, tmpmax::A, x::AA, maxk=size(q, ndims(q)) ) where {T<:Real, A<:Array{T}, AA<:AbstractArray{T}}
ndims(q) == 1+ndims(lse) || throw(DimensionMismatch())
xsizes = size(x)
xsizes == size(q) || throw(DimensionMismatch("size(x) = $(size(x)) but size(q) = $(size(q))"))
nk = last(xsizes)
for i = OneTo(ndims(lse))
size(q,i) == size(lse,i) == size(tmpmax,i) || throw(DimensionMismatch("size(x) = $(size(x)), size(lse) = $(size(lse)), and size(tmpmax) = $(size(tmpmax))"))
end
0 < maxk <= nk || throw(DomainError(maxk))
1 == stride1(q) == stride1(x) || throw(error("Arrays not strided"))
isempty(x) && throw(error("x empty"))
check_finite(x)
maximum!(add_1_dim(tmpmax), x)
fill!(lse, zero(T))
xx = reshape(x, :, nk)
qq = reshape(q, :, nk)
tmpmaxvec = vec(tmpmax) # Always required??
lsevec = vec(lse) # Always required??
for k in OneTo(maxk)
qk = view(qq, :, k) # required if using a View
xk = view(xx, :, k) # required if using a View
@avx for i in eachindex(lsevec)
tmp = exp(xk[i] - tmpmaxvec[i])
lsevec[i] += tmp
qk[i] = tmp
end
end
for k in maxk+1:nk
qk = view(qq, :, k)
xk = view(xx, :, k)
@avx for i in eachindex(lsevec)
tmp = exp(xk[i] - tmpmaxvec[i])
lsevec[i] += tmp
end
end
qq[:,OneTo(maxk)] ./= vec(lse)
end
ni, nj, nk = (100, 100, 10)
x = rand(ni, nj, nk)
q = similar(x)
tmpmax = zeros(ni,nj)
lse = similar(tmpmax)
for f! in (softmax3!, softmax3_avx_fixed!, softmax3_avx_broken!)
@testset "$(f!) with arrays" begin
fill!(q,0)
fill!(lse,0)
@show @code_warntype f!(q, lse, tmpmax, x, 1)
f!(q, lse, tmpmax, x, 1)
@test all(sum(q; dims=3) .<= 1)
fill!(q,0)
fill!(lse,0)
f!(q, lse, tmpmax, x)
@test sum(q; dims=3) ≈ ones(ni,nj)
end
@testset "$(f!) with views" begin
nkm1 = nk-1
@views qvw = q[:,:,1:nkm1]
@views xvw = x[:,:,1:nkm1]
fill!(q,0)
fill!(lse,0)
@show @code_warntype f!(qvw, lse, tmpmax, xvw, 1)
f!(qvw, lse, tmpmax, xvw, 1)
@test all(sum(qvw; dims=3) .<= 1)
fill!(q,0)
fill!(lse,0)
f!(qvw, lse, tmpmax, xvw)
@test sum(qvw; dims=3) ≈ ones(ni,nj)
end
end
breakage is of 2 forms
The vector/matrix issue:
#= D:\libraries\julia\dev\ShaleDrillingLikelihood\example\softmax3test.jl:136 =# @code_warntype(f!(q, lse, tmpmax, x, 1)) = nothing
softmax3_avx_broken! with arrays: Error During Test at D:\libraries\julia\dev\ShaleDrillingLikelihood\example\softmax3test.jl:133
Got exception outside of a @test
AssertionError: Ni == N + 1
Stacktrace:
[1] packed_strided_ptr_index(::Core.SimpleVector, ::Int64, ::Type{Float64}) at D:\libraries\julia\packages\SIMDPirates\nxDqU\src\memory.jl:996
[2] #s28#117(::Any, ::Any, ::Any, ::Any, ::Any, ::Any) at D:\libraries\julia\packages\SIMDPirates\nxDqU\src\memory.jl:1020
[3] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any,N} where N) at .\boot.jl:524
[4] macro expansion at .\gcutils.jl:91 [inlined]
[5] softmax3_avx_broken!(::Array{Float64,3}, ::Array{Float64,2}, ::Array{Float64,2}, ::Array{Float64,3}, ::Int64) at D:\libraries\julia\dev\ShaleDrillingLikelihood\example\softmax3test.jl:62
[6] top-level scope at D:\libraries\julia\dev\ShaleDrillingLikelihood\example\softmax3test.jl:137
[7] top-level scope at D:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.3\Test\src\Test.jl:1107
[8] top-level scope at D:\libraries\julia\dev\ShaleDrillingLikelihood\example\softmax3test.jl:134
[9] include_string(::Module, ::String, ::String) at .\loading.jl:1075
[10] include_string(::Module, ::String, ::String, ::Int64) at D:\libraries\julia\packages\CodeTools\sf1Tz\src\eval.jl:30
[11] (::Atom.var"#127#132"{String,Int64,String,Bool})() at D:\libraries\julia\packages\Atom\X8fAI\src\eval.jl:94
[12] withpath(::Atom.var"#127#132"{String,Int64,String,Bool}, ::String) at D:\libraries\julia\packages\CodeTools\sf1Tz\src\utils.jl:30
[13] withpath(::Function, ::String) at D:\libraries\julia\packages\Atom\X8fAI\src\eval.jl:47
[14] #126 at D:\libraries\julia\packages\Atom\X8fAI\src\eval.jl:93 [inlined]
[15] with_logstate(::Atom.var"#126#131"{String,Int64,String,Bool}, ::Base.CoreLogging.LogState) at .\logging.jl:395
[16] with_logger at .\logging.jl:491 [inlined]
[17] #125 at D:\libraries\julia\packages\Atom\X8fAI\src\eval.jl:92 [inlined]
[18] hideprompt(::Atom.var"#125#130"{String,Int64,String,Bool}) at D:\libraries\julia\packages\Atom\X8fAI\src\repl.jl:85
[19] macro expansion at D:\libraries\julia\packages\Atom\X8fAI\src\eval.jl:91 [inlined]
[20] macro expansion at D:\libraries\julia\packages\Media\ItEPc\src\dynamic.jl:24 [inlined]
[21] (::Atom.var"#124#129")(::Dict{String,Any}) at D:\libraries\julia\packages\Atom\X8fAI\src\eval.jl:86
[22] handlemsg(::Dict{String,Any}, ::Dict{String,Any}) at D:\libraries\julia\packages\Atom\X8fAI\src\comm.jl:164
[23] (::Atom.var"#19#21"{Array{Any,1}})() at .\task.jl:333
and
LoadError: "Don't know how to handle expression:\nk <= maxk && (qq[i, k] = tmp)"
in expression starting at D:\libraries\julia\dev\ShaleDrillingLikelihood\example\softmax3test.jl:62
push!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at graphs.jl:821
add_block!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at graphs.jl:280
add_loop!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at graphs.jl:347
add_loop! at graphs.jl:344 [inlined]
copyto! at constructors.jl:6 [inlined]
LoopVectorization.LoopSet(::Expr) at constructors.jl:46
@avx(::LineNumberNode, ::Module, ::Any) at constructors.jl:86
Thanks a lot for your package, it seems very nice!
I am writing to you because when I want to study a package, I like to read the documentation. The links in the README.md to the docs are not working.
Also, it would be nice if part of the examples and documentation from the Discourse forum were not only in the documentation, but a short version were also added to the README.md.
With a vector a=rand(1000)
, this works:
julia> @avx @. sin(a) + sqrt(a);
but this doesn't:
julia> @avx @. 3*a + sin(a) + sqrt(a);
ERROR: MethodError: no method matching add_broadcast!(::LoopVectorization.LoopSet, ::Symbol, ::Symbol, ::Array{Symbol,1}, ::Type{Int64})
Hi @chriselrod,
something seem to be broken in version v0.3.10. i get
"Don't know how to handle expression:\nzre += xre[i] * Are[i, k] - xim[i] * Aim[i, k]"
Stacktrace:
[1] push!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/graphs.jl:539
[2] add_block!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/graphs.jl:310
[3] add_loop!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/graphs.jl:401
[4] add_loop! at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/graphs.jl:398 [inlined]
[5] copyto! at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/constructors.jl:6 [inlined]
[6] LoopVectorization.LoopSet(::Expr) at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/constructors.jl:45
[7] @avx(::LineNumberNode, ::Module, ::Any) at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/constructors.jl:93
when executing
using LoopVectorization
function test_avx(xre::Vector{T}, xim::Vector{T}, Are::Matrix{T}, Aim::Matrix{T}, k::Int) where {T<:AbstractFloat}
zre = zero(T)
zim = zero(T)
@avx for i ∈ 1:length(xre)
zre += xre[i]*Are[i,k] - xim[i]*Aim[i,k]
zim += xre[i]*Aim[i,k] + xim[i]*Are[i,k]
end
return Complex{T}(zre,zim)
end
function test_avx(x::StructVector{Complex{T}}, A::StructArray{Complex{T},2}, k::Int) where {T<:AbstractFloat}
return test_avx(x.re, x.im, A.re , A.im, k)
end
test_avx(b, A, k)
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.