JuliaSIMD / LoopVectorization.jl
Macro(s) for vectorizing loops.
License: MIT License
Possibly related to #15, following are a couple of compilation errors.
_
_ _ _(_)_ | Documentation: https://docs.julialang.org
(_) | (_) (_) |
_ _ _| |_ __ _ | Type "?" for help, "]?" for Pkg help.
| | | | | | |/ _` | |
| | |_| | | | (_| | | Version 1.3.1 (2019-12-30)
_/ |\__'_|_|_|\__'_| | Official https://julialang.org/ release
|__/ |
julia> using LoopVectorization
julia> using BenchmarkTools
julia> function toy1!(G, B,κ)
d = size(G,1)
@inbounds for d1=1:d
G[d1,κ] = B[1,d1]*B[1,κ]
for d2=2:d
G[d1,κ] += B[d2,d1]*B[d2,κ]
end
end
end
toy1! (generic function with 1 method)
julia> function toy2!(G, B,κ)
d = size(G,1)
@avx for d1=1:d
G[d1,κ] = B[1,d1]*B[1,κ]
for d2=2:d
G[d1,κ] += B[d2,d1]*B[d2,κ]
end
end
end
toy2! (generic function with 1 method)
julia> function toy3!(G, B,κ)
d = size(G,1)
z = zero(eltype(G))
@avx for d1=1:d
G[d1,κ] = z
for d2=1:d
G[d1,κ] += B[d2,d1]*B[d2,κ]
end
end
end
toy3! (generic function with 1 method)
julia> N = 8
8
julia> B = randn(N, N);
julia> G1 = zeros(N, N).*NaN;
julia> G2 = similar(G1);
julia> G3 = similar(G1);
julia> toy1!(G1,B,1)
julia> toy2!(G2,B,1)
ERROR: UndefVarError: ####temporary#425_ not defined
Stacktrace:
[1] toy2!(::Array{Float64,2}, ::Array{Float64,2}, ::Int64) at ./gcutils.jl:91
[2] top-level scope at REPL[12]:1
julia> toy3!(G3,B,1)
ERROR: UndefVarError: ##G_0 not defined
Stacktrace:
[1] macro expansion at ./gcutils.jl:91 [inlined]
[2] toy3!(::Array{Float64,2}, ::Array{Float64,2}, ::Int64) at ./REPL[5]:4
[3] top-level scope at REPL[13]:1
Minor modifications of the mygemm
functions from your README seem to work great for inputs that are all Float64 or all Int64
# Reference triple-loop matrix multiply C = A * B (no @avx), as quoted
# from the package README. Overwrites `C` in place; the accumulator type
# follows eltype(C). Returns nothing.
function mygemm!(C, A, B)
    @inbounds for i ∈ 1:size(A, 1), j ∈ 1:size(B, 2)
        acc = zero(eltype(C))
        # @fastmath on the inner reduction, as in the original snippet.
        @fastmath for k ∈ 1:size(A, 2)
            acc += A[i, k] * B[k, j]
        end
        C[i, j] = acc
    end
end
# @avx version of the README gemm: identical loop nest with the macro added.
# Reported failure: with A, B ::Matrix{Int64} and C ::Matrix{Float64} this
# throws MethodError: no method matching +(::SVec{4,Int64}, ::SVec{4,Float64}),
# i.e. mixed-eltype accumulation is not promoted (see transcript below).
function mygemmavx!(C, A, B)
@avx for i ∈ 1:size(A,1), j ∈ 1:size(B,2)
Cᵢⱼ = zero(eltype(C))
for k ∈ 1:size(A,2)
Cᵢⱼ += A[i,k] * B[k,j]
end
C[i,j] = Cᵢⱼ
end
end
If however I make A & B Int64 and C a Float64, then the macro seems to fail
N = 20;
A = rand(0:1000,N, N);
B = rand(0:1000,N, N);
C1 = Matrix{Float64}(undef, N, N);
C2 = Matrix{Float64}(undef, N, N);
@btime mygemm!($C1, $A, $B)
@btime mygemmavx!($C2, $A, $B)
The error produced is
julia> @btime mygemmavx!($C2, $A, $B)
ERROR: MethodError: no method matching +(::VectorizationBase.SVec{4,Int64}, ::VectorizationBase.SVec{4,Float64})
...
My understanding from your comment on Discourse is that this should work.
FYI I'm using Julia 1.3.1 on MacOS.
Thank you a lot for fast responses. Here is one more issue. With latest master
N,M = 4,6
A = zeros(N,M)
B = reshape(cumsum(ones(3N)),N,:)
# ok
@avx for i=1:size(A,1), j=1:size(B,2)
A[i,j] = exp(B[i,j])
end
gives
4×6 Array{Float64,2}:
2.71828 148.413 8103.08 0.0 0.0 0.0
7.38906 403.429 22026.5 0.0 0.0 0.0
20.0855 1096.63 59874.1 0.0 0.0 0.0
54.5982 2980.96 1.62755e5 0.0 0.0 0.0
while
N,M = 4,6
A = zeros(N,M)
B = reshape(cumsum(ones(3N)),N,:)
# not ok
@avx for i=1:size(A,1), j=1:size(B,2)
A[i,j+1] = exp(B[i,j])
end
gives
4×6 Array{Float64,2}:
0.0 2.0 3.0 4.0 0.0 0.0
0.0 2.0 3.0 4.0 0.0 0.0
0.0 2.0 3.0 4.0 0.0 0.0
0.0 2.0 3.0 4.0 0.0 0.0
Could cospi
and sinpi
function be supported within @avx
?
# Baseline (no @avx): fill a length-4 vector with cospi at integer arguments.
# cumsum(ones(4)) == [1.0, 2.0, 3.0, 4.0], so the result alternates -1, 1, -1, 1.
function f1()
    z = cumsum(ones(4))
    A = zeros(4)
    for i in eachindex(A)
        A[i] = cospi(z[i])
    end
    return A
end
# Same loop as f1 above with @avx added; reported to fail with
# MethodError: no method matching cospi(::SVec{4,Float64}) — the report asks
# for cospi/sinpi support inside @avx.
function f1a()
z = cumsum(ones(4))
A = zeros(4)
@avx for i in 1:size(A)[1]
A[i] = cospi(z[i])
end
return A
end
gives
julia> f1()
4-element Array{Float64,1}:
-1.0
1.0
-1.0
1.0
julia> f1a()
ERROR: MethodError: no method matching cospi(::VectorizationBase.SVec{4,Float64})
Closest candidates are:
cospi(::T) where T<:AbstractFloat at special/trig.jl:815
cospi(::Integer) at special/trig.jl:864
cospi(::T) where T<:Union{Integer, Rational} at special/trig.jl:841
...
Stacktrace:
[1] macro expansion at C:\Users\jr\.julia\packages\LoopVectorization\9Qgzg\src\reconstruct_loopset.jl:232 [inlined]
[2] _avx_! at C:\Users\jr\.julia\packages\LoopVectorization\9Qgzg\src\reconstruct_loopset.jl:232 [inlined]
[3] macro expansion at .\gcutils.jl:91 [inlined]
[4] f1a() at .\REPL[35]:4
[5] top-level scope at REPL[37]:1
Sometimes sinpi
can be faster, I wonder if this applies to @avx
using BenchmarkTools
a=10e6*rand(10_000_000);
b=π*a;
@btime sinpi.($a)
@btime sin.($b);
gives
167.324 ms (2 allocations: 76.29 MiB)
322.987 ms (2 allocations: 76.29 MiB)
I get the following error when the loop index is multiplied with another variable:
results = Vector{Float64}(undef, 2500)
# MWE: the loop index `i` is multiplied by a captured scalar inside @avx;
# reported to throw `UndefVarError: i not defined` (transcript below).
function calc_sins!(res)
code_phase_delta = 0.01
@avx for i = 1:2500
res[i] = sin(i * code_phase_delta)
end
end
julia> calc_sins!(results)
ERROR: UndefVarError: i not defined
Stacktrace:
[1] macro expansion at ./gcutils.jl:91 [inlined]
[2] calc_sins!(::Array{Float64,1}) at ./REPL[4]:3
[3] top-level scope at REPL[5]:1
[4] run_backend(::REPL.REPLBackend) at /home/zorn/.julia/packages/Revise/S7mrl/src/Revise.jl:1057
If it is saved in a different variable, the following error occurs when creating the function:
# Variant of calc_sins! that stores i * code_phase_delta in a local first;
# reported to fail at macro-expansion time with
# MethodError: no method matching add_operation!(...) (see below).
function calc_sins1!(res)
code_phase_delta = 0.01
@avx for i = 1:2500
phase = i * code_phase_delta
res[i] = sin(phase)
end
end
ERROR: LoadError: MethodError: no method matching add_operation!(::LoopVectorization.LoopSet, ::Symbol, ::Symbol, ::Int64)
Closest candidates are:
add_operation!(::LoopVectorization.LoopSet, ::Symbol, ::Expr, ::Int64) at /home/zorn/.julia/packages/LoopVectorization/dqtPD/src/graphs.jl:428
add_operation!(::LoopVectorization.LoopSet, ::Symbol, ::Expr, ::LoopVectorization.ArrayReferenceMetaPosition, ::Int64) at /home/zorn/.julia/packages/LoopVectorization/dqtPD/src/graphs.jl:452
add_operation!(::LoopVectorization.LoopSet, ::Symbol, ::Expr, ::LoopVectorization.ArrayReferenceMetaPosition) at /home/zorn/.julia/packages/LoopVectorization/dqtPD/src/graphs.jl:452
I get the following error
ERROR: MethodError: no method matching calc_A(::VectorizationBase.SVec{16,Int16})
where calc_A
is a user defined function. Do I need to implement calc_A(::VectorizationBase.SVec{16,Int16})
myself? Why isn't there an error for the other user defined functions? My first guess was because they are inlined, but calc_A
should be inlined, too.
Here is the code for that:
# Fixed-point polynomial term for the Q7 sine/cosine approximation below.
# All arithmetic stays in Int16 (products wrap, as the caller intends).
# NOTE(review): the ::Int16 annotation is what broke @avx dispatch on
# SVec{16,Int16} in this report; dropping it was the reporter's workaround.
@inline function calc_A(x::Int16)
    p = 15; r = 1; A = Int16(23170); C = Int16(-425)
    n = 7
    a = 7
    xsq = (x * x) >> n               # x² rescaled to Q(n)
    half = one(x) << (p - a - 1)     # rounding bias
    return (A + half + (xsq * C) >> r) >> (p - a)
end
# Companion fixed-point polynomial term to calc_A (Q7 sine/cosine helper).
# Arithmetic stays in Int16. The intermediate parenthesization below matches
# Julia's precedence in the original one-expression form (shifts bind tighter
# than * and +).
@inline function calc_B(x::Int16)
    p = 14; r = 3; B = Int16(-17790); D = Int16(351)
    n = 7
    a = 7
    xsq = (x * x) >> n               # x² rescaled to Q(n)
    half = one(x) << (p - a - 1)     # rounding bias
    t = (B + (xsq * D) >> r) >> n
    return (half + x * t) >> (p - a)
end
# Sign (±1 via the sibling `mysign`) of bit n of `x`, obtained by shifting
# that bit up into the sign position.
@inline function get_first_bit_sign(x)
    n = 7
    shifted = x << (8 * sizeof(x) - n - 1)
    return mysign(shifted)
end
# Sign (±1 via the sibling `mysign`) of bit n-1 of `x`, obtained by shifting
# that bit up into the sign position.
@inline function get_second_bit_sign(x)
    n = 7
    shifted = x << (8 * sizeof(x) - n - 2)
    return mysign(shifted)
end
# Extract the low n bits of `x` and recentre them around zero, giving a
# value in -2^(n-1) : 2^(n-1)-1 (the quarter-period phase).
@inline function get_quarter_angle(x)
    n = 7
    low = x & (one(x) << n - one(x))   # mask off bits below n
    return low - one(x) << (n - 1)     # shift the range down by 2^(n-1)
end
@inline mysign(x) = 2 * signbit(x) - 1
# Fixed-point sine/cosine over a phase vector using the Q7 helpers above
# (get_*_bit_sign, get_quarter_angle, calc_A, calc_B).
# Reported: MethodError: no method matching calc_A(::SVec{16,Int16}) because
# calc_A/calc_B constrain their argument to Int16; the reporter found that
# dropping the ::Int16 annotation makes it work under @avx.
function gen_sincos!(sins, coss, phases)
@avx for i = 1:2500
first_bit_sign = get_first_bit_sign(phases[i])
second_bit_sign = get_second_bit_sign(phases[i])
quarter_angle = get_quarter_angle(phases[i])
A = calc_A(quarter_angle)
B = calc_B(quarter_angle)
coss[i] = second_bit_sign * (first_bit_sign * A + B)
sins[i] = second_bit_sign * (A - first_bit_sign * B)
end
end
sins = Vector{Int16}(undef, 2500)
coss = Vector{Int16}(undef, 2500)
phases = Vector{Int16}(undef, 2500)
gen_sincos!(sins, coss, phases)
EDIT: Alright I found the solution myself: If I write function calc_A(x)
instead of function calc_A(x::Int16)
, it will work. So should I omit the declaration of x
? What if I have a different function for x::Int32
?
This is a powerful and welcome tool. I would like to apply it to more packages. I have run into a limitation on supported types that seems more narrow than necessary.
running e.g. myselfdotavx(xs::T)
where T is a Float64 compatible primitive type (or most other numeric bitstype, including Int64 compatibles) except for T = Union{Float64, Float32, Int64, Int32} results in a KeyError
.
ERROR: KeyError: key XFloat32 not found
Stacktrace:
[1] getindex at .\dict.jl:477 [inlined]
[2] llvmtype(::Type{T} where T) at C:\Users\jas\.julia\packages\VectorizationBase\rPD9W\src\vectorizable.jl:22
[3] #s13#2(::Any, ::Any, ::Any, ::Any) at C:\Users\jas\.julia\packages\VectorizationBase\rPD9W\src\VectorizationBase.jl:50
[4] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any,N} where N) at .\boot.jl:526
[5] vzero at C:\Users\jas\.julia\packages\VectorizationBase\rPD9W\src\VectorizationBase.jl:74 [inlined]
[6] macro expansion at C:\Users\jas\.julia\packages\LoopVectorization\8wcli\src\reconstruct_loopset.jl:248 [inlined]
[7] _avx_! at C:\Users\jas\.julia\packages\LoopVectorization\8wcli\src\reconstruct_loopset.jl:248 [inlined]
[8] myselfdotavx(::Array{XFloat32,1}) at .\REPL[39]:3
[9] ##core#495() at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:297
[10] ##sample#496(::BenchmarkTools.Parameters) at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:303
[11] _run(::BenchmarkTools.Benchmark{Symbol("##benchmark#494")}, ::BenchmarkTools.Parameters; verbose::Bool, pad::String, kwargs::Base.Iterators.Pairs{Symbol,Integer,NTuple{4,Symbol},NamedTuple{(:samples, :evals, :gctrial, :gcsample),Tuple{Int64,Int64,Bool,Bool}}}) at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:331
[12] (::Base.var"#inner#2"{Base.Iterators.Pairs{Symbol,Integer,NTuple{5,Symbol},NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample),Tuple{Bool,Int64,Int64,Bool,Bool}}},typeof(BenchmarkTools._run),Tuple{BenchmarkTools.Benchmark{Symbol("##benchmark#494")},BenchmarkTools.Parameters}})() at .\essentials.jl:715
[13] #invokelatest#1 at .\essentials.jl:716 [inlined]
[14] #run_result#37 at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:32 [inlined]
[15] run(::BenchmarkTools.Benchmark{Symbol("##benchmark#494")}, ::BenchmarkTools.Parameters; kwargs::Base.Iterators.Pairs{Symbol,Integer,NTuple{5,Symbol},NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample),Tuple{Bool,Int64,Int64,Bool,Bool}}}) at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:46
[16] #warmup#42 at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:79 [inlined]
[17] warmup(::BenchmarkTools.Benchmark{Symbol("##benchmark#494")}) at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:79
[18] top-level scope at C:\Users\jas\.julia\packages\BenchmarkTools\7aqwe\src\execution.jl:390
or Using Int128 results in
ERROR: error compiling myselfdotavx: Failed to parse LLVM Assembly:
julia: llvmcall:3:5: error: value doesn't match function result type '[2 x i128]'
ret <2 x i128> zeroinitializer
^
This is just me dreaming about the future, but wouldn't it be neat if one could write stuff like
@vectorize map(f, x)
@vectorize f.(x)
? 😃
With LoopVectorization v0.6.9
N,M = 4,6
A = zeros(N,M)
B = reshape(cumsum(ones(3N)),N,:)
# ok
@avx for i=1:size(A,1), j=1:size(B,2)
A[i,j+1+1] = B[i,j]
end
# not ok
@avx for i=1:size(A,1), j=1:size(B,2)
A[i,j+2] = B[i,j]
end
gives
ERROR: MethodError: no method matching sizeequivalentint(::Type{Float64}, ::Int64)
Closest candidates are:
sizeequivalentint(::Type{Int64}, ::Int64) at C:\Users\jr\.julia\packages\LoopVectorization\HwB3A\src\lowering.jl:363
sizeequivalentint(::Type{Int32}, ::Int64) at C:\Users\jr\.julia\packages\LoopVectorization\HwB3A\src\lowering.jl:366
sizeequivalentint(::Type{Int16}, ::Int64) at C:\Users\jr\.julia\packages\LoopVectorization\HwB3A\src\lowering.jl:369
...
Stacktrace:
[1] macro expansion at C:\Users\jr\.julia\packages\LoopVectorization\HwB3A\src\reconstruct_loopset.jl:272 [inlined]
[2] _avx_!(::Val{(0, 0)}, ::Type{Tuple{:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000002, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x01),:LoopVectorization,Symbol("##253"),LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x02),:LoopVectorization,:+,LoopVectorization.OperationStruct(0x0000000000000002, 0x0000000000000000, 0x0000000000000000, 0x0000000000000102, LoopVectorization.compute, 0x00, 0x03),:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x02, 0x04),:LoopVectorization,:identity,LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000004, LoopVectorization.compute, 0x00, 0x05),:LoopVectorization,:setindex!,LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000305, LoopVectorization.memstore, 0x03, 0x06)}}, ::Type{Tuple{LoopVectorization.ArrayRefStruct(0x0000000000000001, 0x0000000000000002),LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000102),LoopVectorization.ArrayRefStruct(0x0000000000000102, 0x0000000000000103)}}, ::Type{Tuple{0,Tuple{},Tuple{},Tuple{(2, 2)},Tuple{},Tuple{},Tuple{}}}, ::Tuple{VectorizationBase.StaticLowerUnitRange{0},VectorizationBase.StaticLowerUnitRange{0}}, ::LoopVectorization.LoopValue, ::VectorizationBase.PackedStridedPointer{Float64,1}, ::VectorizationBase.PackedStridedPointer{Float64,1}) at C:\Users\jr\.julia\packages\LoopVectorization\HwB3A\src\reconstruct_loopset.jl:272
[3] top-level scope at REPL[163]:1
I'm not sure whether this is expected to work, but right now broadcasting over a transposed matrix gives surprising results:
julia> A = randn(2,3); A[2] = 10; At = transpose(A)
3×2 LinearAlgebra.Transpose{Float64,Array{Float64,2}}:
-0.121486 10.0
-1.55957 0.691854
-0.0858253 -0.918145
julia> exp.(At)
3×2 Array{Float64,2}:
0.885603 22026.5
0.210227 1.99742
0.917755 0.399259
julia> @avx exp.(At)
3×2 Array{Float64,2}:
0.885603 0.210227
22026.5 1.99742
0.210227 0.917755
I hope I'm not bothering you ;)
MWE:
# MWE: Int32 fixed-point phase generation with an Int16 output under @avx.
# Reported: MethodError: no method matching
# promote_vtype(::Type{VectorizationBase._MM{8}}, ::Type{Int32}) (see below).
function gen_phases(phases::Vector{Int16}, freq, start_phase)
fp = 21
delta = floor(Int32, freq * 1 << (fp + 9))
idxs = Int32(1):Int32(length(phases))
fixed_point_start_phase = floor(Int32, start_phase * 1 << (fp + 9))
@avx for i = 1:length(phases)
phases[i] = Int16((idxs[i] * delta + fixed_point_start_phase) >> fp)
end
end
phases = Vector{Int16}(undef, 1000)
julia> gen_phases(phases, 0.002, 0.1)
ERROR: MethodError: no method matching promote_vtype(::Type{VectorizationBase._MM{8}}, ::Type{Int32})
Here is a MWE:
# Elementwise product c[i] = b[i] * a[i] over the window
# start_sample : start_sample + num_samples - 1 (baseline for the @avx variant).
function test_for_with_different_index!(a, b, c, start_sample, num_samples)
    stop = start_sample + num_samples - 1
    @inbounds for i = start_sample:stop
        c[i] = b[i] * a[i]
    end
end
# @avx version of the function above: loop starts at start_sample, not 1.
# Reported: for Int16 inputs the last three entries of the result are wrong
# (test transcript below); correct with floats.
function test_for_with_different_index_avx!(a, b, c, start_sample, num_samples)
@avx for i = start_sample:num_samples + start_sample - 1
c[i] = b[i] * a[i]
end
end
# Setup and comparison for the two window-product functions above.
# Fix: the original snippet called similar(a_re), but `a_re` is never
# defined here — only `a` and `b` exist, so similar(a) is clearly intended.
a = rand(Int16, 1000)
b = rand(Int16, 1000)
c = similar(a)
c_avx = similar(a)
test_for_with_different_index!(a, b, c, 30, 800)
test_for_with_different_index_avx!(a, b, c_avx, 30, 800)
@test c[30:829] == c_avx[30:829]
Test Failed at /home/zorn/Documents/Code/Testing/test_avx.jl:44
Expression: c[30:829] == c_avx[30:829]
Evaluated: Int16[-17628, -17250, -31956, 29328, 2240, 8696, -8285, 1924, 16032, 9148 … 25940, -4788, 23094, 18864, -21520, 6014, -7572, -18266, -15094, 28000] == Int16[-17628, -17250, -31956, 29328, 2240, 8696, -8285, 1924, 16032, 9148 … 25940, -4788, 23094, 18864, -21520, 6014, -7572, 32544, 0, 16048]
Only the last three entries are wrong.
With floats the result is correct, though.
I have a simple algorithm that I want to vectorize. Works with @simd
but for @avx
throws an error that x is not defined.
# Attempt to run findfirst with a closure doing regex matching inside @avx;
# reported to throw "x is not defined", while the same loop works with @simd.
# NOTE(review): regex/closure work has no SIMD lowering — kept verbatim as
# the reproduction from the report.
function check_simd(regex_list, input)
input_len=length(input)
icon_indices = Vector{Int64}(undef, input_len)
@avx for j=1:input_len
icon_indices[j] = findfirst(x-> match(x, input[j]) !== nothing, regex_list)
end
return icon_indices
end
MWE here: https://gist.github.com/aminya/4b6e6db1147e63727cffc781633a8e0b
I did a test and #25 is still an issue in version 0.3.11.
Currently ∗
(\ast
) is a lazy multiplication operator, but in many fonts the difference between it and *
is not readily apparent unless they are side by side. I'd recommend using a unicode modifier on top of *
instead. Perhaps *̂
(*\hat
).
I was experimenting with the loop unroll and stumbled upon a bug. The result of a calculation should be the same regardless of the unroll
factor:
# Dot product with an explicit unroll=2 hint. Compared against test_avx3
# below; the two were reported to return different values
# (258.05835f0 vs 172.16222f0, transcript below), which should not happen.
function test_avx2(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
z = zero(T)
@avx unroll=2 for i ∈ 1:length(x)
z += x[i]*y[i]
end
return z
end
# Same dot product with unroll=3. Per the report, @code_native showed this
# emits the same 2x-unrolled loop as test_avx2, yet the returned value
# differs — the reported bug.
function test_avx3(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
z = zero(T)
@avx unroll=3 for i ∈ 1:length(x)
z += x[i]*y[i]
end
return z
end
# test
T = Float32
N = 1024
x = rand(T,N)
y = rand(T,N)
@test test_avx2(x, y) ≈ test_avx3(x, y)
Test Failed at In[34]:24
Expression: test_avx2(x, y) ≈ test_avx3(x, y)
Evaluated: 258.05835f0 ≈ 172.16222f0
There was an error during testing
Closer inspection of the source code reveals that while @code_native debuginfo=:none test_avx2(x, y)
gives me the expected unrolling in the main loop.
L32:
vmovups (%rcx,%rdx,4), %ymm2
vmovups 32(%rcx,%rdx,4), %ymm3
vfmadd231ps (%rax,%rdx,4), %ymm2, %ymm0
vfmadd231ps 32(%rax,%rdx,4), %ymm3, %ymm1
addq $16, %rdx
cmpq %rsi, %rdx
jl L32
This does not hold true for test_avx3
which yields the same code snippet as test_avx2
.
With 0.4.1 (Julia 1.5 dev)
AssertionError: Ni == N + 1
Stacktrace:
[1] packed_indexpr(::Core.SimpleVector, ::Int64, ::Type{Float64}) at C:\Users\PKrysl\.julia\packages\SIMDPirates\Gw1Aa\src\memory.jl:996
[2] packed_strided_ptr_index(::Core.SimpleVector, ::Int64, ::Type{Float64}) at C:\Users\PKrysl\.julia\packages\SIMDPirates\Gw1Aa\src\memory.jl:1017
[3] #s28#117(::Any, ::Any, ::Any, ::Any, ::Any, ::Any, ::Any) at C:\Users\PKrysl\.julia\packages\SIMDPirates\Gw1Aa\src\memory.jl:1031
[4] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any,N} where N) at .\boot.jl:526
[5] macro expansion at C:\Users\PKrysl\.julia\packages\LoopVectorization\wsOMH\src\reconstruct_loopset.jl:226 [inlined]
[6] _avx_! at C:\Users\PKrysl\.julia\packages\LoopVectorization\wsOMH\src\reconstruct_loopset.jl:226 [inlined]
[7] mulCAtB!(::Array{Float64,2}, ::Array{Float64,1}, ::Array{Float64,2}) at C:\Users\PKrysl\Documents\work\FinEtoolsTestAll.jl\tests\FinEtools.jl\src\MatrixUtilityModule.jl:380
With
# Sum of squares under @avx. Works for Float64 input; for Float32 it was
# reported to throw MethodError:
# +(::SVec{4,Float32}, ::SVec{4,Float64}) — the accumulator is widened.
function jselfdotavx(a)
s = zero(eltype(a))
@inbounds @avx for i ∈ eachindex(a)
s += a[i] * a[i]
end
s
end
This works:
jselfdotavx(rand(Float64, 1024))
but with Float32
jselfdotavx(rand(Float32, 1024))
I get
ERROR: MethodError: no method matching +(::VectorizationBase.SVec{4,Float32}, ::VectorizationBase.SVec{4,Float64})
[...]
I created a package for benchmarking functions called AcuteBenchmark.
https://github.com/aminya/AcuteBenchmark.jl
I particularly made it for IntelVectorMath (VML)
If you want we can switch the benchmarking system to AcuteBenchmark. It is very easy to use. It automatically generates random vectors based on the limits and the size given and then benchmarks and plots the result.
Other than the Acutebenchmark doc, there is a fully working example available here: https://github.com/JuliaMath/VML.jl/blob/AcuteBenchmark/benchmark/benchmark.jl
This function doesn't give the correct answer (but works when @avx
is removed):
# C = AᵀB with 2x2 register blocking when both output dims are even,
# @avx applied only to the inner k-reduction. Reported to give the wrong
# answer with @avx and the correct one without — kept verbatim as the
# reproduction. The else-branch is a plain scalar fallback for odd dims.
function mulCAtB!(C, A, B)
M, N = size(C); K = size(B,1)
@assert size(C, 1) == size(A, 2)
@assert size(C, 2) == size(B, 2)
@assert size(A, 1) == size(B, 1)
if mod(M, 2) == 0 && mod(N, 2) == 0
# Even case: process 2x2 output tiles (m, m+1) x (n, n+1).
for m ∈ 1:2:M
m1 = m + 1
for n ∈ 1:2:N
n1 = n + 1
C11, C21, C12, C22 = 0.0, 0.0, 0.0, 0.0
@avx for k ∈ 1:K
C11 += A[k,m] * B[k,n]
C21 += A[k,m1] * B[k,n]
C12 += A[k,m] * B[k,n1]
C22 += A[k,m1] * B[k,n1]
end
C[m,n] = C11
C[m1,n] = C21
C[m,n1] = C12
C[m1,n1] = C22
end
end
else
# Fallback: untiled scalar loops (no @avx).
@inbounds for n ∈ 1:N, m ∈ 1:M
Cmn = 0.0
@inbounds for k ∈ 1:K
Cmn += A[k,m] * B[k,n]
end
C[m,n] = Cmn
end
end
return C
end
I was playing around a bit and noticed some bugs. The first two snippets are attempts at MWE of the final snippet which is the desired calculation
# MWE 1: @avx loop carrying the scalar recurrence x = x + v*dt (v stays 0.0
# here, so every position should equal x/x0 == 1.0). Reported to throw
# UndefVarError under @avx but to run fine without the macro.
function simulate(x0::Float64; T = 10.0, dt = 0.0001, vt = 1.0)
times = 0.0:dt:T
positions = zeros(length(times))
v = 0.0
a = 0.0
x = x0
@avx for ii in eachindex(times)
x = x + v * dt
positions[ii] = x/x0
end
times, positions
end
simulate(10.0) # UndefVarError, but runs without @avx
# MWE 2: same as above plus a read `t = times[ii]` from the range;
# reported to throw a StackOverflowError (see comment after the call).
function simulate(x0::Float64; T = 10.0, dt = 0.0001, vt = 1.0)
times = 0.0:dt:T
positions = zeros(length(times))
v = 0.0
a = 0.0
x = x0
@avx for ii in eachindex(times)
t = times[ii]
x = x + v * dt
positions[ii] = x/x0
end
times, positions
end
simulate(10.0) # StackOverflowError
The final and desired code is the following
@inline friction(v::Float64, vt::Float64) = v > vt ? -3v : -3vt*sign(v)
# Final desired simulation: damped oscillator integrated with explicit Euler
# inside @avx (loop-carried v and x).
function simulate(x0::Float64; T = 10.0, dt = 0.0001, vt = 1.0)
times = 0.0:dt:T
positions = zeros(length(times))
v = 0.0
a = 0.0
x = x0
@avx for ii in eachindex(times)
t = times[ii]
a = friction(v, vt) - 100.0*x
# NOTE(review): this second assignment overwrites the friction term —
# possibly leftover debugging in the report; kept verbatim.
a = - 100.0*x
v = v + a * dt
x = x + v * dt
positions[ii] = x/x0
end
times, positions
end
simulate(10.0)
Original code from discourse thread
There seems to be no method for << nor >>.
MWE:
# MWE: bit shift inside @avx. Reported: MethodError: no method matching
# <<(::VectorizationBase._MM{8}, ::Int32) — no << / >> lowering (see below).
function test_bit_shift()
accu = Int32(0)
counter = Int32(1):Int32(64)
@avx for i = 1:64
accu += counter[i] << 1
end
end
julia> test_bit_shift()
ERROR: MethodError: no method matching <<(::VectorizationBase._MM{8}, ::Int32)
Closest candidates are:
<<(::Integer, ::Integer) at operators.jl:590
<<(::VectorizationBase.Static{N}, ::Any) where N at /home/zorn/.julia/packages/VectorizationBase/Myhw6/src/static.jl:53
...
Stacktrace:
[1] macro expansion at /home/zorn/.julia/packages/LoopVectorization/2jh99/src/reconstruct_loopset.jl:260 [inlined]
[2] _avx_! at /home/zorn/.julia/packages/LoopVectorization/2jh99/src/reconstruct_loopset.jl:260 [inlined]
First of all thank you for this package!
The following code works fine for version 0.3.0.
# Clenshaw recurrence evaluating a Chebyshev series with coefficients
# `coeff` at the point `x` (coeff[1] is the constant term).
function clenshaw(x, coeff)
    b1 = zero(x)   # `ret` in the original snippet
    b2 = zero(x)   # `tmp` in the original snippet
    for k in length(coeff):-1:2
        # Simultaneous update reproduces the compute-then-swap of the original.
        b1, b2 = b2, muladd(x, 2b2, coeff[k] - b1)
    end
    return muladd(x, b2, coeff[1] - b1)
end
# Map the user-defined `clenshaw` over x under @avx, with `coeff` as a
# non-loop argument. Reported: worked on v0.3.0 but throws
# UndefVarError: clenshaw not defined on v0.3.1 and v0.4.0 (traces below).
function clenshaw_avx!(ret,x,coeff)
@avx for j in 1:length(ret)
ret[j] = clenshaw(x[j],coeff)
end
end
c = rand(100); x = rand(10^6); y = similar(x);
clenshaw_avx!(y,x,c)
Starting with 0.3.1 the following error occurs:
ERROR: UndefVarError: clenshaw not defined
Stacktrace:
[1] macro expansion at .\gcutils.jl:91 [inlined]
[2] clenshaw_avx!(::Array{Float64,1}, ::Array{Float64,1}, ::Array{Float64,1}) at .\REPL[10]:2
[3] top-level scope at REPL[13]:1
And for version 0.4.0 the error message is:
ERROR: UndefVarError: clenshaw not defined
Stacktrace:
[1] macro expansion at /home/fabian/.julia/packages/LoopVectorization/SSpaP/src/reconstruct_loopset.jl:214 [inlined]
[2] _avx_!(::Val{(1, 0, 0)}, ::Type{Tuple{:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x01),:LoopVectorization,Symbol("##422"),LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x02),:LoopVectorization,:clenshaw,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000102, LoopVectorization.compute, 0x00, 0x03),:LoopVectorization,:setindex!,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000003, LoopVectorization.memstore, 0x02, 0x04)}}, ::Type{Tuple{LoopVectorization.ArrayRefStruct(0x0000000000000001, 0x0000000000000001),LoopVectorization.ArrayRefStruct(0x0000000000000001, 0x0000000000000001)}}, ::Type{Tuple{0,Tuple{},Tuple{2},Tuple{},Tuple{},Tuple{},Tuple{}}}, ::Tuple{VectorizationBase.StaticLowerUnitRange{0}}, ::VectorizationBase.PackedStridedPointer{Float64,0}, ::VectorizationBase.PackedStridedPointer{Float64,0}, ::Array{Float64,1}, ::Nothing) at /home/fabian/.julia/packages/LoopVectorization/SSpaP/src/reconstruct_loopset.jl:214
[3] macro expansion at ./gcutils.jl:91 [inlined]
[4] clenshaw_avx!(::Array{Float64,1}, ::Array{Float64,1}, ::Array{Float64,1}) at ./REPL[3]:3
[5] top-level scope at REPL[5]:1
Julia 1.5 dev
WARNING: Method definition alloca(Int32) in module SIMDPirates at C:\Users\appveyor\.julia\packages\SIMDPirates\obrXX\src\memory.jl:30 overwritten at C:\Users\appveyor\.julia\packages\SIMDPirates\obrXX\src\memory.jl:48.
WARNING: Method definition alloca(Int32, Type{T}) where {T} in module SIMDPirates at C:\Users\appveyor\.julia\packages\SIMDPirates\obrXX\src\memory.jl:30 overwritten at C:\Users\appveyor\.julia\packages\SIMDPirates\obrXX\src\memory.jl:48.
WARNING: Method definition alloca(Int32, Type{T}, Base.Val{Align}) where {T, Align} in module SIMDPirates at C:\Users\appveyor\.julia\packages\SIMDPirates\obrXX\src\memory.jl:30 overwritten at C:\Users\appveyor\.julia\packages\SIMDPirates\obrXX\src\memory.jl:48.
ERROR: LoadError: LoadError: LoadError: LoadError: LoadError: DomainError with -2.3611832414348226e21:
sqrt will only return a complex result if called with a complex argument. Try sqrt(Complex(x)).
Stacktrace:
[1] throw_complex_domainerror(::Symbol, ::Float64) at .\math.jl:31
[2] sqrt at .\math.jl:492 [inlined]
[3] solve_tilesize(::SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int32}},Int32},true}, ::SubArray{Int32,1,Array{Int32,2},Tuple{Base.Slice{Base.OneTo{Int32}},Int32},true}) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\determinestrategy.jl:172
[4] solve_tilesize(::SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int32}},Int32},true}, ::SubArray{Int32,1,Array{Int32,2},Tuple{Base.Slice{Base.OneTo{Int32}},Int32},true}, ::Int32, ::Int32) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\determinestrategy.jl:221
[5] solve_tilesize(::LoopVectorization.LoopSet, ::Symbol, ::Symbol, ::SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int32}},Int32},true}, ::SubArray{Int32,1,Array{Int32,2},Tuple{Base.Slice{Base.OneTo{Int32}},Int32},true}) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\determinestrategy.jl:245
[6] evaluate_cost_tile(::LoopVectorization.LoopSet, ::Array{Symbol,1}) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\determinestrategy.jl:317
[7] choose_tile at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\determinestrategy.jl:387 [inlined]
[8] choose_order(::LoopVectorization.LoopSet) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\determinestrategy.jl:401
[9] lower(::LoopVectorization.LoopSet) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\lowering.jl:807
[10] @avx(::LineNumberNode, ::Module, ::Any) at C:\Users\appveyor\.julia\packages\LoopVectorization\COyRT\src\constructors.jl:53
[11] #macroexpand#32 at .\expr.jl:92 [inlined]
[12] macroexpand at .\expr.jl:91 [inlined]
[13] docm(::LineNumberNode, ::Module, ::Any, ::Any, ::Bool) at .\docs\Docs.jl:509 (repeats 2 times)
[14] @doc(::LineNumberNode, ::Module, ::String, ::Vararg{Any,N} where N) at .\boot.jl:451
[15] include at .\sysimg.jl:29 [inlined]
[16] include(::String) at C:\projects\finetools-jl\src\FinEtools.jl:7
[17] top-level scope at none:0
[18] include at .\sysimg.jl:29 [inlined]
[19] include(::String) at C:\projects\finetools-jl\src\FinEtools.jl:7
[20] top-level scope at none:0
[21] include at .\boot.jl:317 [inlined]
[22] include_relative(::Module, ::String) at .\loading.jl:1044
[23] include(::Module, ::String) at .\sysimg.jl:29
[24] top-level scope at none:2
[25] eval at .\boot.jl:319 [inlined]
[26] eval(::Expr) at .\client.jl:393
[27] top-level scope at .\none:3
in expression starting at C:\projects\finetools-jl\src\MatrixUtilityModule.jl:380
Here are two variants of a function that I cannot figure how one can apply @avx
in front of for
# Variant 1 from the sincos question: broadcast the (sin, cos) tuple into a
# two-element row slice. θ holds odd multiples of π/2; every entry of the
# N×2 output buffer is overwritten in the loop.
function f1(N)
    θ = π * cumsum(ones(N)) / 2
    out = similar(θ, length(θ), 2)
    for i = 1:length(θ)
        out[i, 1:2] .= sincos(θ[i])
    end
    return out
end
# Variant 2 from the sincos question: assign the (sin, cos) tuple by
# destructuring into the two columns instead of broadcasting into a slice.
function f2(N)
    θ = π * cumsum(ones(N)) / 2
    out = similar(θ, length(θ), 2)
    for i = 1:length(θ)
        out[i, 1], out[i, 2] = sincos(θ[i])
    end
    return out
end
If I refer to https://software.intel.com/en-us/mkl-vmperfdata-sincos and https://software.intel.com/en-us/mkl-vmperfdata-sin , those pages suggest that not a big gain is available if sin
and cos
are to be evaluated using sincos
than separately. But I would like to test it myself.
Here is a MWE:
# MWE: loop-carried integer accumulation (phase_fp += delta_fp) plus a
# right shift under @avx. Reported: InexactError: trunc(Int64, Inf) raised
# during the macro's unroll-strategy planning (trace below).
function calc!(x)
delta_fp = 2
phase_fp = Int32(0)
@avx for i = 1:length(x)
phase_fp = phase_fp + delta_fp
phase_int = phase_fp >> 17
x[i] = phase_int
end
end
x = Vector{Int32}(undef, 100)
calc!(x)
ERROR: InexactError: trunc(Int64, Inf)
Stacktrace:
[1] trunc at ./float.jl:702 [inlined]
[2] round at ./float.jl:367 [inlined]
[3] unroll_no_reductions(::LoopVectorization.LoopSet, ::Array{Symbol,1}, ::Symbol, ::Int64, ::Int64) at /home/zorn/.julia/packages/LoopVectorization/9Qgzg/src/determinestrategy.jl:151
[4] determine_unroll_factor(::LoopVectorization.LoopSet, ::Array{Symbol,1}, ::Symbol, ::Symbol) at /home/zorn/.julia/packages/LoopVectorization/9Qgzg/src/determinestrategy.jl:170
[5] choose_order(::LoopVectorization.LoopSet) at /home/zorn/.julia/packages/LoopVectorization/9Qgzg/src/determinestrategy.jl:550
[6] lower(::LoopVectorization.LoopSet, ::Int8) at /home/zorn/.julia/packages/LoopVectorization/9Qgzg/src/lowering.jl:497
[7] avx_body(::Tuple{Int8,Int8,Int8}, ::Array{LoopVectorization.Instruction,1}, ::Array{LoopVectorization.OperationStruct,1}, ::Array{LoopVectorization.ArrayRefStruct,1}, ::Core.SimpleVector, ::Core.SimpleVector, ::NTuple{4,DataType}) at /home/zorn/.julia/packages/LoopVectorization/9Qgzg/src/reconstruct_loopset.jl:225
MWE:
@inline mysign(x) = ifelse(x >= zero(x), one(x), -one(x))
# Sum of the sibling `mysign` over all elements of `a`
# (baseline for the @avx variant below).
function accumulate_mysign(a)
    total = zero(eltype(a))
    @inbounds for idx = 1:length(a)
        total += mysign(a[idx])
    end
    return total
end
# @avx version of accumulate_mysign. Reported: TypeError: non-boolean
# (UInt16) used in boolean context — the vectorized comparison inside
# mysign yields a mask that Base.ifelse rejects (trace below).
function accumulate_mysign_avx(a)
accu = zero(eltype(a))
@avx for i = 1:length(a)
accu += mysign(a[i])
end
accu
end
a = rand(Int16, 32)
julia> accumulate_mysign(a)
-4
julia> accumulate_mysign_avx(a)
ERROR: TypeError: non-boolean (UInt16) used in boolean context
Stacktrace:
[1] mysign at ./none:1 [inlined]
[2] macro expansion at ./gcutils.jl:91 [inlined]
[3] accumulate_mysign_avx(::Array{Int16,1}) at ./none:3
[4] top-level scope at none:0
MWE:
# MWE: Int16 conversion of an Int32 expression inside @avx. Reported:
# MethodError: no method matching Int16(::SVec{8,Int32}); the report notes
# the same applies to Int64.
function gen_phases(phases::Vector{Int16}, freq)
fp = 21
delta = floor(Int32, freq * 1 << (fp + 9))
idxs = Int32(1):Int32(length(phases))
@avx for i = 1:length(phases)
phases[i] = Int16((idxs[i] * delta) >> fp)
end
end
phases = Vector{Int16}(undef, 1000);
julia> gen_phases(phases, 0.002)
ERROR: MethodError: no method matching Int16(::VectorizationBase.SVec{8,Int32})
The same applies to Int64.
As predicted, vmap
gives me a tremendous boost in performance =) I'm hitting an error when mapping over static arrays though
julia> vmap(exp, randn(8));
julia> vmap(exp, @SVector randn(8));
ERROR: conversion to pointer not defined for SArray{Tuple{8},Float64,1,8}
Now if I run the following I get an error:
a = collect(1:100)
items = rand(20)
indices = collect(1:20)
@avx a[indices] .= items
ERROR: MethodError: no method matching vmaterialize!(::SubArray{Int64,1,Array{Int64,1},Tuple{Array{Int64,1}},false}, ::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1},Nothing,typeof(identity),Tuple{Array{Int64,1}}}, ::Val{:Main})Closest candidates are:
vmaterialize!(::Union{LinearAlgebra.Adjoint{T,A}, LinearAlgebra.Transpose{T,A}}, ::BC, ::Val{Mod}) where {T<:Union{Float32, Float64}, N, A<:AbstractArray{T,N}, BC<:Base.Broadcast.Broadcasted, Mod} at C:\Users\yahyaaba\.julia\packages\LoopVectorization\iNfCA\src\broadcast.jl:218
vmaterialize!(::AbstractArray{T,N}, ::BC, ::Val{Mod}) where {T<:Union{Float32, Float64}, N, BC<:Base.Broadcast.Broadcasted, Mod} at C:\Users\yahyaaba\.julia\packages\LoopVectorization\iNfCA\src\broadcast.jl:194
That expression is equal to the following that works
indiceslen = length(indices)
@avx for k = 1:indiceslen
a[indices[k]] = items[k]
end
With LoopVectorization v0.5.1
using LoopVectorization
function f1()
z = zeros(8)
A = ones(8)
for i in eachindex(A)
A[i] = z[i]
end
return A
end
function f1a()
z = zeros(8)
A = ones(8)
@avx for i in eachindex(A)
A[i] = z[i]
end
return A
end
function f2()
z = zeros(8)
A = ones(8)
for i in eachindex(A)
A[i] = sin(z[i])
end
return A
end
function f2a()
z = zeros(8)
A = ones(8)
@avx for i in eachindex(A)
A[i] = sin(z[i])
end
return A
end
julia> f1() ≈ f2() ≈ f2a() ≈ f1a()
false
julia> f1() ≈ f2() ≈ f2a()
true
julia> f1a()
8-element Array{Float64,1}:
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
Output should be a vector of zeros.
Is it possible to vectorize the filter
algorithm of Julia?
https://github.com/JuliaLang/julia/blob/3720edfc06ee8687b54ee7fbb908c9f03f4fc557/base/array.jl#L2368
Does LoopVectorization
support complex vectors? E.g.
using LoopVectorization
function test_avm!(x::Vector, y::Vector, beta)
@avx for n=1:length(x)
x[n] += conj(beta)*y[n]
end
end
# set up
T = ComplexF32
N = 1024
x = zeros(T,N)
y = rand(T,N)
β = rand(T)
# end set up
test_avm!(x,y,β)
gives me the following error
KeyError: key Complex{Float32} not found
Stacktrace:
[1] getindex at ./dict.jl:477 [inlined]
[2] llvmtype(::Type) at /home/moeddel/.julia/packages/VectorizationBase/hBLD0/src/vectorizable.jl:21
[3] #s24#123 at /home/moeddel/.julia/packages/SIMDPirates/aKvPC/src/memory.jl:94 [inlined]
[4] #s24#123(::Any, ::Any, ::Any, ::Any, ::Any, ::Type, ::Any, ::Type, ::Any) at ./none:0
[5] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any,N} where N) at ./boot.jl:524
[6] vload at /home/moeddel/.julia/packages/SIMDPirates/aKvPC/src/memory.jl:218 [inlined]
[7] macro expansion at ./gcutils.jl:91 [inlined]
[8] test_avm!(::Array{Complex{Float32},1}, ::Array{Complex{Float32},1}, ::Complex{Float32}) at ./In[26]:4
First of all: great package!
However, I get an error when I loop over Int32 or Int16 indices.
Here is a MWE:
function test()
acc = 0
@avx for i = Int32(1):Int32(100)
acc = acc + 1
end
acc
end
ERROR: LoadError: UndefVarError: N not defined
Stacktrace:
[1] register_single_loop!(::LoopVectorization.LoopSet, ::Expr) at /home/zorn/.julia/packages/LoopVectorization/SSpaP/src/graphs.jl:373
[2] register_loop!(::LoopVectorization.LoopSet, ::Expr) at /home/zorn/.julia/packages/LoopVectorization/SSpaP/src/graphs.jl:401
[3] add_loop!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /home/zorn/.julia/packages/LoopVectorization/SSpaP/src/graphs.jl:405 (repeats 2 times)
[4] copyto! at /home/zorn/.julia/packages/LoopVectorization/SSpaP/src/constructors.jl:6 [inlined]
[5] LoopVectorization.LoopSet(::Expr) at /home/zorn/.julia/packages/LoopVectorization/SSpaP/src/constructors.jl:46
[6] @avx(::LineNumberNode, ::Module, ::Any) at /home/zorn/.julia/packages/LoopVectorization/SSpaP/src/constructors.jl:95
That's LoopVectorization v0.4.0, but it also threw an error in v0.3.11
MWE
julia> using LoopVectorization
julia> x = rand(4);
julia> @avx tanh.(x)
ERROR: MethodError: no method matching vifelse(::UInt8, ::SLEEFPirates.Double{VectorizationBase.SVec{4,Float64}}, ::SLEEFPirates.Double{VectorizationBase.SVec{4,Float64}})
Closest candidates are:
vifelse(::Bool, ::Any, ::Any) at /home/scheme/.julia/packages/SIMDPirates/h2DSE/src/conditionals.jl:96
vifelse(::Unsigned, ::Tuple{Vararg{VecElement{T},W}}, ::Union{Int64, T}) where {W, T} at /home/scheme/.julia/packages/SIMDPirates/h2DSE/src/conditionals.jl:166
vifelse(::Unsigned, ::Union{Tuple{Vararg{VecElement{T},W}}, VectorizationBase.AbstractStructVec{W,T}}, ::Union{Int64, T}) where {W, T} at /home/scheme/.julia/packages/SIMDPirates/h2DSE/src/conditionals.jl:167
...
Stacktrace:
[1] expk2 at /home/scheme/.julia/dev/SLEEFPirates/src/priv.jl:284 [inlined]
[2] tanh(::VectorizationBase.SVec{4,Float64}) at /home/scheme/.julia/dev/SLEEFPirates/src/hyp.jl:56
[3] macro expansion at /home/scheme/.julia/dev/SLEEFPirates/src/SLEEFPirates.jl:147 [inlined]
[4] macro expansion at /home/scheme/.julia/packages/LoopVectorization/HwB3A/src/broadcast.jl:205 [inlined]
[5] vmaterialize! at /home/scheme/.julia/packages/LoopVectorization/HwB3A/src/broadcast.jl:205 [inlined]
[6] vmaterialize(::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1},Nothing,typeof(tanh),Tuple{Array{Float64,1}}}, ::Val{:Main}) at /home/scheme/.julia/packages/LoopVectorization/HwB3A/src/broadcast.jl:252
[7] top-level scope at REPL[3]:1
Pkg.add("LoopVectorization") on julia 1.3.1 and LoopVectorization 0.3.5 gives (from the log):
ERROR: LoadError: "Architecture not yet supported. Please file an issue!\nYou can fix immediately by supplying the correct number of cores\nand the cache sizes (L1 data per core, L2 per core, L3).\nIf you do, please file an issue with the information or a PR\nadding it to the build script, so your architecture will be\nsupported for all future releases."
Stacktrace:
[1] top-level scope at /home/user/.julia/packages/VectorizationBase/cKn6V/deps/build.jl:15
[2] include at ./boot.jl:328 [inlined]
[3] include_relative(::Module, ::String) at ./loading.jl:1105
[4] include(::Module, ::String) at ./Base.jl:31
[5] include(::String) at ./client.jl:424
[6] top-level scope at none:5
in expression starting at /home/user/.julia/packages/VectorizationBase/cKn6V/deps/build.jl:7
The CPU is the AMD 8350 /proc/cpuinfo gives:
processor : 0
vendor_id : AuthenticAMD
cpu family : 21
model : 2
model name : AMD FX(tm)-8350 Eight-Core Processor
stepping : 0
microcode : 0x6000852
cpu MHz : 1400.000
cache size : 2048 KB
physical id : 0
siblings : 8
core id : 0
cpu cores : 4
apicid : 16
initial apicid : 0
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 popcnt aes xsave avx f16c lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs xop skinit wdt fma4 tce nodeid_msr tbm topoext perfctr_core perfctr_nb cpb hw_pstate ssbd ibpb vmmcall bmi1 arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold
bugs : fxsave_leak sysret_ss_attrs spectre_v1 spectre_v2 spec_store_bypass
bogomips : 8016.25
TLB size : 1536 4K pages
clflush size : 64
cache_alignment : 64
address sizes : 48 bits physical, 48 bits virtual
power management: ts ttp tm 100mhzsteps hwpstate cpb eff_freq_ro
It seems that the project file requires Julia v1.3. Could that bound be lowered, or are you using certain new features?
function f1a()
A = zeros(8)
@avx for i in 1:size(A)[1]
A[i] = 1.0
end
return A
end
gives
ERROR: BoundsError: attempt to access ()
at index [1]
Stacktrace:
[1] getindex(::Tuple, ::Int64) at .\tuple.jl:24
[2] sizeofeltypes(::Tuple{}, ::Int64) at C:\Users\jr\.julia\packages\LoopVectorization\9Qgzg\src\reconstruct_loopset.jl:204
[3] avx_body(::Tuple{Int8,Int8,Int8}, ::Array{LoopVectorization.Instruction,1}, ::Array{LoopVectorization.OperationStruct,1}, ::Array{LoopVectorization.ArrayRefStruct,1}, ::Core.SimpleVector, ::Core.SimpleVector, ::Tuple{}) at C:\Users\jr\.julia\packages\LoopVectorization\9Qgzg\src\reconstruct_loopset.jl:215
[4] #s41#61 at C:\Users\jr\.julia\packages\LoopVectorization\9Qgzg\src\reconstruct_loopset.jl:236 [inlined]
[5] #s41#61(::Any, ::Any, ::Any, ::Any, ::Any, ::Any, ::Type, ::Type, ::Type, ::Any, ::Any, ::Any) at .\none:0
[6] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any,N} where N) at .\boot.jl:524
[7] f1a() at .\REPL[10]:3
[8] top-level scope at REPL[11]:1
I do not know why I would write code like this, but the main problem is that the error message made it difficult to understand that assigning a constant is not supported.
Version of the package 0.4.0. Error:
LoadError: UndefVarError: z not defined
Stacktrace:
[1] macro expansion at C:\Users\PKrysl\.julia\packages\LoopVectorization\SSpaP\src\reconstruct_loopset.jl:214 [inlined]
[2] _avx_!(::Val{(1, 0, 0)}, ::Type{Tuple{:constsym,:z,LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01),:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000032, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x02),:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000031, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x02, 0x03),:zero,Symbol("##temp#471"),LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000321, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x04),:LoopVectorization,:vfmadd_fast,LoopVectorization.OperationStruct(0x0000000000000321, 0x0000000000000003, 0x0000000000000000, 0x0000000000020304, LoopVectorization.compute, 0x00, 0x04),:LoopVectorization,:reduced_add,LoopVectorization.OperationStruct(0x0000000000000021, 0x0000000000000003, 0x0000000000000000, 0x0000000000000501, LoopVectorization.compute, 0x00, 0x01),:LoopVectorization,:setindex!,LoopVectorization.OperationStruct(0x0000000000000021, 0x0000000000000000, 0x0000000000000000, 0x0000000000000006, LoopVectorization.memstore, 0x03, 0x05)}}, ::Type{Tuple{LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000302),LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000301),LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000201)}}, ::Type{Tuple{0,Tuple{},Tuple{},Tuple{},Tuple{},Tuple{4},Tuple{}}}, ::Tuple{VectorizationBase.StaticLowerUnitRange{0},VectorizationBase.StaticLowerUnitRange{0},VectorizationBase.StaticLowerUnitRange{0}}, ::VectorizationBase.PackedStridedPointer{Float64,1}, ::VectorizationBase.PackedStridedPointer{Float64,1}, ::VectorizationBase.PackedStridedPointer{Float64,1},
::Nothing) at C:\Users\PKrysl\.julia\packages\LoopVectorization\SSpaP\src\reconstruct_loopset.jl:214
[3] macro expansion at .\gcutils.jl:96 [inlined]
[4] mulCAtB!(::Array{Float64,2}, ::Array{Float64,2}, ::Array{Float64,2}) at C:\Users\PKrysl\Documents\work\FinEtoolsTestAll.jl\tests\FinEtools.jl\src\MatrixUtilityModule.jl:380
Occurs in:
function mulCAtB!(C, A, B)
M, N = size(C); K = size(B,1)
@assert size(C, 1) == size(A, 2)
@assert size(C, 2) == size(B, 2)
@assert size(A, 1) == size(B, 1)
# When the @avx macro is available, this code is faster:
z = zero(eltype(C))
@avx for n in 1:size(C,2), m in 1:size(C,1)
Cmn = z
for k in 1:size(A,1)
Cmn += A[k,m] * B[k,n]
end
C[m,n] = Cmn
end
return C
end
Thanks for all your work on this!
It would be useful for some of my code to have a negative step size in the for loop. I'd be happy if either toy2!
or toy3!
could be made to work.
using LoopVectorization
function toy1!(B,X,κ)
n = size(B,1)
@inbounds for i = κ-1:-1:1
for nx=1:n
B[nx,κ] -=X[i]*B[nx,i]
end
end
end
# the following function never exits lowering (?), or is doing
# something triggered by the @avx macro
function toy2!(B,X,κ)
n = size(B,1)
@avx for i = κ-1:-1:1
for nx=1:n
B[nx,κ] -=X[i]*B[nx,i]
end
end
end
# I'm willing to switch to this form of the same function
# which doesn't have the negative step, but it fails lowering with
# ERROR: LoadError: InexactError: trunc(Int64, Inf)
function toy3!(B,X,κ)
n = size(B,1)
@avx for ii = 1:κ-1
i = κ-ii
for nx=1:n
B[nx,κ] -=X[i]*B[nx,i]
end
end
end
N = 4
X = randn(N);
B1 = randn(N, N);
B2 = copy(B1);
B3 = copy(B1);
toy1!(B1,X,3)
toy2!(B2,X,3) # hopefully this would be faster, if we could run it :-)
toy3!(B3,X,3) # hopefully this would be faster, if we could run it :-)
How would I multiply 2 16-bit integers and accumulate to 32-bit integer? Something like _mm_madd_epi16
So far the result of the multiplication seems to be done in 16-bit:
a = Int16.([1 << 8, 1 << 9, 1 << 10, 1 << 10])
b = Int16.([1 << 7, 1 << 8, 1 << 11, 1 << 10])
function fma_avx(a, b)
res = Int32(0)
@avx for i = 1:4
res = res + a[i] * b[i]
end
res
end
julia> fma_avx(a, b)
-32768
julia> fma_avx(Int32.(a), Int32.(b))
3309568
Using LoopVectorization.jl#master, I get the following error. I'm not certain how to describe the error, so I put part of the error message in the issue name. I tried a few small variants, with the same result. Is it somehow obvious that I should not put B[i,j]
inside the k
loop? It works fine without the @avx
macro.
julia> using LoopVectorization
julia> function myGramFail!(A, B)
zeroB = zero(eltype(B))
@avx for i ∈ 1:size(A,1), j ∈ 1:size(A,2)
B[i,j] = zeroB
for k ∈ 1:size(A,2)
B[i,j] += A[i,k] * A[k,j]
end
end
end
ERROR: LoadError: KeyError: key :zeroB not found
Stacktrace:
[1] getindex at ./dict.jl:477 [inlined]
[2] getop at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:161 [inlined]
[3] add_store!(::LoopVectorization.LoopSet, ::Symbol, ::LoopVectorization.ArrayReference, ::Int64) at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:505
[4] add_store_ref! at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:512 [inlined]
[5] push!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:586
[6] add_block!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:228
[7] add_loop!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:285
[8] add_loop! at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/graphs.jl:282 [inlined]
[9] copyto! at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/constructors.jl:6 [inlined]
[10] LoopVectorization.LoopSet(::Expr) at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/constructors.jl:46
[11] @avx(::LineNumberNode, ::Module, ::Any) at /Users/cp/.julia/packages/LoopVectorization/M6TfP/src/constructors.jl:86
in expression starting at REPL[2]:3
using LoopVectorization
function f(a)
s = 0.0
@avx for i in eachindex(a)
s += a[i]
end
s
end
julia> f([1,2,3])
6.0
julia> f(1:3)
StackOverflowError:
Stacktrace:
[1] stridedpointer(::UnitRange{Int64}) at /Users/mason/.julia/packages/VectorizationBase/yJngh/src/vectorizable.jl:389 (repeats 80000 times)
I think this should be supportable. One (dumb) way to make this work would just be
VectorizationBase.stridedpointer(r::AbstractRange) = VectorizationBase.stridedpointer(collect(r))
but I wonder if there's a way we can do it that doesn't require explicitly materializing the range. Maybe this issue should be moved to VectorizationBase, though.
Hi @chriselrod,
I have been experimenting with the structs and came across something, which seems to be a bug. If I consider the following example
using StructArrays, Test
function test_native(x, A, k::Int)
out = zero(T)
for i ∈ 1:length(x)
out += A[i,k]*x[i]
end
return out
end
function test_struct(xre::Vector{T}, xim::Vector{T}, Are::Matrix{T}, Aim::Matrix{T}, k::Int) where {T<:AbstractFloat}
zre = zero(T)
zim = zero(T)
for i ∈ 1:length(xre)
zre += xre[i]*Are[i,k] - xim[i]*Aim[i,k]
zim += xre[i]*Aim[i,k] + xim[i]*Are[i,k]
end
return Complex{T}(zre,zim)
end
function test_struct(x::StructVector{Complex{T}}, A::StructArray{Complex{T},2}, k::Int) where {T<:AbstractFloat}
return test_struct(x.re, x.im, A.re , A.im, k)
end
# set up
T = ComplexF32
M = rand(1:1024)
N = rand(1:1024)
A = StructArray(rand(T,M,N))
b = StructVector(rand(T,M))
k = rand(1:N)
# end set up
@test test_native(b, A, k) ≈ test_struct(b, A, k)
everything is fine. However, if I insert the @avx
macro
using LoopVectorization
function test_avx(xre::Vector{T}, xim::Vector{T}, Are::Matrix{T}, Aim::Matrix{T}, k::Int) where {T<:AbstractFloat}
zre = zero(T)
zim = zero(T)
@avx for i ∈ 1:length(xre)
zre += xre[i]*Are[i,k] - xim[i]*Aim[i,k]
zim += xre[i]*Aim[i,k] + xim[i]*Are[i,k]
end
return Complex{T}(zre,zim)
end
function test_avx(x::StructVector{Complex{T}}, A::StructArray{Complex{T},2}, k::Int) where {T<:AbstractFloat}
return test_avx(x.re, x.im, A.re , A.im, k)
end
@test test_native(b, A, k) ≈ test_avx(b, A, k)
The test fails
Test Failed at In[2]:17
Expression: test_native(b, A, k) ≈ test_avx(b, A, k)
Evaluated: 5.7645955f0 + 234.34486f0im ≈ -5.764599f0 + 234.34473f0im
since the real part has wrong sign. Do you have any ideas why?
If I increment an index of one of the variables, I will get the error ##op#5321 not defined
MWE:
function complex_mul_with_index_offset(a_re, a_im, b_re, b_im, c_re, c_im)
@avx for i = 1:length(a_re) - 1
c_re[i] = b_re[i] * a_re[i + 1] - b_im[i] * a_im[i + 1]
c_im[i] = b_re[i] * a_im[i + 1] + b_im[i] * a_re[i + 1]
end
end
a_re = ones(Int, 64); a_im = copy(a_re); b_re = copy(a_re); b_im = copy(a_re); c_im = copy(a_re); c_re = copy(a_re)
julia> complex_mul_with_index_offset(a_re, a_im, b_re, b_im, c_re, c_im)
ERROR: UndefVarError: ##op#5321 not defined
Stacktrace:
[1] sum_with_index_offset(::Array{Int64,1}, ::Array{Int64,1}, ::Array{Int64,1}, ::Array{Int64,1}, ::Array{Int64,1}, ::Array{Int64,1}) at /home/zorn/.julia/packages/LoopVectorization/2jh99/src/reconstruct_loopset.jl:260
This does work, however:
function complex_mul_with_index_offset1(a_re, a_im, b_re, b_im, c_re, c_im)
@avx for i = 1:length(a_re) - 1
c_re[i] = b_re[i] * a_re[i + 1] - b_im[i] * a_im[i + 1]
end
end
@chriselrod mentioned on Discourse that the fact that @avx
is a macro imposes some codegen difficulties since macros don't see type information, and mentioned generated functions as an alternative. I agree that the best solution would be to gut all the complicated parts out of @avx
and put them into @generated function avx
and then make @avx
just rewrite the input loops into appropriate data structures to be passed to the avx
function.
This would cleanly separate the two distinct functionalities which are currently conflated: syntax transformations and complicated, heavily datatype-dependent code generation. Furthermore, it'd make it much easier to expose an interface for other packages to write methods and make their types compatible with @avx
.
This won't be news to Chris, but I figured it'd be good to have an issue here for the benefit of other readers.
I'd be happy to help with the generated functions and macro rewrite if you're looking for help, though it will take me a while to go through and digest the code. If you could point out places where the syntax transformations occur, that would accelerate the process.
Hi @chriselrod, I've upgraded from @vectorize
to @avx
and am excited to use an officially registered package.
Looks like a change (maybe in SIMDPirates?) broke something for me -- the code below works unvectorized, but is broken when I add the @avx
. Seems like there are 2 issues. First, it looks like if statements in an @avx
loop don't work. Second, it appears that something goes wrong when I iterate over a matrix or a view, not a vector. @code_warntype
suggests there are problems with type inference. See unvectorized, broken vectorized, and fixed vectorized code below.
I'm using julia 1.3.1 and
[[LoopVectorization]]
deps = ["LinearAlgebra", "MacroTools", "Parameters", "SIMDPirates", "SLEEFPirates", "VectorizationBase"]
git-tree-sha1 = "a8d158a4971113443269739f3cc51ae992e95a58"
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
version = "0.3.6"
[[SIMDPirates]]
deps = ["MacroTools", "VectorizationBase"]
git-tree-sha1 = "500294a8b1001bdda2483fc6d675956798ad8764"
uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a"
version = "0.1.6"
[[SLEEFPirates]]
deps = ["SIMDPirates", "VectorizationBase"]
git-tree-sha1 = "4733445246d3d5536c7aee1bffb55ab37b88347b"
uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
version = "0.1.3"
[[VectorizationBase]]
deps = ["CpuId", "LinearAlgebra"]
git-tree-sha1 = "a2576763aa20968ffb5668e2e15d45ae8e364d05"
uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
version = "0.1.9"
using LoopVectorization
using Base: OneTo
using LinearAlgebra: stride1
using Test
add_1_dim(x::AbstractArray) = reshape(x, size(x)..., 1)
check_finite(x::AbstractArray) = all(isfinite.(x)) || throw(error("x not finite!"))
"Given k-dimensional array `x` where `n=size(x,k)`, compute multinomial logistic Pr(i ∈ 1:n | x[:, ..., :, k] )"
function softmax3!(q::AA, lse::A, tmpmax::A, x::AA, maxk=size(q, ndims(q)) ) where {T<:Real, A<:Array{T}, AA<:AbstractArray{T}}
ndims(q) == 1+ndims(lse) || throw(DimensionMismatch())
xsizes = size(x)
xsizes == size(q) || throw(DimensionMismatch("size(x) = $(size(x)) but size(q) = $(size(q))"))
nk = last(xsizes)
for i = OneTo(ndims(lse))
size(q,i) == size(lse,i) == size(tmpmax,i) || throw(DimensionMismatch("size(x) = $(size(x)), size(lse) = $(size(lse)), and size(tmpmax) = $(size(tmpmax))"))
end
0 < maxk <= nk || throw(DomainError(maxk))
1 == stride1(q) == stride1(x) || throw(error("Arrays not strided"))
isempty(x) && throw(error("x empty"))
check_finite(x)
maximum!(add_1_dim(tmpmax), x)
fill!(lse, zero(T))
xx = reshape(x, :, nk)
qq = reshape(q, :, nk)
for k in OneTo(nk)
for i in eachindex(lse)
tmp = exp(xx[i,k] - tmpmax[i])
lse[i] += tmp
k <= maxk && (qq[i,k] = tmp)
end
end
qq[:,OneTo(maxk)] ./= vec(lse)
end
function softmax3_avx_broken!(q::AA, lse::A, tmpmax::A, x::AA, maxk=size(q, ndims(q)) ) where {T<:Real, A<:Array{T}, AA<:AbstractArray{T}}
ndims(q) == 1+ndims(lse) || throw(DimensionMismatch())
xsizes = size(x)
xsizes == size(q) || throw(DimensionMismatch("size(x) = $(size(x)) but size(q) = $(size(q))"))
nk = last(xsizes)
for i = OneTo(ndims(lse))
size(q,i) == size(lse,i) == size(tmpmax,i) || throw(DimensionMismatch("size(x) = $(size(x)), size(lse) = $(size(lse)), and size(tmpmax) = $(size(tmpmax))"))
end
0 < maxk <= nk || throw(DomainError(maxk))
1 == stride1(q) == stride1(x) || throw(error("Arrays not strided"))
isempty(x) && throw(error("x empty"))
check_finite(x)
maximum!(add_1_dim(tmpmax), x)
fill!(lse, zero(T))
xx = reshape(x, :, nk)
qq = reshape(q, :, nk)
for k in OneTo(maxk)
@avx for i in eachindex(lse)
tmp = exp(xx[i,k] - tmpmax[i])
lse[i] += tmp
# FIXME - would prefer to replace 2nd loop w/ if stmt
# k <= maxk && (qq[i,k] = tmp)
qq[i,k] = tmp
end
end
for k in maxk+1:nk
@avx for i in eachindex(lse)
tmp = exp(xx[i,k] - tmpmax[i])
lse[i] += tmp
end
end
qq[:,OneTo(maxk)] ./= vec(lse)
end
function softmax3_avx_fixed!(q::AA, lse::A, tmpmax::A, x::AA, maxk=size(q, ndims(q)) ) where {T<:Real, A<:Array{T}, AA<:AbstractArray{T}}
ndims(q) == 1+ndims(lse) || throw(DimensionMismatch())
xsizes = size(x)
xsizes == size(q) || throw(DimensionMismatch("size(x) = $(size(x)) but size(q) = $(size(q))"))
nk = last(xsizes)
for i = OneTo(ndims(lse))
size(q,i) == size(lse,i) == size(tmpmax,i) || throw(DimensionMismatch("size(x) = $(size(x)), size(lse) = $(size(lse)), and size(tmpmax) = $(size(tmpmax))"))
end
0 < maxk <= nk || throw(DomainError(maxk))
1 == stride1(q) == stride1(x) || throw(error("Arrays not strided"))
isempty(x) && throw(error("x empty"))
check_finite(x)
maximum!(add_1_dim(tmpmax), x)
fill!(lse, zero(T))
xx = reshape(x, :, nk)
qq = reshape(q, :, nk)
tmpmaxvec = vec(tmpmax) # Always required??
lsevec = vec(lse) # Always required??
for k in OneTo(maxk)
qk = view(qq, :, k) # required if using a View
xk = view(xx, :, k) # required if using a View
@avx for i in eachindex(lsevec)
tmp = exp(xk[i] - tmpmaxvec[i])
lsevec[i] += tmp
qk[i] = tmp
end
end
for k in maxk+1:nk
qk = view(qq, :, k)
xk = view(xx, :, k)
@avx for i in eachindex(lsevec)
tmp = exp(xk[i] - tmpmaxvec[i])
lsevec[i] += tmp
end
end
qq[:,OneTo(maxk)] ./= vec(lse)
end
ni, nj, nk = (100, 100, 10)
x = rand(ni, nj, nk)
q = similar(x)
tmpmax = zeros(ni,nj)
lse = similar(tmpmax)
for f! in (softmax3!, softmax3_avx_fixed!, softmax3_avx_broken!)
@testset "$(f!) with arrays" begin
fill!(q,0)
fill!(lse,0)
@show @code_warntype f!(q, lse, tmpmax, x, 1)
f!(q, lse, tmpmax, x, 1)
@test all(sum(q; dims=3) .<= 1)
fill!(q,0)
fill!(lse,0)
f!(q, lse, tmpmax, x)
@test sum(q; dims=3) ≈ ones(ni,nj)
end
@testset "$(f!) with views" begin
nkm1 = nk-1
@views qvw = q[:,:,1:nkm1]
@views xvw = x[:,:,1:nkm1]
fill!(q,0)
fill!(lse,0)
@show @code_warntype f!(qvw, lse, tmpmax, xvw, 1)
f!(qvw, lse, tmpmax, xvw, 1)
@test all(sum(qvw; dims=3) .<= 1)
fill!(q,0)
fill!(lse,0)
f!(qvw, lse, tmpmax, xvw)
@test sum(qvw; dims=3) ≈ ones(ni,nj)
end
end
breakage is of 2 forms
The vector/matrix issue:
#= D:\libraries\julia\dev\ShaleDrillingLikelihood\example\softmax3test.jl:136 =# @code_warntype(f!(q, lse, tmpmax, x, 1)) = nothing
softmax3_avx_broken! with arrays: Error During Test at D:\libraries\julia\dev\ShaleDrillingLikelihood\example\softmax3test.jl:133
Got exception outside of a @test
AssertionError: Ni == N + 1
Stacktrace:
[1] packed_strided_ptr_index(::Core.SimpleVector, ::Int64, ::Type{Float64}) at D:\libraries\julia\packages\SIMDPirates\nxDqU\src\memory.jl:996
[2] #s28#117(::Any, ::Any, ::Any, ::Any, ::Any, ::Any) at D:\libraries\julia\packages\SIMDPirates\nxDqU\src\memory.jl:1020
[3] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any,N} where N) at .\boot.jl:524
[4] macro expansion at .\gcutils.jl:91 [inlined]
[5] softmax3_avx_broken!(::Array{Float64,3}, ::Array{Float64,2}, ::Array{Float64,2}, ::Array{Float64,3}, ::Int64) at D:\libraries\julia\dev\ShaleDrillingLikelihood\example\softmax3test.jl:62
[6] top-level scope at D:\libraries\julia\dev\ShaleDrillingLikelihood\example\softmax3test.jl:137
[7] top-level scope at D:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.3\Test\src\Test.jl:1107
[8] top-level scope at D:\libraries\julia\dev\ShaleDrillingLikelihood\example\softmax3test.jl:134
[9] include_string(::Module, ::String, ::String) at .\loading.jl:1075
[10] include_string(::Module, ::String, ::String, ::Int64) at D:\libraries\julia\packages\CodeTools\sf1Tz\src\eval.jl:30
[11] (::Atom.var"#127#132"{String,Int64,String,Bool})() at D:\libraries\julia\packages\Atom\X8fAI\src\eval.jl:94
[12] withpath(::Atom.var"#127#132"{String,Int64,String,Bool}, ::String) at D:\libraries\julia\packages\CodeTools\sf1Tz\src\utils.jl:30
[13] withpath(::Function, ::String) at D:\libraries\julia\packages\Atom\X8fAI\src\eval.jl:47
[14] #126 at D:\libraries\julia\packages\Atom\X8fAI\src\eval.jl:93 [inlined]
[15] with_logstate(::Atom.var"#126#131"{String,Int64,String,Bool}, ::Base.CoreLogging.LogState) at .\logging.jl:395
[16] with_logger at .\logging.jl:491 [inlined]
[17] #125 at D:\libraries\julia\packages\Atom\X8fAI\src\eval.jl:92 [inlined]
[18] hideprompt(::Atom.var"#125#130"{String,Int64,String,Bool}) at D:\libraries\julia\packages\Atom\X8fAI\src\repl.jl:85
[19] macro expansion at D:\libraries\julia\packages\Atom\X8fAI\src\eval.jl:91 [inlined]
[20] macro expansion at D:\libraries\julia\packages\Media\ItEPc\src\dynamic.jl:24 [inlined]
[21] (::Atom.var"#124#129")(::Dict{String,Any}) at D:\libraries\julia\packages\Atom\X8fAI\src\eval.jl:86
[22] handlemsg(::Dict{String,Any}, ::Dict{String,Any}) at D:\libraries\julia\packages\Atom\X8fAI\src\comm.jl:164
[23] (::Atom.var"#19#21"{Array{Any,1}})() at .\task.jl:333
and
LoadError: "Don't know how to handle expression:\nk <= maxk && (qq[i, k] = tmp)"
in expression starting at D:\libraries\julia\dev\ShaleDrillingLikelihood\example\softmax3test.jl:62
push!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at graphs.jl:821
add_block!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at graphs.jl:280
add_loop!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at graphs.jl:347
add_loop! at graphs.jl:344 [inlined]
copyto! at constructors.jl:6 [inlined]
LoopVectorization.LoopSet(::Expr) at constructors.jl:46
@avx(::LineNumberNode, ::Module, ::Any) at constructors.jl:86
Thanks a lot for your package, it seems very nice!
I am writing to you because when I want to study a package, I like to read the documentation. The links in the README.md to the docs are not working.
Also, it would be nice if part of the examples and documentation from the Discourse forum were not only in the documentation, but a short version were also added to the README.md.
With a vector a=rand(1000)
, this works:
julia> @avx @. sin(a) + sqrt(a);
but this doesn't:
julia> @avx @. 3*a + sin(a) + sqrt(a);
ERROR: MethodError: no method matching add_broadcast!(::LoopVectorization.LoopSet, ::Symbol, ::Symbol, ::Array{Symbol,1}, ::Type{Int64})
Hi @chriselrod,
something seem to be broken in version v0.3.10. i get
"Don't know how to handle expression:\nzre += xre[i] * Are[i, k] - xim[i] * Aim[i, k]"
Stacktrace:
[1] push!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/graphs.jl:539
[2] add_block!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/graphs.jl:310
[3] add_loop!(::LoopVectorization.LoopSet, ::Expr, ::Int64) at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/graphs.jl:401
[4] add_loop! at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/graphs.jl:398 [inlined]
[5] copyto! at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/constructors.jl:6 [inlined]
[6] LoopVectorization.LoopSet(::Expr) at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/constructors.jl:45
[7] @avx(::LineNumberNode, ::Module, ::Any) at /home/moeddel/.julia/packages/LoopVectorization/WdP5f/src/constructors.jl:93
when executing
using LoopVectorization
function test_avx(xre::Vector{T}, xim::Vector{T}, Are::Matrix{T}, Aim::Matrix{T}, k::Int) where {T<:AbstractFloat}
zre = zero(T)
zim = zero(T)
@avx for i ∈ 1:length(xre)
zre += xre[i]*Are[i,k] - xim[i]*Aim[i,k]
zim += xre[i]*Aim[i,k] + xim[i]*Are[i,k]
end
return Complex{T}(zre,zim)
end
function test_avx(x::StructVector{Complex{T}}, A::StructArray{Complex{T},2}, k::Int) where {T<:AbstractFloat}
return test_avx(x.re, x.im, A.re , A.im, k)
end
test_avx(b, A, k)
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.