I hope I'm not bothering you ;) MWE:

ERROR: MethodError: no method matching promote_vtype(::Type{VectorizationBase._MM{8}}, ::Type{Int32}) about loopvectorization.jl HOT 1 CLOSED

zsoerenm commented on July 16, 2024

ERROR: MethodError: no method matching promote_vtype(::Type{VectorizationBase._MM{8}}, ::Type{Int32})

from loopvectorization.jl.

Comments (1)

chriselrod commented on July 16, 2024 1

Thanks for all the bug reports!
You're helping make the software more robust.
I fixed it on SIMDPirates master.

julia> using LoopVectorization

julia> function gen_phases(phases::Vector{Int16}, freq, start_phase)
           # Fill `phases` in place with fixed-point phase values, with the loop
           # wrapped in LoopVectorization's `@avx` macro.
           # Each Int32 result is shifted right by `fp` bits before the Int16 store.
           fp = 21
           # `<<` binds tighter than `*` in Julia, so the scale factor is
           # 1 << (fp + 9) == 2^30 and this is floor(Int32, freq * 2^30).
           delta = floor(Int32, freq * 1 << (fp + 9))
           # Int32 range, so `idxs[i] * delta` below is an Int32 product.
           idxs = Int32(1):Int32(length(phases))
           # Start phase scaled by the same 2^30 factor as `delta`.
           fixed_point_start_phase = floor(Int32, start_phase * 1 << (fp + 9))
           @avx for i = 1:length(phases)
               # An Int32 shifted right by 21 lies in -1024:1023, which fits in Int16.
               phases[i] = Int16((idxs[i] * delta + fixed_point_start_phase) >> fp)
           end
       end
gen_phases (generic function with 1 method)

julia> phases = Vector{Int16}(undef, 1000);

julia> gen_phases(phases, 0.02, 0.1)

julia> function gen_phases2(phases::Vector{Int16}, freq, start_phase)
           # Fill `phases` in place with fixed-point phase values, using a plain
           # `@inbounds` loop (no LoopVectorization macro).
           # Each Int32 result is shifted right by `fp` bits before the Int16 store.
           fp = 21
           # `<<` binds tighter than `*` in Julia, so the scale factor is
           # 1 << (fp + 9) == 2^30 and this is floor(Int32, freq * 2^30).
           delta = floor(Int32, freq * 1 << (fp + 9))
           # Int32 range, so `idxs[i] * delta` below is an Int32 product.
           idxs = Int32(1):Int32(length(phases))
           # Start phase scaled by the same 2^30 factor as `delta`.
           fixed_point_start_phase = floor(Int32, start_phase * 1 << (fp + 9))
           @inbounds for i = 1:length(phases)
               # Julia's Int32 arithmetic wraps on overflow.
               # NOTE(review): presumably the wrap-around is the intended phase wrap - confirm.
               # An Int32 shifted right by 21 lies in -1024:1023, so Int16(...) cannot throw.
               phases[i] = Int16((idxs[i] * delta + fixed_point_start_phase) >> fp)
           end
       end
gen_phases2 (generic function with 1 method)

julia> phases2 = Vector{Int16}(undef, 1000);

julia> gen_phases2(phases2, 0.02, 0.1)

julia> phases == phases2
true

julia> using BenchmarkTools

julia> @benchmark gen_phases2($phases2, 0.02, 0.1)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     79.582 ns (0.00% GC)
  median time:      80.519 ns (0.00% GC)
  mean time:        81.743 ns (0.00% GC)
  maximum time:     121.466 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     968

julia> @benchmark gen_phases($phases, 0.02, 0.1)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     69.748 ns (0.00% GC)
  median time:      71.652 ns (0.00% GC)
  mean time:        71.644 ns (0.00% GC)
  maximum time:     110.123 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     976

I suspect this code may need AVX512 to be faster with @avx, because @inbounds also vectorizes the loop, meaning it is a matter of which generates better code.
@avx:

L240:
	leal	1(%rcx), %eax
	vpbroadcastd	%eax, %zmm3
	vpaddd	%zmm2, %zmm3, %zmm3
	vpmulld	%zmm0, %zmm3, %zmm3
	vpaddd	%zmm1, %zmm3, %zmm3
	vpsrad	$21, %zmm3, %zmm3
	vpmovdw	%zmm3, (%r9,%rcx,2)
	addq	$16, %rcx
	cmpq	%rsi, %rcx
	jl	L240

The loop is not unrolled beyond vectorization, meaning each iteration of the assembly loop above performs 16 iterations of the loop as written.
It uses vpmovdw to down-convert from 32-bit ints to 16-bit ints and store them.
With @inbounds:

L336:
	vpaddd	%zmm4, %zmm3, %zmm6
	vpaddd	%zmm4, %zmm2, %zmm7
	vpmulld	%zmm0, %zmm2, %zmm8
	vpmulld	%zmm0, %zmm3, %zmm9
	vpmulld	%zmm0, %zmm7, %zmm7
	vpmulld	%zmm0, %zmm6, %zmm6
	vpaddd	%zmm1, %zmm9, %zmm9
	vpaddd	%zmm1, %zmm8, %zmm8
	vpaddd	%zmm1, %zmm6, %zmm6
	vpaddd	%zmm1, %zmm7, %zmm7
	vpsrad	$21, %zmm8, %zmm8
	vpsrad	$21, %zmm9, %zmm9
	vpsrad	$21, %zmm7, %zmm7
	vpsrad	$21, %zmm6, %zmm6
	vpmovdw	%zmm9, %ymm9
	vpmovdw	%zmm8, %ymm8
	vinserti64x4	$1, %ymm8, %zmm9, %zmm8
	vpmovdw	%zmm6, %ymm6
	vpmovdw	%zmm7, %ymm7
	vinserti64x4	$1, %ymm7, %zmm6, %zmm6
	vmovdqu64	%zmm8, (%rax,%rdx,2)
	vmovdqu64	%zmm6, 64(%rax,%rdx,2)
	addq	$64, %rdx
	vpaddd	%zmm5, %zmm2, %zmm2
	vpaddd	%zmm5, %zmm3, %zmm3
	cmpq	%rdx, %r11
	jne	L336

It follows the same strategy as with AVX2: it down-converts two registers and combines the results into a single register, which it then stores. The loop is also unrolled 4x, so each iteration of the assembly loop above performs 64 iterations of the loop as written.

I tried the code you wrote yesterday, and found the same thing. In both cases AVX512 was faster with @avx than with @inbounds, while AVX2 had the opposite behavior.

from loopvectorization.jl.

ERROR: MethodError: no method matching promote_vtype(::Type{VectorizationBase._MM{8}}, ::Type{Int32}) about loopvectorization.jl HOT 1 CLOSED

Comments (1)

Related Issues (20)

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent