Edit: Working Solution at bottom LLVM info: 11.0.1 (43ff75f2c3feef64

CUDA kernels not being autodiff'd,about enzymead/enzyme

Comments (7)

proteneer commented on May 21, 2024 1

Thanks for the prompt reply, I've modified the code so that it calls other device code, unfortunately I still seem to be getting unresolved extern errors:

07:42 $ clang -c test3.cu -Xclang -load -Xclang /home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so -O2 -fno-vectorize -fno-unroll-loops --cuda-gpu-arch=sm_70
ptxas fatal   : Unresolved extern function '_Z17__enzyme_autodiffPFvPdS_EiS_S_iS_S_'

#include <stdio.h>

void __device__ foo_impl(double* x_in, double *x_out) {
    x_out[0] = x_in[0] * x_in[0];    
}

typedef void (*f_ptr)(double*, double*);

extern void __device__ __enzyme_autodiff(f_ptr,
    int, double*, double*,
    int, double*, double*);

void __global__ foo(double* x_in, double *x_out) {
    foo_impl(x_in, x_out);
}

void __global__ foo_grad(double* x, double *d_x, double *y, double *d_y) {

    int enzyme_dup;
    int enzyme_out;
    int enzyme_const;

    __enzyme_autodiff(foo_impl,
        enzyme_dup, x, d_x,
        enzyme_dup, y, d_y);

}

int main() {

    double *x, *d_x, *y, *d_y; // all on the devic

    cudaMalloc(&x, sizeof(*x));
    cudaMalloc(&d_x, sizeof(*d_x));
    cudaMalloc(&y, sizeof(*y));
    cudaMalloc(&d_y, sizeof(*d_y));

    double host_x = 1.4;
    double host_d_x = 0.0;
    double host_y;
    double host_d_y = 1.0;

    cudaMemcpy(x, &host_x, sizeof(*x), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, &host_d_x, sizeof(*d_x), cudaMemcpyHostToDevice);
    cudaMemcpy(y, &host_y, sizeof(*y), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, &host_d_y, sizeof(*d_y), cudaMemcpyHostToDevice);

    foo<<<1,1>>>(x, y);

    cudaMemcpy(&host_x, x, sizeof(*x), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_d_x, d_x, sizeof(*d_x), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_y, y, sizeof(*y), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_d_y, d_y, sizeof(*d_y), cudaMemcpyDeviceToHost);

    printf("%f %f\n", host_x, host_y);
    printf("%f %f\n", host_d_x, host_d_y);

}

from enzyme.

proteneer commented on May 21, 2024 1

Thank you, that solved the issue, and I'm pleased to report successful compilation and correct derivatives.

Would you like me to submit a PR detailing what was discussed here into the "Getting Started" guide?

Here's a fully contained example:

#include <stdio.h>

// one-liner
// clang test3.cu -Xclang -load -Xclang /home/yutong/Code/Enzyme/enzyme/build/Enzyme/ClangEnzyme-11.so -O2 -fno-vectorize -fno-unroll-loops -fPIC --cuda-gpu-arch=sm_70 -lcudart -L/usr/local/cuda-10.1/lib64
void __device__ foo_impl(double* x_in, double *x_out) {
    x_out[0] = x_in[0] * x_in[0];    
}

typedef void (*f_ptr)(double*, double*);

extern void __device__ __enzyme_autodiff(f_ptr,
    int, double*, double*,
    int, double*, double*
);

void __global__ foo(double* x_in, double *x_out) {
    foo_impl(x_in, x_out);
}

int __device__ enzyme_dup;
int __device__ enzyme_out;
int __device__ enzyme_const;

void __global__ foo_grad(double* x, double *d_x, double *y, double *d_y) {

    __enzyme_autodiff(foo_impl,
        enzyme_dup, x, d_x,
        enzyme_dup, y, d_y);

}

int main() {

    double *x, *d_x, *y, *d_y; // all on the devic

    cudaMalloc(&x, sizeof(*x));
    cudaMalloc(&d_x, sizeof(*d_x));
    cudaMalloc(&y, sizeof(*y));
    cudaMalloc(&d_y, sizeof(*d_y));

    double host_x = 1.4;
    double host_d_x = 0.0;
    double host_y;
    double host_d_y = 1.0;

    cudaMemcpy(x, &host_x, sizeof(*x), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, &host_d_x, sizeof(*d_x), cudaMemcpyHostToDevice);
    cudaMemcpy(y, &host_y, sizeof(*y), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, &host_d_y, sizeof(*d_y), cudaMemcpyHostToDevice);

    // foo<<<1,1>>>(x, y); fwd-pass only
    foo_grad<<<1,1>>>(x, d_x, y, d_y); // fwd and bkwd pass

    cudaDeviceSynchronize();

    cudaMemcpy(&host_x, x, sizeof(*x), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_d_x, d_x, sizeof(*d_x), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_y, y, sizeof(*y), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_d_y, d_y, sizeof(*d_y), cudaMemcpyDeviceToHost);

    printf("%f %f\n", host_x, host_y);
    printf("%f %f\n", host_d_x, host_d_y);

}

from enzyme.

wsmoses commented on May 21, 2024 1

Go for it, the relevant branch is "www". Also note that the load into Clang doesn't ensure that optimizations are always run before and after Enzyme which may have performance implications.

from enzyme.

proteneer commented on May 21, 2024

Manually invoking AD on the intermediate test3.ll file results in a segfault (truncated log):

; Function Attrs: norecurse uwtable
define internal void @diffe_Z18__device_stub__fooPdS_.2(double* %x_in, double* %"x_in'", double* %x_out, double* %"x_out'", i8* %tapeArg) #0 {
entry:
  %malloccall2 = tail call i8* @malloc(i64 12)
  %"malloccall2'mi" = phi i8* 
  %grid_dim = bitcast i8* %malloccall2 to %struct.dim3*
  %malloccall3 = tail call i8* @malloc(i64 12)
  %"malloccall3'mi" = phi i8* 
  %block_dim = bitcast i8* %malloccall3 to %struct.dim3*
  %malloccall1 = tail call i8* @malloc(i64 8)
  %"malloccall1'mi" = phi i8* 
  %x_out.addr = bitcast i8* %malloccall1 to double**
  %malloccall4 = tail call i8* @malloc(i64 8)
  %"malloccall4'mi" = phi i8* 
  %shmem_size = bitcast i8* %malloccall4 to i64*
  %malloccall = tail call i8* @malloc(i64 8)
  %"malloccall'mi" = phi i8* 
  %x_in.addr = bitcast i8* %malloccall to double**
  %malloccall5 = tail call i8* @malloc(i64 8)
  %"malloccall5'mi" = phi i8* 
  %stream = bitcast i8* %malloccall5 to i8**
  store double* %x_in, double** %x_in.addr, align 8, !tbaa !4
  store double* %x_out, double** %x_out.addr, align 8, !tbaa !4
  %malloccall6 = tail call i8* @malloc(i64 16)
  %"malloccall6'mi" = phi i8* 
  %"kernel_args3'ipc" = bitcast i8* %"malloccall6'mi" to [2 x i8*]*
  %kernel_args3 = bitcast i8* %malloccall6 to [2 x i8*]*
  %"kernel_args3.sub'ipg" = getelementptr inbounds [2 x i8*], [2 x i8*]* %"kernel_args3'ipc", i64 0, i64 0
  %kernel_args3.sub = getelementptr inbounds [2 x i8*], [2 x i8*]* %kernel_args3, i64 0, i64 0
  %0 = bitcast [2 x i8*]* %kernel_args3 to double***
  store double** %x_in.addr, double*** %0, align 16
  %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %kernel_args3, i64 0, i64 1
  %2 = bitcast i8** %1 to double***
  store double** %x_out.addr, double*** %2, align 8
  %3 = call i32 @__cudaPopCallConfiguration(%struct.dim3* nonnull %grid_dim, %struct.dim3* nonnull %block_dim, i64* nonnull %shmem_size, i8** nonnull %stream)
  %"'ip_phi" = phi i32 
  %4 = load i64, i64* %shmem_size, align 8
  %"'il_phi" = phi i64 
  %5 = bitcast i8** %stream to %struct.CUstream_st**
  %6 = load %struct.CUstream_st*, %struct.CUstream_st** %5, align 8
  %"'il_phi1" = phi %struct.CUstream_st* 
  %grid_dim.coerce.sroa.0.0..sroa_cast = bitcast %struct.dim3* %grid_dim to i64*
  %grid_dim.coerce.sroa.0.0.copyload = load i64, i64* %grid_dim.coerce.sroa.0.0..sroa_cast, align 8
  %"grid_dim.coerce.sroa.0.0.copyload'il_phi" = phi i64 
  %grid_dim.coerce.sroa.2.0..sroa_idx2 = getelementptr inbounds %struct.dim3, %struct.dim3* %grid_dim, i64 0, i32 2
  %grid_dim.coerce.sroa.2.0.copyload = load i32, i32* %grid_dim.coerce.sroa.2.0..sroa_idx2, align 8
  %"grid_dim.coerce.sroa.2.0.copyload'il_phi" = phi i32 
  %block_dim.coerce.sroa.0.0..sroa_cast = bitcast %struct.dim3* %block_dim to i64*
  %block_dim.coerce.sroa.0.0.copyload = load i64, i64* %block_dim.coerce.sroa.0.0..sroa_cast, align 8
  %"block_dim.coerce.sroa.0.0.copyload'il_phi" = phi i64 
  %block_dim.coerce.sroa.2.0..sroa_idx1 = getelementptr inbounds %struct.dim3, %struct.dim3* %block_dim, i64 0, i32 2
  %block_dim.coerce.sroa.2.0.copyload = load i32, i32* %block_dim.coerce.sroa.2.0..sroa_idx1, align 8
  %"block_dim.coerce.sroa.2.0.copyload'il_phi" = phi i32 
  %call = call i32 @cudaLaunchKernel(i8* bitcast (void (double*, double*)* @_Z18__device_stub__fooPdS_ to i8*), i64 %grid_dim.coerce.sroa.0.0.copyload, i32 %grid_dim.coerce.sroa.2.0.copyload, i64 %block_dim.coerce.sroa.0.0.copyload, i32 %block_dim.coerce.sroa.2.0.copyload, i8** nonnull %kernel_args3.sub, i64 %4, %struct.CUstream_st* %6)
  %"call'ip_phi" = phi i32 
  br label %invertentry

allocsForInversion:                               ; No predecessors!

invertentry:                                      ; preds = %entry
}

declare i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) local_unnamed_addr

opt: ../Enzyme/AdjointGenerator.h:3480: void AdjointGenerator<AugmentedReturnType>::visitCallInst(llvm::CallInst&) [with AugmentedReturnType = const AugmentedReturn*]: Assertion `subdata' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace.
Stack dump:
0.	Program arguments: opt test3.ll -load=/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so -enzyme -o output.ll -S 
1.	Running pass 'Enzyme Pass' on module 'test3.ll'.
 #0 0x000055da2bfcc85a llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/home/yutong/local/bin/opt+0x193e85a)
 #1 0x000055da2bfca564 llvm::sys::RunSignalHandlers() (/home/yutong/local/bin/opt+0x193c564)
 #2 0x000055da2bfca6b3 SignalHandler(int) (/home/yutong/local/bin/opt+0x193c6b3)
 #3 0x00007f3db88e7890 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12890)
 #4 0x00007f3db77c2e97 raise /build/glibc-OTsEL5/glibc-2.27/signal/../sysdeps/unix/sysv/linux/raise.c:51:0
 #5 0x00007f3db77c4801 abort /build/glibc-OTsEL5/glibc-2.27/stdlib/abort.c:81:0
 #6 0x00007f3db77b439a __assert_fail_base /build/glibc-OTsEL5/glibc-2.27/assert/assert.c:89:0
 #7 0x00007f3db77b4412 (/lib/x86_64-linux-gnu/libc.so.6+0x30412)
 #8 0x00007f3db733c810 AdjointGenerator<AugmentedReturn const*>::visitCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x45d810)
 #9 0x00007f3db7333fe9 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::delegateCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x454fe9)
#10 0x00007f3db7314e0b llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visitCall(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x435e0b)
#11 0x00007f3db7306d97 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x427d97)
#12 0x00007f3db72fb383 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction*) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x41c383)
#13 0x00007f3db72dcf94 CreatePrimalAndGradient(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, bool, bool, llvm::Type*, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, AugmentedReturn const*, bool, bool, bool) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3fdf94)
#14 0x00007f3db739f68c GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4c068c)
#15 0x00007f3db739fc53 GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4c0c53)
#16 0x00007f3db73271bc AdjointGenerator<AugmentedReturn*>::visitCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4481bc)
#17 0x00007f3db731f5d3 llvm::InstVisitor<AdjointGenerator<AugmentedReturn*>, void>::delegateCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4405d3)
#18 0x00007f3db7310a01 llvm::InstVisitor<AdjointGenerator<AugmentedReturn*>, void>::visitCall(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x431a01)
#19 0x00007f3db7303ea7 llvm::InstVisitor<AdjointGenerator<AugmentedReturn*>, void>::visit(llvm::Instruction&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x424ea7)
#20 0x00007f3db72f8dad llvm::InstVisitor<AdjointGenerator<AugmentedReturn*>, void>::visit(llvm::Instruction*) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x419dad)
#21 0x00007f3db72d469e CreateAugmentedPrimal(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, bool, bool, bool, bool) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3f569e)
#22 0x00007f3db739f5c3 GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4c05c3)
#23 0x00007f3db739fc53 GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4c0c53)
#24 0x00007f3db733ba8f AdjointGenerator<AugmentedReturn const*>::visitCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x45ca8f)
#25 0x00007f3db7333fe9 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::delegateCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x454fe9)
#26 0x00007f3db7314e0b llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visitCall(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x435e0b)
#27 0x00007f3db7306d97 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x427d97)
#28 0x00007f3db72fb383 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction*) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x41c383)
#29 0x00007f3db72dcf94 CreatePrimalAndGradient(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, bool, bool, llvm::Type*, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, AugmentedReturn const*, bool, bool, bool) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3fdf94)
#30 0x00007f3db72bcc9b HandleAutoDiff(llvm::CallInst*, llvm::TargetLibraryInfo&, llvm::AAResults&, bool) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3ddc9b)
#31 0x00007f3db72bdc17 (anonymous namespace)::Enzyme::lowerEnzymeCalls(llvm::Function&, bool, bool&, std::set<llvm::Function*, std::less<llvm::Function*>, std::allocator<llvm::Function*> >&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3dec17)
#32 0x00007f3db72bddb3 (anonymous namespace)::Enzyme::runOnModule(llvm::Module&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3dedb3)
#33 0x000055da2b8393b1 llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/yutong/local/bin/opt+0x11ab3b1)
#34 0x000055da2ab41ddc main (/home/yutong/local/bin/opt+0x4b3ddc)
#35 0x00007f3db77a5b97 __libc_start_main /build/glibc-OTsEL5/glibc-2.27/csu/../csu/libc-start.c:344:0
#36 0x000055da2abb639a _start (/home/yutong/local/bin/opt+0x52839a)
Aborted (core dumped)

Full test3.ll file

; ModuleID = 'test3.cu'
source_filename = "test3.cu"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%struct.dim3 = type { i32, i32, i32 }
%struct.CUstream_st = type opaque

@enzyme_dup = local_unnamed_addr global i32 0, align 4
@enzyme_out = local_unnamed_addr global i32 0, align 4
@enzyme_const = local_unnamed_addr global i32 0, align 4
@.str = private unnamed_addr constant [7 x i8] c"%f %f\0A\00", align 1

; Function Attrs: norecurse uwtable
define void @_Z18__device_stub__fooPdS_(double* %x_in, double* %x_out) #0 {
entry:
  %x_in.addr = alloca double*, align 8
  %x_out.addr = alloca double*, align 8
  %grid_dim = alloca %struct.dim3, align 8
  %block_dim = alloca %struct.dim3, align 8
  %shmem_size = alloca i64, align 8
  %stream = alloca i8*, align 8
  store double* %x_in, double** %x_in.addr, align 8, !tbaa !4
  store double* %x_out, double** %x_out.addr, align 8, !tbaa !4
  %kernel_args3 = alloca [2 x i8*], align 16
  %kernel_args3.sub = getelementptr inbounds [2 x i8*], [2 x i8*]* %kernel_args3, i64 0, i64 0
  %0 = bitcast [2 x i8*]* %kernel_args3 to double***
  store double** %x_in.addr, double*** %0, align 16
  %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %kernel_args3, i64 0, i64 1
  %2 = bitcast i8** %1 to double***
  store double** %x_out.addr, double*** %2, align 8
  %3 = call i32 @__cudaPopCallConfiguration(%struct.dim3* nonnull %grid_dim, %struct.dim3* nonnull %block_dim, i64* nonnull %shmem_size, i8** nonnull %stream)
  %4 = load i64, i64* %shmem_size, align 8
  %5 = bitcast i8** %stream to %struct.CUstream_st**
  %6 = load %struct.CUstream_st*, %struct.CUstream_st** %5, align 8
  %grid_dim.coerce.sroa.0.0..sroa_cast = bitcast %struct.dim3* %grid_dim to i64*
  %grid_dim.coerce.sroa.0.0.copyload = load i64, i64* %grid_dim.coerce.sroa.0.0..sroa_cast, align 8
  %grid_dim.coerce.sroa.2.0..sroa_idx2 = getelementptr inbounds %struct.dim3, %struct.dim3* %grid_dim, i64 0, i32 2
  %grid_dim.coerce.sroa.2.0.copyload = load i32, i32* %grid_dim.coerce.sroa.2.0..sroa_idx2, align 8
  %block_dim.coerce.sroa.0.0..sroa_cast = bitcast %struct.dim3* %block_dim to i64*
  %block_dim.coerce.sroa.0.0.copyload = load i64, i64* %block_dim.coerce.sroa.0.0..sroa_cast, align 8
  %block_dim.coerce.sroa.2.0..sroa_idx1 = getelementptr inbounds %struct.dim3, %struct.dim3* %block_dim, i64 0, i32 2
  %block_dim.coerce.sroa.2.0.copyload = load i32, i32* %block_dim.coerce.sroa.2.0..sroa_idx1, align 8
  %call = call i32 @cudaLaunchKernel(i8* bitcast (void (double*, double*)* @_Z18__device_stub__fooPdS_ to i8*), i64 %grid_dim.coerce.sroa.0.0.copyload, i32 %grid_dim.coerce.sroa.2.0.copyload, i64 %block_dim.coerce.sroa.0.0.copyload, i32 %block_dim.coerce.sroa.2.0.copyload, i8** nonnull %kernel_args3.sub, i64 %4, %struct.CUstream_st* %6)
  ret void
}

declare i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) local_unnamed_addr

declare i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) local_unnamed_addr

; Function Attrs: norecurse uwtable
define i32 @main() local_unnamed_addr #0 {
entry:
  %x = alloca double*, align 8
  %d_x = alloca double*, align 8
  %y = alloca double*, align 8
  %d_y = alloca double*, align 8
  %host_x = alloca double, align 8
  %host_d_x = alloca double, align 8
  %host_y = alloca double, align 8
  %host_d_y = alloca double, align 8
  %0 = bitcast double** %x to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %0) #4
  %1 = bitcast double** %d_x to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %1) #4
  %2 = bitcast double** %y to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %2) #4
  %3 = bitcast double** %d_y to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %3) #4
  %4 = bitcast double** %x to i8**
  %call.i = call i32 @cudaMalloc(i8** nonnull %4, i64 8)
  %5 = bitcast double** %d_x to i8**
  %call.i18 = call i32 @cudaMalloc(i8** nonnull %5, i64 8)
  %6 = bitcast double** %y to i8**
  %call.i19 = call i32 @cudaMalloc(i8** nonnull %6, i64 8)
  %7 = bitcast double** %d_y to i8**
  %call.i20 = call i32 @cudaMalloc(i8** nonnull %7, i64 8)
  %8 = bitcast double* %host_x to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %8) #4
  store double 1.400000e+00, double* %host_x, align 8, !tbaa !8
  %9 = bitcast double* %host_d_x to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %9) #4
  store double 0.000000e+00, double* %host_d_x, align 8, !tbaa !8
  %10 = bitcast double* %host_y to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %10) #4
  %11 = bitcast double* %host_d_y to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %11) #4
  store double 1.000000e+00, double* %host_d_y, align 8, !tbaa !8
  %12 = load i8*, i8** %4, align 8, !tbaa !4
  %call4 = call i32 @cudaMemcpy(i8* %12, i8* nonnull %8, i64 8, i32 1)
  %13 = load i8*, i8** %5, align 8, !tbaa !4
  %call5 = call i32 @cudaMemcpy(i8* %13, i8* nonnull %9, i64 8, i32 1)
  %14 = load i8*, i8** %6, align 8, !tbaa !4
  %call6 = call i32 @cudaMemcpy(i8* %14, i8* nonnull %10, i64 8, i32 1)
  %15 = load i8*, i8** %7, align 8, !tbaa !4
  %call7 = call i32 @cudaMemcpy(i8* %15, i8* nonnull %11, i64 8, i32 1)
  %call9 = call i32 @__cudaPushCallConfiguration(i64 4294967297, i32 1, i64 4294967297, i32 1, i64 0, i8* null)
  %tobool.not = icmp eq i32 %call9, 0
  br i1 %tobool.not, label %kcall.configok, label %kcall.end

kcall.configok:                                   ; preds = %entry
  %16 = load i32, i32* @enzyme_dup, align 4, !tbaa !10
  %17 = load double*, double** %x, align 8, !tbaa !4
  %18 = load double*, double** %d_x, align 8, !tbaa !4
  %19 = load double*, double** %y, align 8, !tbaa !4
  %20 = load double*, double** %d_y, align 8, !tbaa !4
  call void @_Z32__device_stub____enzyme_autodiffPFvPdS_EiS_S_iS_S_(void (double*, double*)* nonnull @_Z18__device_stub__fooPdS_, i32 %16, double* %17, double* %18, i32 %16, double* %19, double* %20)
  br label %kcall.end

kcall.end:                                        ; preds = %entry, %kcall.configok
  %21 = load i8*, i8** %4, align 8, !tbaa !4
  %call10 = call i32 @cudaMemcpy(i8* nonnull %8, i8* %21, i64 8, i32 2)
  %22 = load i8*, i8** %5, align 8, !tbaa !4
  %call11 = call i32 @cudaMemcpy(i8* nonnull %9, i8* %22, i64 8, i32 2)
  %23 = load i8*, i8** %6, align 8, !tbaa !4
  %call12 = call i32 @cudaMemcpy(i8* nonnull %10, i8* %23, i64 8, i32 2)
  %24 = load i8*, i8** %7, align 8, !tbaa !4
  %call13 = call i32 @cudaMemcpy(i8* nonnull %11, i8* %24, i64 8, i32 2)
  %25 = load double, double* %host_x, align 8, !tbaa !8
  %26 = load double, double* %host_y, align 8, !tbaa !8
  %call14 = call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i64 0, i64 0), double %25, double %26)
  %27 = load double, double* %host_d_x, align 8, !tbaa !8
  %28 = load double, double* %host_d_y, align 8, !tbaa !8
  %call15 = call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i64 0, i64 0), double %27, double %28)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %11) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %10) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %9) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %8) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %3) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %2) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0) #4
  ret i32 0
}

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1

declare i32 @cudaMemcpy(i8*, i8*, i64, i32) local_unnamed_addr #2

declare i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) local_unnamed_addr #2

declare void @_Z32__device_stub____enzyme_autodiffPFvPdS_EiS_S_iS_S_(void (double*, double*)*, i32, double*, double*, i32, double*, double*) local_unnamed_addr #2

; Function Attrs: nofree nounwind
declare i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #3

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1

declare i32 @cudaMalloc(i8**, i64) local_unnamed_addr #2

attributes #0 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind }

!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 7, !"PIC Level", i32 2}
!3 = !{!"clang version 11.0.1 (https://github.com/llvm/llvm-project.git 43ff75f2c3feef64f9d73328230d34dac8832a91)"}
!4 = !{!5, !5, i64 0}
!5 = !{!"any pointer", !6, i64 0}
!6 = !{!"omnipotent char", !7, i64 0}
!7 = !{!"Simple C++ TBAA"}
!8 = !{!9, !9, i64 0}
!9 = !{!"double", !6, i64 0}
!10 = !{!11, !11, i64 0}
!11 = !{!"int", !6, i64 0}

from enzyme.

wsmoses commented on May 21, 2024

Hi and thanks for helping beta test the CUDA AD.

Using Enzyme to AD through a kernel launch isn't supported yet (though perhaps we should). However, you should be able to call enzyme_autodiff within a cuda kernel on other cuda code.

from enzyme.

waltsims commented on May 21, 2024

I was just about to try this on an accelerated wave propagation simulation. Would also be interested in it being supported.

from enzyme.

wsmoses commented on May 21, 2024

Try loading /path/to/ClangEnzyme-version.so rather than /path/to/LLVMEnzyme-version.so The LLVMEnzyme binary loads the auto diff package but does not add the enzyme pass to be called (expecting it to be manually added to opt). The aptly named ClangEnzyme automatically enables the Enzyme AD pass and thus is more suitable for use in Clang directly.

from enzyme.

CUDA kernels not being autodiff'd about enzyme HOT 7 CLOSED

Comments (7)

Related Issues (20)

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent