Git Product home page Git Product logo

Comments (7)

proteneer avatar proteneer commented on May 21, 2024 1

Thanks for the prompt reply, I've modified the code so that it calls other device code, unfortunately I still seem to be getting unresolved extern errors:

07:42 $ clang -c test3.cu -Xclang -load -Xclang /home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so -O2 -fno-vectorize -fno-unroll-loops --cuda-gpu-arch=sm_70
ptxas fatal   : Unresolved extern function '_Z17__enzyme_autodiffPFvPdS_EiS_S_iS_S_'
#include <stdio.h>

void __device__ foo_impl(double* x_in, double *x_out) {
    x_out[0] = x_in[0] * x_in[0];    
}

typedef void (*f_ptr)(double*, double*);

extern void __device__ __enzyme_autodiff(f_ptr,
    int, double*, double*,
    int, double*, double*);

void __global__ foo(double* x_in, double *x_out) {
    foo_impl(x_in, x_out);
}

void __global__ foo_grad(double* x, double *d_x, double *y, double *d_y) {

    int enzyme_dup;
    int enzyme_out;
    int enzyme_const;

    __enzyme_autodiff(foo_impl,
        enzyme_dup, x, d_x,
        enzyme_dup, y, d_y);

}

int main() {

    double *x, *d_x, *y, *d_y; // all on the devic

    cudaMalloc(&x, sizeof(*x));
    cudaMalloc(&d_x, sizeof(*d_x));
    cudaMalloc(&y, sizeof(*y));
    cudaMalloc(&d_y, sizeof(*d_y));

    double host_x = 1.4;
    double host_d_x = 0.0;
    double host_y;
    double host_d_y = 1.0;

    cudaMemcpy(x, &host_x, sizeof(*x), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, &host_d_x, sizeof(*d_x), cudaMemcpyHostToDevice);
    cudaMemcpy(y, &host_y, sizeof(*y), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, &host_d_y, sizeof(*d_y), cudaMemcpyHostToDevice);

    foo<<<1,1>>>(x, y);

    cudaMemcpy(&host_x, x, sizeof(*x), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_d_x, d_x, sizeof(*d_x), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_y, y, sizeof(*y), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_d_y, d_y, sizeof(*d_y), cudaMemcpyDeviceToHost);

    printf("%f %f\n", host_x, host_y);
    printf("%f %f\n", host_d_x, host_d_y);

}

from enzyme.

proteneer avatar proteneer commented on May 21, 2024 1

Thank you, that solved the issue, and I'm pleased to report successful compilation and correct derivatives.

Would you like me to submit a PR detailing what was discussed here into the "Getting Started" guide?

Here's a fully contained example:

#include <stdio.h>

// one-liner
// clang test3.cu -Xclang -load -Xclang /home/yutong/Code/Enzyme/enzyme/build/Enzyme/ClangEnzyme-11.so -O2 -fno-vectorize -fno-unroll-loops -fPIC --cuda-gpu-arch=sm_70 -lcudart -L/usr/local/cuda-10.1/lib64
void __device__ foo_impl(double* x_in, double *x_out) {
    x_out[0] = x_in[0] * x_in[0];    
}

typedef void (*f_ptr)(double*, double*);

extern void __device__ __enzyme_autodiff(f_ptr,
    int, double*, double*,
    int, double*, double*
);

void __global__ foo(double* x_in, double *x_out) {
    foo_impl(x_in, x_out);
}

int __device__ enzyme_dup;
int __device__ enzyme_out;
int __device__ enzyme_const;

void __global__ foo_grad(double* x, double *d_x, double *y, double *d_y) {

    __enzyme_autodiff(foo_impl,
        enzyme_dup, x, d_x,
        enzyme_dup, y, d_y);

}

int main() {

    double *x, *d_x, *y, *d_y; // all on the devic

    cudaMalloc(&x, sizeof(*x));
    cudaMalloc(&d_x, sizeof(*d_x));
    cudaMalloc(&y, sizeof(*y));
    cudaMalloc(&d_y, sizeof(*d_y));

    double host_x = 1.4;
    double host_d_x = 0.0;
    double host_y;
    double host_d_y = 1.0;

    cudaMemcpy(x, &host_x, sizeof(*x), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, &host_d_x, sizeof(*d_x), cudaMemcpyHostToDevice);
    cudaMemcpy(y, &host_y, sizeof(*y), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, &host_d_y, sizeof(*d_y), cudaMemcpyHostToDevice);

    // foo<<<1,1>>>(x, y); fwd-pass only
    foo_grad<<<1,1>>>(x, d_x, y, d_y); // fwd and bkwd pass

    cudaDeviceSynchronize();

    cudaMemcpy(&host_x, x, sizeof(*x), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_d_x, d_x, sizeof(*d_x), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_y, y, sizeof(*y), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_d_y, d_y, sizeof(*d_y), cudaMemcpyDeviceToHost);

    printf("%f %f\n", host_x, host_y);
    printf("%f %f\n", host_d_x, host_d_y);

}

from enzyme.

wsmoses avatar wsmoses commented on May 21, 2024 1

Go for it, the relevant branch is "www". Also note that the load into Clang doesn't ensure that optimizations are always run before and after Enzyme which may have performance implications.

from enzyme.

proteneer avatar proteneer commented on May 21, 2024

Manually invoking AD on the intermediate test3.ll file results in a segfault (truncated log):

; Function Attrs: norecurse uwtable
define internal void @diffe_Z18__device_stub__fooPdS_.2(double* %x_in, double* %"x_in'", double* %x_out, double* %"x_out'", i8* %tapeArg) #0 {
entry:
  %malloccall2 = tail call i8* @malloc(i64 12)
  %"malloccall2'mi" = phi i8* 
  %grid_dim = bitcast i8* %malloccall2 to %struct.dim3*
  %malloccall3 = tail call i8* @malloc(i64 12)
  %"malloccall3'mi" = phi i8* 
  %block_dim = bitcast i8* %malloccall3 to %struct.dim3*
  %malloccall1 = tail call i8* @malloc(i64 8)
  %"malloccall1'mi" = phi i8* 
  %x_out.addr = bitcast i8* %malloccall1 to double**
  %malloccall4 = tail call i8* @malloc(i64 8)
  %"malloccall4'mi" = phi i8* 
  %shmem_size = bitcast i8* %malloccall4 to i64*
  %malloccall = tail call i8* @malloc(i64 8)
  %"malloccall'mi" = phi i8* 
  %x_in.addr = bitcast i8* %malloccall to double**
  %malloccall5 = tail call i8* @malloc(i64 8)
  %"malloccall5'mi" = phi i8* 
  %stream = bitcast i8* %malloccall5 to i8**
  store double* %x_in, double** %x_in.addr, align 8, !tbaa !4
  store double* %x_out, double** %x_out.addr, align 8, !tbaa !4
  %malloccall6 = tail call i8* @malloc(i64 16)
  %"malloccall6'mi" = phi i8* 
  %"kernel_args3'ipc" = bitcast i8* %"malloccall6'mi" to [2 x i8*]*
  %kernel_args3 = bitcast i8* %malloccall6 to [2 x i8*]*
  %"kernel_args3.sub'ipg" = getelementptr inbounds [2 x i8*], [2 x i8*]* %"kernel_args3'ipc", i64 0, i64 0
  %kernel_args3.sub = getelementptr inbounds [2 x i8*], [2 x i8*]* %kernel_args3, i64 0, i64 0
  %0 = bitcast [2 x i8*]* %kernel_args3 to double***
  store double** %x_in.addr, double*** %0, align 16
  %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %kernel_args3, i64 0, i64 1
  %2 = bitcast i8** %1 to double***
  store double** %x_out.addr, double*** %2, align 8
  %3 = call i32 @__cudaPopCallConfiguration(%struct.dim3* nonnull %grid_dim, %struct.dim3* nonnull %block_dim, i64* nonnull %shmem_size, i8** nonnull %stream)
  %"'ip_phi" = phi i32 
  %4 = load i64, i64* %shmem_size, align 8
  %"'il_phi" = phi i64 
  %5 = bitcast i8** %stream to %struct.CUstream_st**
  %6 = load %struct.CUstream_st*, %struct.CUstream_st** %5, align 8
  %"'il_phi1" = phi %struct.CUstream_st* 
  %grid_dim.coerce.sroa.0.0..sroa_cast = bitcast %struct.dim3* %grid_dim to i64*
  %grid_dim.coerce.sroa.0.0.copyload = load i64, i64* %grid_dim.coerce.sroa.0.0..sroa_cast, align 8
  %"grid_dim.coerce.sroa.0.0.copyload'il_phi" = phi i64 
  %grid_dim.coerce.sroa.2.0..sroa_idx2 = getelementptr inbounds %struct.dim3, %struct.dim3* %grid_dim, i64 0, i32 2
  %grid_dim.coerce.sroa.2.0.copyload = load i32, i32* %grid_dim.coerce.sroa.2.0..sroa_idx2, align 8
  %"grid_dim.coerce.sroa.2.0.copyload'il_phi" = phi i32 
  %block_dim.coerce.sroa.0.0..sroa_cast = bitcast %struct.dim3* %block_dim to i64*
  %block_dim.coerce.sroa.0.0.copyload = load i64, i64* %block_dim.coerce.sroa.0.0..sroa_cast, align 8
  %"block_dim.coerce.sroa.0.0.copyload'il_phi" = phi i64 
  %block_dim.coerce.sroa.2.0..sroa_idx1 = getelementptr inbounds %struct.dim3, %struct.dim3* %block_dim, i64 0, i32 2
  %block_dim.coerce.sroa.2.0.copyload = load i32, i32* %block_dim.coerce.sroa.2.0..sroa_idx1, align 8
  %"block_dim.coerce.sroa.2.0.copyload'il_phi" = phi i32 
  %call = call i32 @cudaLaunchKernel(i8* bitcast (void (double*, double*)* @_Z18__device_stub__fooPdS_ to i8*), i64 %grid_dim.coerce.sroa.0.0.copyload, i32 %grid_dim.coerce.sroa.2.0.copyload, i64 %block_dim.coerce.sroa.0.0.copyload, i32 %block_dim.coerce.sroa.2.0.copyload, i8** nonnull %kernel_args3.sub, i64 %4, %struct.CUstream_st* %6)
  %"call'ip_phi" = phi i32 
  br label %invertentry

allocsForInversion:                               ; No predecessors!

invertentry:                                      ; preds = %entry
}

declare i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) local_unnamed_addr

opt: ../Enzyme/AdjointGenerator.h:3480: void AdjointGenerator<AugmentedReturnType>::visitCallInst(llvm::CallInst&) [with AugmentedReturnType = const AugmentedReturn*]: Assertion `subdata' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace.
Stack dump:
0.	Program arguments: opt test3.ll -load=/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so -enzyme -o output.ll -S 
1.	Running pass 'Enzyme Pass' on module 'test3.ll'.
 #0 0x000055da2bfcc85a llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/home/yutong/local/bin/opt+0x193e85a)
 #1 0x000055da2bfca564 llvm::sys::RunSignalHandlers() (/home/yutong/local/bin/opt+0x193c564)
 #2 0x000055da2bfca6b3 SignalHandler(int) (/home/yutong/local/bin/opt+0x193c6b3)
 #3 0x00007f3db88e7890 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12890)
 #4 0x00007f3db77c2e97 raise /build/glibc-OTsEL5/glibc-2.27/signal/../sysdeps/unix/sysv/linux/raise.c:51:0
 #5 0x00007f3db77c4801 abort /build/glibc-OTsEL5/glibc-2.27/stdlib/abort.c:81:0
 #6 0x00007f3db77b439a __assert_fail_base /build/glibc-OTsEL5/glibc-2.27/assert/assert.c:89:0
 #7 0x00007f3db77b4412 (/lib/x86_64-linux-gnu/libc.so.6+0x30412)
 #8 0x00007f3db733c810 AdjointGenerator<AugmentedReturn const*>::visitCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x45d810)
 #9 0x00007f3db7333fe9 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::delegateCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x454fe9)
#10 0x00007f3db7314e0b llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visitCall(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x435e0b)
#11 0x00007f3db7306d97 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x427d97)
#12 0x00007f3db72fb383 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction*) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x41c383)
#13 0x00007f3db72dcf94 CreatePrimalAndGradient(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, bool, bool, llvm::Type*, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, AugmentedReturn const*, bool, bool, bool) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3fdf94)
#14 0x00007f3db739f68c GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4c068c)
#15 0x00007f3db739fc53 GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4c0c53)
#16 0x00007f3db73271bc AdjointGenerator<AugmentedReturn*>::visitCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4481bc)
#17 0x00007f3db731f5d3 llvm::InstVisitor<AdjointGenerator<AugmentedReturn*>, void>::delegateCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4405d3)
#18 0x00007f3db7310a01 llvm::InstVisitor<AdjointGenerator<AugmentedReturn*>, void>::visitCall(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x431a01)
#19 0x00007f3db7303ea7 llvm::InstVisitor<AdjointGenerator<AugmentedReturn*>, void>::visit(llvm::Instruction&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x424ea7)
#20 0x00007f3db72f8dad llvm::InstVisitor<AdjointGenerator<AugmentedReturn*>, void>::visit(llvm::Instruction*) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x419dad)
#21 0x00007f3db72d469e CreateAugmentedPrimal(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, bool, bool, bool, bool) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3f569e)
#22 0x00007f3db739f5c3 GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4c05c3)
#23 0x00007f3db739fc53 GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4c0c53)
#24 0x00007f3db733ba8f AdjointGenerator<AugmentedReturn const*>::visitCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x45ca8f)
#25 0x00007f3db7333fe9 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::delegateCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x454fe9)
#26 0x00007f3db7314e0b llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visitCall(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x435e0b)
#27 0x00007f3db7306d97 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x427d97)
#28 0x00007f3db72fb383 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction*) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x41c383)
#29 0x00007f3db72dcf94 CreatePrimalAndGradient(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, bool, bool, llvm::Type*, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, AugmentedReturn const*, bool, bool, bool) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3fdf94)
#30 0x00007f3db72bcc9b HandleAutoDiff(llvm::CallInst*, llvm::TargetLibraryInfo&, llvm::AAResults&, bool) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3ddc9b)
#31 0x00007f3db72bdc17 (anonymous namespace)::Enzyme::lowerEnzymeCalls(llvm::Function&, bool, bool&, std::set<llvm::Function*, std::less<llvm::Function*>, std::allocator<llvm::Function*> >&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3dec17)
#32 0x00007f3db72bddb3 (anonymous namespace)::Enzyme::runOnModule(llvm::Module&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3dedb3)
#33 0x000055da2b8393b1 llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/yutong/local/bin/opt+0x11ab3b1)
#34 0x000055da2ab41ddc main (/home/yutong/local/bin/opt+0x4b3ddc)
#35 0x00007f3db77a5b97 __libc_start_main /build/glibc-OTsEL5/glibc-2.27/csu/../csu/libc-start.c:344:0
#36 0x000055da2abb639a _start (/home/yutong/local/bin/opt+0x52839a)
Aborted (core dumped)

Full test3.ll file

; ModuleID = 'test3.cu'
source_filename = "test3.cu"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%struct.dim3 = type { i32, i32, i32 }
%struct.CUstream_st = type opaque

@enzyme_dup = local_unnamed_addr global i32 0, align 4
@enzyme_out = local_unnamed_addr global i32 0, align 4
@enzyme_const = local_unnamed_addr global i32 0, align 4
@.str = private unnamed_addr constant [7 x i8] c"%f %f\0A\00", align 1

; Function Attrs: norecurse uwtable
define void @_Z18__device_stub__fooPdS_(double* %x_in, double* %x_out) #0 {
entry:
  %x_in.addr = alloca double*, align 8
  %x_out.addr = alloca double*, align 8
  %grid_dim = alloca %struct.dim3, align 8
  %block_dim = alloca %struct.dim3, align 8
  %shmem_size = alloca i64, align 8
  %stream = alloca i8*, align 8
  store double* %x_in, double** %x_in.addr, align 8, !tbaa !4
  store double* %x_out, double** %x_out.addr, align 8, !tbaa !4
  %kernel_args3 = alloca [2 x i8*], align 16
  %kernel_args3.sub = getelementptr inbounds [2 x i8*], [2 x i8*]* %kernel_args3, i64 0, i64 0
  %0 = bitcast [2 x i8*]* %kernel_args3 to double***
  store double** %x_in.addr, double*** %0, align 16
  %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %kernel_args3, i64 0, i64 1
  %2 = bitcast i8** %1 to double***
  store double** %x_out.addr, double*** %2, align 8
  %3 = call i32 @__cudaPopCallConfiguration(%struct.dim3* nonnull %grid_dim, %struct.dim3* nonnull %block_dim, i64* nonnull %shmem_size, i8** nonnull %stream)
  %4 = load i64, i64* %shmem_size, align 8
  %5 = bitcast i8** %stream to %struct.CUstream_st**
  %6 = load %struct.CUstream_st*, %struct.CUstream_st** %5, align 8
  %grid_dim.coerce.sroa.0.0..sroa_cast = bitcast %struct.dim3* %grid_dim to i64*
  %grid_dim.coerce.sroa.0.0.copyload = load i64, i64* %grid_dim.coerce.sroa.0.0..sroa_cast, align 8
  %grid_dim.coerce.sroa.2.0..sroa_idx2 = getelementptr inbounds %struct.dim3, %struct.dim3* %grid_dim, i64 0, i32 2
  %grid_dim.coerce.sroa.2.0.copyload = load i32, i32* %grid_dim.coerce.sroa.2.0..sroa_idx2, align 8
  %block_dim.coerce.sroa.0.0..sroa_cast = bitcast %struct.dim3* %block_dim to i64*
  %block_dim.coerce.sroa.0.0.copyload = load i64, i64* %block_dim.coerce.sroa.0.0..sroa_cast, align 8
  %block_dim.coerce.sroa.2.0..sroa_idx1 = getelementptr inbounds %struct.dim3, %struct.dim3* %block_dim, i64 0, i32 2
  %block_dim.coerce.sroa.2.0.copyload = load i32, i32* %block_dim.coerce.sroa.2.0..sroa_idx1, align 8
  %call = call i32 @cudaLaunchKernel(i8* bitcast (void (double*, double*)* @_Z18__device_stub__fooPdS_ to i8*), i64 %grid_dim.coerce.sroa.0.0.copyload, i32 %grid_dim.coerce.sroa.2.0.copyload, i64 %block_dim.coerce.sroa.0.0.copyload, i32 %block_dim.coerce.sroa.2.0.copyload, i8** nonnull %kernel_args3.sub, i64 %4, %struct.CUstream_st* %6)
  ret void
}

declare i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) local_unnamed_addr

declare i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) local_unnamed_addr

; Function Attrs: norecurse uwtable
define i32 @main() local_unnamed_addr #0 {
entry:
  %x = alloca double*, align 8
  %d_x = alloca double*, align 8
  %y = alloca double*, align 8
  %d_y = alloca double*, align 8
  %host_x = alloca double, align 8
  %host_d_x = alloca double, align 8
  %host_y = alloca double, align 8
  %host_d_y = alloca double, align 8
  %0 = bitcast double** %x to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %0) #4
  %1 = bitcast double** %d_x to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %1) #4
  %2 = bitcast double** %y to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %2) #4
  %3 = bitcast double** %d_y to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %3) #4
  %4 = bitcast double** %x to i8**
  %call.i = call i32 @cudaMalloc(i8** nonnull %4, i64 8)
  %5 = bitcast double** %d_x to i8**
  %call.i18 = call i32 @cudaMalloc(i8** nonnull %5, i64 8)
  %6 = bitcast double** %y to i8**
  %call.i19 = call i32 @cudaMalloc(i8** nonnull %6, i64 8)
  %7 = bitcast double** %d_y to i8**
  %call.i20 = call i32 @cudaMalloc(i8** nonnull %7, i64 8)
  %8 = bitcast double* %host_x to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %8) #4
  store double 1.400000e+00, double* %host_x, align 8, !tbaa !8
  %9 = bitcast double* %host_d_x to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %9) #4
  store double 0.000000e+00, double* %host_d_x, align 8, !tbaa !8
  %10 = bitcast double* %host_y to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %10) #4
  %11 = bitcast double* %host_d_y to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %11) #4
  store double 1.000000e+00, double* %host_d_y, align 8, !tbaa !8
  %12 = load i8*, i8** %4, align 8, !tbaa !4
  %call4 = call i32 @cudaMemcpy(i8* %12, i8* nonnull %8, i64 8, i32 1)
  %13 = load i8*, i8** %5, align 8, !tbaa !4
  %call5 = call i32 @cudaMemcpy(i8* %13, i8* nonnull %9, i64 8, i32 1)
  %14 = load i8*, i8** %6, align 8, !tbaa !4
  %call6 = call i32 @cudaMemcpy(i8* %14, i8* nonnull %10, i64 8, i32 1)
  %15 = load i8*, i8** %7, align 8, !tbaa !4
  %call7 = call i32 @cudaMemcpy(i8* %15, i8* nonnull %11, i64 8, i32 1)
  %call9 = call i32 @__cudaPushCallConfiguration(i64 4294967297, i32 1, i64 4294967297, i32 1, i64 0, i8* null)
  %tobool.not = icmp eq i32 %call9, 0
  br i1 %tobool.not, label %kcall.configok, label %kcall.end

kcall.configok:                                   ; preds = %entry
  %16 = load i32, i32* @enzyme_dup, align 4, !tbaa !10
  %17 = load double*, double** %x, align 8, !tbaa !4
  %18 = load double*, double** %d_x, align 8, !tbaa !4
  %19 = load double*, double** %y, align 8, !tbaa !4
  %20 = load double*, double** %d_y, align 8, !tbaa !4
  call void @_Z32__device_stub____enzyme_autodiffPFvPdS_EiS_S_iS_S_(void (double*, double*)* nonnull @_Z18__device_stub__fooPdS_, i32 %16, double* %17, double* %18, i32 %16, double* %19, double* %20)
  br label %kcall.end

kcall.end:                                        ; preds = %entry, %kcall.configok
  %21 = load i8*, i8** %4, align 8, !tbaa !4
  %call10 = call i32 @cudaMemcpy(i8* nonnull %8, i8* %21, i64 8, i32 2)
  %22 = load i8*, i8** %5, align 8, !tbaa !4
  %call11 = call i32 @cudaMemcpy(i8* nonnull %9, i8* %22, i64 8, i32 2)
  %23 = load i8*, i8** %6, align 8, !tbaa !4
  %call12 = call i32 @cudaMemcpy(i8* nonnull %10, i8* %23, i64 8, i32 2)
  %24 = load i8*, i8** %7, align 8, !tbaa !4
  %call13 = call i32 @cudaMemcpy(i8* nonnull %11, i8* %24, i64 8, i32 2)
  %25 = load double, double* %host_x, align 8, !tbaa !8
  %26 = load double, double* %host_y, align 8, !tbaa !8
  %call14 = call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i64 0, i64 0), double %25, double %26)
  %27 = load double, double* %host_d_x, align 8, !tbaa !8
  %28 = load double, double* %host_d_y, align 8, !tbaa !8
  %call15 = call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i64 0, i64 0), double %27, double %28)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %11) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %10) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %9) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %8) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %3) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %2) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0) #4
  ret i32 0
}

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1

declare i32 @cudaMemcpy(i8*, i8*, i64, i32) local_unnamed_addr #2

declare i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) local_unnamed_addr #2

declare void @_Z32__device_stub____enzyme_autodiffPFvPdS_EiS_S_iS_S_(void (double*, double*)*, i32, double*, double*, i32, double*, double*) local_unnamed_addr #2

; Function Attrs: nofree nounwind
declare i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #3

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1

declare i32 @cudaMalloc(i8**, i64) local_unnamed_addr #2

attributes #0 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind }

!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 7, !"PIC Level", i32 2}
!3 = !{!"clang version 11.0.1 (https://github.com/llvm/llvm-project.git 43ff75f2c3feef64f9d73328230d34dac8832a91)"}
!4 = !{!5, !5, i64 0}
!5 = !{!"any pointer", !6, i64 0}
!6 = !{!"omnipotent char", !7, i64 0}
!7 = !{!"Simple C++ TBAA"}
!8 = !{!9, !9, i64 0}
!9 = !{!"double", !6, i64 0}
!10 = !{!11, !11, i64 0}
!11 = !{!"int", !6, i64 0}

from enzyme.

wsmoses avatar wsmoses commented on May 21, 2024

Hi and thanks for helping beta test the CUDA AD.

Using Enzyme to AD through a kernel launch isn't supported yet (though perhaps we should). However, you should be able to call enzyme_autodiff within a cuda kernel on other cuda code.

from enzyme.

waltsims avatar waltsims commented on May 21, 2024

I was just about to try this on an accelerated wave propagation simulation. Would also be interested in it being supported.

from enzyme.

wsmoses avatar wsmoses commented on May 21, 2024

Try loading /path/to/ClangEnzyme-version.so rather than /path/to/LLVMEnzyme-version.so The LLVMEnzyme binary loads the auto diff package but does not add the enzyme pass to be called (expecting it to be manually added to opt). The aptly named ClangEnzyme automatically enables the Enzyme AD pass and thus is more suitable for use in Clang directly.

from enzyme.

Related Issues (20)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.