Comments (7)
Thanks for the prompt reply, I've modified the code so that it calls other device code, unfortunately I still seem to be getting unresolved extern errors:
07:42 $ clang -c test3.cu -Xclang -load -Xclang /home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so -O2 -fno-vectorize -fno-unroll-loops --cuda-gpu-arch=sm_70
ptxas fatal : Unresolved extern function '_Z17__enzyme_autodiffPFvPdS_EiS_S_iS_S_'
#include <stdio.h>
void __device__ foo_impl(double* x_in, double *x_out) {
x_out[0] = x_in[0] * x_in[0];
}
typedef void (*f_ptr)(double*, double*);
extern void __device__ __enzyme_autodiff(f_ptr,
int, double*, double*,
int, double*, double*);
void __global__ foo(double* x_in, double *x_out) {
foo_impl(x_in, x_out);
}
void __global__ foo_grad(double* x, double *d_x, double *y, double *d_y) {
int enzyme_dup;
int enzyme_out;
int enzyme_const;
__enzyme_autodiff(foo_impl,
enzyme_dup, x, d_x,
enzyme_dup, y, d_y);
}
int main() {
double *x, *d_x, *y, *d_y; // all on the devic
cudaMalloc(&x, sizeof(*x));
cudaMalloc(&d_x, sizeof(*d_x));
cudaMalloc(&y, sizeof(*y));
cudaMalloc(&d_y, sizeof(*d_y));
double host_x = 1.4;
double host_d_x = 0.0;
double host_y;
double host_d_y = 1.0;
cudaMemcpy(x, &host_x, sizeof(*x), cudaMemcpyHostToDevice);
cudaMemcpy(d_x, &host_d_x, sizeof(*d_x), cudaMemcpyHostToDevice);
cudaMemcpy(y, &host_y, sizeof(*y), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, &host_d_y, sizeof(*d_y), cudaMemcpyHostToDevice);
foo<<<1,1>>>(x, y);
cudaMemcpy(&host_x, x, sizeof(*x), cudaMemcpyDeviceToHost);
cudaMemcpy(&host_d_x, d_x, sizeof(*d_x), cudaMemcpyDeviceToHost);
cudaMemcpy(&host_y, y, sizeof(*y), cudaMemcpyDeviceToHost);
cudaMemcpy(&host_d_y, d_y, sizeof(*d_y), cudaMemcpyDeviceToHost);
printf("%f %f\n", host_x, host_y);
printf("%f %f\n", host_d_x, host_d_y);
}
from enzyme.
Thank you, that solved the issue, and I'm pleased to report successful compilation and correct derivatives.
Would you like me to submit a PR detailing what was discussed here into the "Getting Started" guide?
Here's a fully contained example:
#include <stdio.h>
// one-liner
// clang test3.cu -Xclang -load -Xclang /home/yutong/Code/Enzyme/enzyme/build/Enzyme/ClangEnzyme-11.so -O2 -fno-vectorize -fno-unroll-loops -fPIC --cuda-gpu-arch=sm_70 -lcudart -L/usr/local/cuda-10.1/lib64
void __device__ foo_impl(double* x_in, double *x_out) {
x_out[0] = x_in[0] * x_in[0];
}
typedef void (*f_ptr)(double*, double*);
extern void __device__ __enzyme_autodiff(f_ptr,
int, double*, double*,
int, double*, double*
);
void __global__ foo(double* x_in, double *x_out) {
foo_impl(x_in, x_out);
}
int __device__ enzyme_dup;
int __device__ enzyme_out;
int __device__ enzyme_const;
void __global__ foo_grad(double* x, double *d_x, double *y, double *d_y) {
__enzyme_autodiff(foo_impl,
enzyme_dup, x, d_x,
enzyme_dup, y, d_y);
}
int main() {
double *x, *d_x, *y, *d_y; // all on the devic
cudaMalloc(&x, sizeof(*x));
cudaMalloc(&d_x, sizeof(*d_x));
cudaMalloc(&y, sizeof(*y));
cudaMalloc(&d_y, sizeof(*d_y));
double host_x = 1.4;
double host_d_x = 0.0;
double host_y;
double host_d_y = 1.0;
cudaMemcpy(x, &host_x, sizeof(*x), cudaMemcpyHostToDevice);
cudaMemcpy(d_x, &host_d_x, sizeof(*d_x), cudaMemcpyHostToDevice);
cudaMemcpy(y, &host_y, sizeof(*y), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, &host_d_y, sizeof(*d_y), cudaMemcpyHostToDevice);
// foo<<<1,1>>>(x, y); fwd-pass only
foo_grad<<<1,1>>>(x, d_x, y, d_y); // fwd and bkwd pass
cudaDeviceSynchronize();
cudaMemcpy(&host_x, x, sizeof(*x), cudaMemcpyDeviceToHost);
cudaMemcpy(&host_d_x, d_x, sizeof(*d_x), cudaMemcpyDeviceToHost);
cudaMemcpy(&host_y, y, sizeof(*y), cudaMemcpyDeviceToHost);
cudaMemcpy(&host_d_y, d_y, sizeof(*d_y), cudaMemcpyDeviceToHost);
printf("%f %f\n", host_x, host_y);
printf("%f %f\n", host_d_x, host_d_y);
}
from enzyme.
Go for it, the relevant branch is "www". Also note that the load into Clang doesn't ensure that optimizations are always run before and after Enzyme which may have performance implications.
from enzyme.
Manually invoking AD on the intermediate test3.ll file results in a segfault (truncated log):
; Function Attrs: norecurse uwtable
define internal void @diffe_Z18__device_stub__fooPdS_.2(double* %x_in, double* %"x_in'", double* %x_out, double* %"x_out'", i8* %tapeArg) #0 {
entry:
%malloccall2 = tail call i8* @malloc(i64 12)
%"malloccall2'mi" = phi i8*
%grid_dim = bitcast i8* %malloccall2 to %struct.dim3*
%malloccall3 = tail call i8* @malloc(i64 12)
%"malloccall3'mi" = phi i8*
%block_dim = bitcast i8* %malloccall3 to %struct.dim3*
%malloccall1 = tail call i8* @malloc(i64 8)
%"malloccall1'mi" = phi i8*
%x_out.addr = bitcast i8* %malloccall1 to double**
%malloccall4 = tail call i8* @malloc(i64 8)
%"malloccall4'mi" = phi i8*
%shmem_size = bitcast i8* %malloccall4 to i64*
%malloccall = tail call i8* @malloc(i64 8)
%"malloccall'mi" = phi i8*
%x_in.addr = bitcast i8* %malloccall to double**
%malloccall5 = tail call i8* @malloc(i64 8)
%"malloccall5'mi" = phi i8*
%stream = bitcast i8* %malloccall5 to i8**
store double* %x_in, double** %x_in.addr, align 8, !tbaa !4
store double* %x_out, double** %x_out.addr, align 8, !tbaa !4
%malloccall6 = tail call i8* @malloc(i64 16)
%"malloccall6'mi" = phi i8*
%"kernel_args3'ipc" = bitcast i8* %"malloccall6'mi" to [2 x i8*]*
%kernel_args3 = bitcast i8* %malloccall6 to [2 x i8*]*
%"kernel_args3.sub'ipg" = getelementptr inbounds [2 x i8*], [2 x i8*]* %"kernel_args3'ipc", i64 0, i64 0
%kernel_args3.sub = getelementptr inbounds [2 x i8*], [2 x i8*]* %kernel_args3, i64 0, i64 0
%0 = bitcast [2 x i8*]* %kernel_args3 to double***
store double** %x_in.addr, double*** %0, align 16
%1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %kernel_args3, i64 0, i64 1
%2 = bitcast i8** %1 to double***
store double** %x_out.addr, double*** %2, align 8
%3 = call i32 @__cudaPopCallConfiguration(%struct.dim3* nonnull %grid_dim, %struct.dim3* nonnull %block_dim, i64* nonnull %shmem_size, i8** nonnull %stream)
%"'ip_phi" = phi i32
%4 = load i64, i64* %shmem_size, align 8
%"'il_phi" = phi i64
%5 = bitcast i8** %stream to %struct.CUstream_st**
%6 = load %struct.CUstream_st*, %struct.CUstream_st** %5, align 8
%"'il_phi1" = phi %struct.CUstream_st*
%grid_dim.coerce.sroa.0.0..sroa_cast = bitcast %struct.dim3* %grid_dim to i64*
%grid_dim.coerce.sroa.0.0.copyload = load i64, i64* %grid_dim.coerce.sroa.0.0..sroa_cast, align 8
%"grid_dim.coerce.sroa.0.0.copyload'il_phi" = phi i64
%grid_dim.coerce.sroa.2.0..sroa_idx2 = getelementptr inbounds %struct.dim3, %struct.dim3* %grid_dim, i64 0, i32 2
%grid_dim.coerce.sroa.2.0.copyload = load i32, i32* %grid_dim.coerce.sroa.2.0..sroa_idx2, align 8
%"grid_dim.coerce.sroa.2.0.copyload'il_phi" = phi i32
%block_dim.coerce.sroa.0.0..sroa_cast = bitcast %struct.dim3* %block_dim to i64*
%block_dim.coerce.sroa.0.0.copyload = load i64, i64* %block_dim.coerce.sroa.0.0..sroa_cast, align 8
%"block_dim.coerce.sroa.0.0.copyload'il_phi" = phi i64
%block_dim.coerce.sroa.2.0..sroa_idx1 = getelementptr inbounds %struct.dim3, %struct.dim3* %block_dim, i64 0, i32 2
%block_dim.coerce.sroa.2.0.copyload = load i32, i32* %block_dim.coerce.sroa.2.0..sroa_idx1, align 8
%"block_dim.coerce.sroa.2.0.copyload'il_phi" = phi i32
%call = call i32 @cudaLaunchKernel(i8* bitcast (void (double*, double*)* @_Z18__device_stub__fooPdS_ to i8*), i64 %grid_dim.coerce.sroa.0.0.copyload, i32 %grid_dim.coerce.sroa.2.0.copyload, i64 %block_dim.coerce.sroa.0.0.copyload, i32 %block_dim.coerce.sroa.2.0.copyload, i8** nonnull %kernel_args3.sub, i64 %4, %struct.CUstream_st* %6)
%"call'ip_phi" = phi i32
br label %invertentry
allocsForInversion: ; No predecessors!
invertentry: ; preds = %entry
}
declare i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) local_unnamed_addr
opt: ../Enzyme/AdjointGenerator.h:3480: void AdjointGenerator<AugmentedReturnType>::visitCallInst(llvm::CallInst&) [with AugmentedReturnType = const AugmentedReturn*]: Assertion `subdata' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace.
Stack dump:
0. Program arguments: opt test3.ll -load=/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so -enzyme -o output.ll -S
1. Running pass 'Enzyme Pass' on module 'test3.ll'.
#0 0x000055da2bfcc85a llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/home/yutong/local/bin/opt+0x193e85a)
#1 0x000055da2bfca564 llvm::sys::RunSignalHandlers() (/home/yutong/local/bin/opt+0x193c564)
#2 0x000055da2bfca6b3 SignalHandler(int) (/home/yutong/local/bin/opt+0x193c6b3)
#3 0x00007f3db88e7890 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12890)
#4 0x00007f3db77c2e97 raise /build/glibc-OTsEL5/glibc-2.27/signal/../sysdeps/unix/sysv/linux/raise.c:51:0
#5 0x00007f3db77c4801 abort /build/glibc-OTsEL5/glibc-2.27/stdlib/abort.c:81:0
#6 0x00007f3db77b439a __assert_fail_base /build/glibc-OTsEL5/glibc-2.27/assert/assert.c:89:0
#7 0x00007f3db77b4412 (/lib/x86_64-linux-gnu/libc.so.6+0x30412)
#8 0x00007f3db733c810 AdjointGenerator<AugmentedReturn const*>::visitCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x45d810)
#9 0x00007f3db7333fe9 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::delegateCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x454fe9)
#10 0x00007f3db7314e0b llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visitCall(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x435e0b)
#11 0x00007f3db7306d97 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x427d97)
#12 0x00007f3db72fb383 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction*) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x41c383)
#13 0x00007f3db72dcf94 CreatePrimalAndGradient(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, bool, bool, llvm::Type*, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, AugmentedReturn const*, bool, bool, bool) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3fdf94)
#14 0x00007f3db739f68c GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4c068c)
#15 0x00007f3db739fc53 GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4c0c53)
#16 0x00007f3db73271bc AdjointGenerator<AugmentedReturn*>::visitCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4481bc)
#17 0x00007f3db731f5d3 llvm::InstVisitor<AdjointGenerator<AugmentedReturn*>, void>::delegateCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4405d3)
#18 0x00007f3db7310a01 llvm::InstVisitor<AdjointGenerator<AugmentedReturn*>, void>::visitCall(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x431a01)
#19 0x00007f3db7303ea7 llvm::InstVisitor<AdjointGenerator<AugmentedReturn*>, void>::visit(llvm::Instruction&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x424ea7)
#20 0x00007f3db72f8dad llvm::InstVisitor<AdjointGenerator<AugmentedReturn*>, void>::visit(llvm::Instruction*) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x419dad)
#21 0x00007f3db72d469e CreateAugmentedPrimal(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, bool, bool, bool, bool) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3f569e)
#22 0x00007f3db739f5c3 GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4c05c3)
#23 0x00007f3db739fc53 GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x4c0c53)
#24 0x00007f3db733ba8f AdjointGenerator<AugmentedReturn const*>::visitCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x45ca8f)
#25 0x00007f3db7333fe9 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::delegateCallInst(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x454fe9)
#26 0x00007f3db7314e0b llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visitCall(llvm::CallInst&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x435e0b)
#27 0x00007f3db7306d97 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x427d97)
#28 0x00007f3db72fb383 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction*) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x41c383)
#29 0x00007f3db72dcf94 CreatePrimalAndGradient(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, bool, bool, llvm::Type*, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, AugmentedReturn const*, bool, bool, bool) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3fdf94)
#30 0x00007f3db72bcc9b HandleAutoDiff(llvm::CallInst*, llvm::TargetLibraryInfo&, llvm::AAResults&, bool) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3ddc9b)
#31 0x00007f3db72bdc17 (anonymous namespace)::Enzyme::lowerEnzymeCalls(llvm::Function&, bool, bool&, std::set<llvm::Function*, std::less<llvm::Function*>, std::allocator<llvm::Function*> >&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3dec17)
#32 0x00007f3db72bddb3 (anonymous namespace)::Enzyme::runOnModule(llvm::Module&) (/home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so+0x3dedb3)
#33 0x000055da2b8393b1 llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/yutong/local/bin/opt+0x11ab3b1)
#34 0x000055da2ab41ddc main (/home/yutong/local/bin/opt+0x4b3ddc)
#35 0x00007f3db77a5b97 __libc_start_main /build/glibc-OTsEL5/glibc-2.27/csu/../csu/libc-start.c:344:0
#36 0x000055da2abb639a _start (/home/yutong/local/bin/opt+0x52839a)
Aborted (core dumped)
Full test3.ll file
; ModuleID = 'test3.cu'
source_filename = "test3.cu"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%struct.dim3 = type { i32, i32, i32 }
%struct.CUstream_st = type opaque
@enzyme_dup = local_unnamed_addr global i32 0, align 4
@enzyme_out = local_unnamed_addr global i32 0, align 4
@enzyme_const = local_unnamed_addr global i32 0, align 4
@.str = private unnamed_addr constant [7 x i8] c"%f %f\0A\00", align 1
; Function Attrs: norecurse uwtable
define void @_Z18__device_stub__fooPdS_(double* %x_in, double* %x_out) #0 {
entry:
%x_in.addr = alloca double*, align 8
%x_out.addr = alloca double*, align 8
%grid_dim = alloca %struct.dim3, align 8
%block_dim = alloca %struct.dim3, align 8
%shmem_size = alloca i64, align 8
%stream = alloca i8*, align 8
store double* %x_in, double** %x_in.addr, align 8, !tbaa !4
store double* %x_out, double** %x_out.addr, align 8, !tbaa !4
%kernel_args3 = alloca [2 x i8*], align 16
%kernel_args3.sub = getelementptr inbounds [2 x i8*], [2 x i8*]* %kernel_args3, i64 0, i64 0
%0 = bitcast [2 x i8*]* %kernel_args3 to double***
store double** %x_in.addr, double*** %0, align 16
%1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %kernel_args3, i64 0, i64 1
%2 = bitcast i8** %1 to double***
store double** %x_out.addr, double*** %2, align 8
%3 = call i32 @__cudaPopCallConfiguration(%struct.dim3* nonnull %grid_dim, %struct.dim3* nonnull %block_dim, i64* nonnull %shmem_size, i8** nonnull %stream)
%4 = load i64, i64* %shmem_size, align 8
%5 = bitcast i8** %stream to %struct.CUstream_st**
%6 = load %struct.CUstream_st*, %struct.CUstream_st** %5, align 8
%grid_dim.coerce.sroa.0.0..sroa_cast = bitcast %struct.dim3* %grid_dim to i64*
%grid_dim.coerce.sroa.0.0.copyload = load i64, i64* %grid_dim.coerce.sroa.0.0..sroa_cast, align 8
%grid_dim.coerce.sroa.2.0..sroa_idx2 = getelementptr inbounds %struct.dim3, %struct.dim3* %grid_dim, i64 0, i32 2
%grid_dim.coerce.sroa.2.0.copyload = load i32, i32* %grid_dim.coerce.sroa.2.0..sroa_idx2, align 8
%block_dim.coerce.sroa.0.0..sroa_cast = bitcast %struct.dim3* %block_dim to i64*
%block_dim.coerce.sroa.0.0.copyload = load i64, i64* %block_dim.coerce.sroa.0.0..sroa_cast, align 8
%block_dim.coerce.sroa.2.0..sroa_idx1 = getelementptr inbounds %struct.dim3, %struct.dim3* %block_dim, i64 0, i32 2
%block_dim.coerce.sroa.2.0.copyload = load i32, i32* %block_dim.coerce.sroa.2.0..sroa_idx1, align 8
%call = call i32 @cudaLaunchKernel(i8* bitcast (void (double*, double*)* @_Z18__device_stub__fooPdS_ to i8*), i64 %grid_dim.coerce.sroa.0.0.copyload, i32 %grid_dim.coerce.sroa.2.0.copyload, i64 %block_dim.coerce.sroa.0.0.copyload, i32 %block_dim.coerce.sroa.2.0.copyload, i8** nonnull %kernel_args3.sub, i64 %4, %struct.CUstream_st* %6)
ret void
}
declare i32 @__cudaPopCallConfiguration(%struct.dim3*, %struct.dim3*, i64*, i8**) local_unnamed_addr
declare i32 @cudaLaunchKernel(i8*, i64, i32, i64, i32, i8**, i64, %struct.CUstream_st*) local_unnamed_addr
; Function Attrs: norecurse uwtable
define i32 @main() local_unnamed_addr #0 {
entry:
%x = alloca double*, align 8
%d_x = alloca double*, align 8
%y = alloca double*, align 8
%d_y = alloca double*, align 8
%host_x = alloca double, align 8
%host_d_x = alloca double, align 8
%host_y = alloca double, align 8
%host_d_y = alloca double, align 8
%0 = bitcast double** %x to i8*
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %0) #4
%1 = bitcast double** %d_x to i8*
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %1) #4
%2 = bitcast double** %y to i8*
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %2) #4
%3 = bitcast double** %d_y to i8*
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %3) #4
%4 = bitcast double** %x to i8**
%call.i = call i32 @cudaMalloc(i8** nonnull %4, i64 8)
%5 = bitcast double** %d_x to i8**
%call.i18 = call i32 @cudaMalloc(i8** nonnull %5, i64 8)
%6 = bitcast double** %y to i8**
%call.i19 = call i32 @cudaMalloc(i8** nonnull %6, i64 8)
%7 = bitcast double** %d_y to i8**
%call.i20 = call i32 @cudaMalloc(i8** nonnull %7, i64 8)
%8 = bitcast double* %host_x to i8*
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %8) #4
store double 1.400000e+00, double* %host_x, align 8, !tbaa !8
%9 = bitcast double* %host_d_x to i8*
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %9) #4
store double 0.000000e+00, double* %host_d_x, align 8, !tbaa !8
%10 = bitcast double* %host_y to i8*
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %10) #4
%11 = bitcast double* %host_d_y to i8*
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %11) #4
store double 1.000000e+00, double* %host_d_y, align 8, !tbaa !8
%12 = load i8*, i8** %4, align 8, !tbaa !4
%call4 = call i32 @cudaMemcpy(i8* %12, i8* nonnull %8, i64 8, i32 1)
%13 = load i8*, i8** %5, align 8, !tbaa !4
%call5 = call i32 @cudaMemcpy(i8* %13, i8* nonnull %9, i64 8, i32 1)
%14 = load i8*, i8** %6, align 8, !tbaa !4
%call6 = call i32 @cudaMemcpy(i8* %14, i8* nonnull %10, i64 8, i32 1)
%15 = load i8*, i8** %7, align 8, !tbaa !4
%call7 = call i32 @cudaMemcpy(i8* %15, i8* nonnull %11, i64 8, i32 1)
%call9 = call i32 @__cudaPushCallConfiguration(i64 4294967297, i32 1, i64 4294967297, i32 1, i64 0, i8* null)
%tobool.not = icmp eq i32 %call9, 0
br i1 %tobool.not, label %kcall.configok, label %kcall.end
kcall.configok: ; preds = %entry
%16 = load i32, i32* @enzyme_dup, align 4, !tbaa !10
%17 = load double*, double** %x, align 8, !tbaa !4
%18 = load double*, double** %d_x, align 8, !tbaa !4
%19 = load double*, double** %y, align 8, !tbaa !4
%20 = load double*, double** %d_y, align 8, !tbaa !4
call void @_Z32__device_stub____enzyme_autodiffPFvPdS_EiS_S_iS_S_(void (double*, double*)* nonnull @_Z18__device_stub__fooPdS_, i32 %16, double* %17, double* %18, i32 %16, double* %19, double* %20)
br label %kcall.end
kcall.end: ; preds = %entry, %kcall.configok
%21 = load i8*, i8** %4, align 8, !tbaa !4
%call10 = call i32 @cudaMemcpy(i8* nonnull %8, i8* %21, i64 8, i32 2)
%22 = load i8*, i8** %5, align 8, !tbaa !4
%call11 = call i32 @cudaMemcpy(i8* nonnull %9, i8* %22, i64 8, i32 2)
%23 = load i8*, i8** %6, align 8, !tbaa !4
%call12 = call i32 @cudaMemcpy(i8* nonnull %10, i8* %23, i64 8, i32 2)
%24 = load i8*, i8** %7, align 8, !tbaa !4
%call13 = call i32 @cudaMemcpy(i8* nonnull %11, i8* %24, i64 8, i32 2)
%25 = load double, double* %host_x, align 8, !tbaa !8
%26 = load double, double* %host_y, align 8, !tbaa !8
%call14 = call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i64 0, i64 0), double %25, double %26)
%27 = load double, double* %host_d_x, align 8, !tbaa !8
%28 = load double, double* %host_d_y, align 8, !tbaa !8
%call15 = call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i64 0, i64 0), double %27, double %28)
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %11) #4
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %10) #4
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %9) #4
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %8) #4
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %3) #4
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %2) #4
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #4
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0) #4
ret i32 0
}
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1
declare i32 @cudaMemcpy(i8*, i8*, i64, i32) local_unnamed_addr #2
declare i32 @__cudaPushCallConfiguration(i64, i32, i64, i32, i64, i8*) local_unnamed_addr #2
declare void @_Z32__device_stub____enzyme_autodiffPFvPdS_EiS_S_iS_S_(void (double*, double*)*, i32, double*, double*, i32, double*, double*) local_unnamed_addr #2
; Function Attrs: nofree nounwind
declare i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #3
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
declare i32 @cudaMalloc(i8**, i64) local_unnamed_addr #2
attributes #0 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind }
!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 7, !"PIC Level", i32 2}
!3 = !{!"clang version 11.0.1 (https://github.com/llvm/llvm-project.git 43ff75f2c3feef64f9d73328230d34dac8832a91)"}
!4 = !{!5, !5, i64 0}
!5 = !{!"any pointer", !6, i64 0}
!6 = !{!"omnipotent char", !7, i64 0}
!7 = !{!"Simple C++ TBAA"}
!8 = !{!9, !9, i64 0}
!9 = !{!"double", !6, i64 0}
!10 = !{!11, !11, i64 0}
!11 = !{!"int", !6, i64 0}
from enzyme.
Hi and thanks for helping beta test the CUDA AD.
Using Enzyme to AD through a kernel launch isn't supported yet (though perhaps we should). However, you should be able to call enzyme_autodiff within a cuda kernel on other cuda code.
from enzyme.
I was just about to try this on an accelerated wave propagation simulation. Would also be interested in it being supported.
from enzyme.
Try loading /path/to/ClangEnzyme-version.so rather than /path/to/LLVMEnzyme-version.so The LLVMEnzyme binary loads the auto diff package but does not add the enzyme pass to be called (expecting it to be manually added to opt). The aptly named ClangEnzyme automatically enables the Enzyme AD pass and thus is more suitable for use in Clang directly.
from enzyme.
Related Issues (20)
- Kokkos Support HOT 10
- Unable to differentiate debug code HOT 2
- Forward mode only with custom derivative on GPU HOT 5
- [Bug] Updated activity analysis produces incorrect derivative HOT 1
- Compiling shared object? HOT 6
- Unable to activate optimization option up to O0 on the CUDA GPU test case HOT 5
- Broken InsertVector diffuse analysis HOT 1
- Improve compiletimes for large datastructures HOT 1
- enzyme crashing at build time on eigen function (from libigl) HOT 15
- File too big while compiling with Ninja HOT 3
- comparing unsigned to -2 HOT 1
- C++20 non-type template parameter wrapper not working for lambdas with captures HOT 2
- How to use lambdas and wrappers with reverse mode? HOT 7
- Error sparsifying an icmp HOT 14
- Enzyme crash HOT 3
- Performance of type-analysis HOT 6
- Many failures when running ninja check-enzyme HOT 2
- Enzyme: Cannot cast __enzyme_autodiff primal argument 16 HOT 4
- enzyme_dupped parameter doesn't return gradient
- New C++ interface with lambda HOT 2
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from enzyme.