enzymead / enzyme Goto Github PK
View Code? Open in Web Editor NEWHigh-performance automatic differentiation of LLVM and MLIR.
Home Page: https://enzyme.mit.edu
License: Other
High-performance automatic differentiation of LLVM and MLIR.
Home Page: https://enzyme.mit.edu
License: Other
Hello,
The following instruction
F[indj*d+l] += wjk * parts[indk*d+l];
makes the code need quadratic memory in the backward pass.
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <random>
#include <math.h>
#include <vector>
#include <algorithm>
using namespace std;
// Cell index over n particles, filled in by buildIndex().
// All arrays are allocated with length n by buildIndex().
struct Index
{
int* cellId; // cell id of each particle, indexed by particle number
int* start; // for each cell c < size: offset of its first entry in argsorted
int* cellSize; // for each cell c < size: number of particles in that cell
int size; // number of distinct cells found
int* argsorted; // particle numbers ordered by (cell id, particle number)
int n; // number of particles
} ;
// Builds a cell index over `n` particles stored in `parts` as 3 consecutive
// doubles per particle. The cell id of a particle is the integer part of its
// first coordinate.
//
// On return:
//   index.cellId[i]     cell id of particle i
//   index.argsorted     particle numbers sorted by (cell id, particle number)
//   index.start[c]      offset in argsorted of the first particle of cell c
//   index.cellSize[c]   number of particles in cell c
//   index.size          number of distinct cells
//
// The four arrays are allocated here with new[] (length n) and ownership is
// handed to the caller through `index`.
void buildIndex( Index& index , double * parts, int n )
{
    const int d = 3; // doubles per particle
    index.n = n;
    index.cellId = new int[n];
    index.start = new int[n];
    index.cellSize = new int[n]; // at most n cells; the tail may stay unused
    index.argsorted = new int[n];

    // (cell id, particle number) pairs; sorting them groups particles by cell.
    std::vector<std::pair<int,int> > order(n);
    for( int i = 0 ; i < n ; i++)
    {
        // Truncation toward zero is intended; make the narrowing explicit.
        const int id = static_cast<int>(parts[d*i]);
        index.cellId[i] = id;
        order[i].first = id;
        order[i].second = i;
    }
    std::sort( order.begin(), order.end() );

    int cur = -1;       // index of the cell currently being filled, -1 = none yet
    int curCellId = -1; // id of that cell (only meaningful when cur >= 0)
    for( int i = 0 ; i < n ; i++)
    {
        index.argsorted[i] = order[i].second;
        // `cur >= 0` guards the first element: without it a genuine cell id
        // of -1 would match the initial sentinel and write cellSize[-1].
        if( cur >= 0 && order[i].first == curCellId )
        {
            index.cellSize[cur]++;
        }
        else
        {
            curCellId = order[i].first;
            cur++;
            index.cellSize[cur] = 1;
            index.start[cur] = i;
        }
    }
    index.size = cur+1;
}
// Function handed to __enzyme_autodiff in this reproducer.
// For every cell of `index` it visits all ordered pairs (j, k) of particles in
// that cell, computes wjk = 1 + squared distance between them, and accumulates
// wjk * position(k) into the per-particle field F.
// As written, `out` is never modified after initialisation, so the function
// returns 0; the code that would fold F into `out` is commented out below.
// NOTE(review): F and W are variable-length arrays (a compiler extension in
// C++) sized by n — confirm this is intended rather than heap allocation.
double foo( double* __restrict__ parts,int n, Index* __restrict__ index)
{
double out = 0;
const int d = 3;
double F[n*d];
double W[n];
// Clear the accumulators.
for( int i = 0 ; i < n ; i++)
{
for( int j = 0 ; j < d ; j++)
{
F[i*d+j] = 0.0;
}
W[i] = 0.0;
}
// i: cell, j/k: positions of the two particles inside the cell.
for( int i = 0 ; i < index->size ; i++)
{
for( int j = 0 ; j < index->cellSize[i] ; j++ )
{
for( int k = 0 ; k < index->cellSize[i] ; k++ )
{
// Map in-cell positions back to particle numbers.
int indj = index->argsorted[index->start[i]+j];
int indk = index->argsorted[index->start[i]+k];
// Squared Euclidean distance between particles indj and indk.
double djk = 0;
for( int l = 0 ; l < d ; l++)
{
double temp;
temp = parts[indj * d +l ]- parts[indk * d +l ];
djk += temp*temp;
}
//out += djk;
double wjk = 1.0+djk; // strictly positive
// This accumulation is the statement the report identifies as the trigger.
for( int l = 0 ; l < d ; l++)
{
F[indj*d+l] += wjk * parts[indk*d+l];
}
//W[indj] += wjk;
}
}
}
/*
//Normalize the field value
for( int i = 0 ; i < n ; i++)
{
for( int j = 0 ; j < d ; j++)
{
F[i*d+j] /= W[i*d+j];
}
}
*/
/*
//Compute the energy
for( int i = 0 ; i < n ; i++)
{
double e = 0.0;
for( int j = 0 ; j < d ; j++)
{
out += F[i*d+j]*F[i*d+j];
}
}
*/
//delete[] F;
//delete[] W;
return out;
}
int enzyme_dup;
int enzyme_out;
int enzyme_const;
typedef double (*f_ptr)(double *,int,Index*);
extern double __enzyme_autodiff(f_ptr,
int, double *, double *,
int, int,
int, Index*);
// Driver: fills n random 3-D particles, builds the cell index, prints the
// first 100 cell ids, differentiates foo with Enzyme and prints the first
// 100 gradient triples.
int main() {
    std::mt19937 rng(42); // fixed seed
    std::uniform_real_distribution<> coord(0, 10);

    int n = 100000;
    int d = 3;
    const int total = n*d;
    double* x = new double[total];
    double* d_x = new double[total];
    for (int k = 0; k < total; ++k) {
        x[k] = coord(rng);
        d_x[k] = 0.0; // shadow starts at zero
    }

    Index index;
    buildIndex(index, x, n);
    for (int k = 0; k < 100; ++k)
        printf("cellId[%d] = %d\n ", k, index.cellId[k]);

    printf("before autodiff\n");
    __enzyme_autodiff(foo,
                      enzyme_dup, x, d_x,
                      enzyme_const, n,
                      enzyme_const, &index);

    for (int k = 0; k < 100; ++k) {
        const double* g = d_x + d*k; // gradient of particle k
        printf("dx[%d] = [%f, %f, %f]\n ", k, g[0], g[1], g[2]);
    }
}
Compiled with :
clang test2.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-7.so -O2 -o test2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -fno-exceptions
Hi,
I cloned, build and played around with enzyme and I am very happy so far!:)
I want to point out that enzyme is not only calculating gradients, but also implements the adjoint semantics (this is great news for me!).
So do you have plans to add examples for the (AD) community?
I don't want to create a pull request yet, so here is my first quick and dirty test (modified the sumAndMul function to become a speelpenning(ish) function with 2 outputs).
put the attached files into $(ENZYME_ROOT)/ad/test, calling
$make
results in correct input adjoints for calling dF 3 times with (1,0), (0,1) and (1,1) for the output adjoints (d_mul and d_out2).
./output.exe
d(output)/darray[0] = 24.000000
d(output)/darray[1] = 12.000000
d(output)/darray[2] = 8.000000
d(output)/darray[3] = 6.000000
d(output)/darray[0] = 48.000000
d(output)/darray[1] = 24.000000
d(output)/darray[2] = 16.000000
d(output)/darray[3] = 12.000000
d(output)/darray[0] = 72.000000
d(output)/darray[1] = 36.000000
d(output)/darray[2] = 24.000000
d(output)/darray[3] = 18.000000
Very nice!
first_example.zip
I was not expecting it to work, and even if it works I'm still not sure I'd be using it, as I'll probably have to rewrite it anyway to make sure the derivative of the exponential form is properly handled.
I tried to use http://versor.mat.ucsb.edu/ which is a header-only geometric algebra (aka Clifford algebra) library. It's basically an extension of the complex numbers, called multivectors which you can use to represent things like rotations for some mathematical spaces.
I tried to differentiate through it, and the compiler crashed with some issue related to global variable missing some attribute.
@_ZN3vsr11MultivectorINS_7algebraINS_6metricILi3ELi0ELi0ELb0EEEfEENS_5BasisIJLs3ELs5ELs6EEEEE2xyE = linkonce_odr dso_local global %"struct.vsr::Multivector.0" zeroinitializer, comdat, align 4
clang-11: ../Enzyme/GradientUtils.cpp:1909: llvm::Value* GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<>&): Assertion `0 && "cannot compute with global variable that doesn't have " "marked shadow global"' failed.
I don't know how the versor library works internally. There are probably some basis, (x,y,z) to represent euclidean spaces, from which they derive a basis to represent the multivectors, (1, x, y,z, x^y, x^z, y^z, x^y^z). Some of them are probably declared globally because they are taken as the reference, aka constant.
Usually there are formulas which allow a change of basis, which is just a matrix multiplication.
I'm not quite sure what exactly makes sense to do, whether or not we can consider those global basis element constant (aka derivative is zero), or create a shadow for them which would be ignored later on by the user. (Probably both can make sense depending on what you are working on)
It's probably related to issue #60
testVersor.cpp
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include "vsr/vsr.h"
using namespace vsr;
using namespace vsr::nga;
using Vec = vsr::euclidean_vector<3,float>; //<-- A 3 dimensional euclidean vector defined over floats
using Biv = vsr::euclidean_bivector<3,float>; //<-- A 3 dimensional bivector or "directed area element"
int enzyme_dup;
int enzyme_out;
int enzyme_const;
void __enzyme_autodiff(...);
//-L/home/darkblue/versor/build/lib -lvsr
// Rotates v with v.rotate( Biv::xy * .25 ) and returns the square of the
// rotated vector's norm(). This is the function the report tries to
// differentiate.
// NOTE(review): Biv::xy is a static member of the versor library; it is the
// global named in the Enzyme assertion quoted in this report.
double multivectorNorm( const Vec& v)
{
Vec v2 = v.rotate( Biv::xy * .25 );
double n = v2.norm();
return n*n;
}
// Smoke test: builds the vector (1,2,3), rotates it, prints the norms of the
// original and rotated vectors and the value of multivectorNorm, then prints
// a zero-initialised dv. The Enzyme call that would fill dv is commented out.
void testversor()
{
Vec v = Vec(1,2,3); //<-- A 3D vector at coordinates 1,2,3;
//v.print();
//v.rotate( Biv::xy * .25 ).print(); //<-- Rotate the vector in the xy plane and print result
Vec v2 = v.rotate( Biv::xy * .25 );
double n = v.norm();
// NOTE(review): the labels say "norm2" but the values printed are norm() —
// confirm whether norm() is already squared in versor.
cout <<"norm2 v " << n << endl;
cout <<"norm2 v2 " << v2.norm() << endl;
cout << "multivectorNorm " << multivectorNorm(v) << endl;
Vec dv = Vec(0,0,0);
// Disabled: enabling this call is what triggers the compiler crash reported.
//__enzyme_autodiff( &multivectorNorm,enzyme_dup, &v,&dv);
cout <<"dv "<< endl;
dv.print();
cout<<"testversor done" << endl;
}
// Entry point: runs the versor smoke test.
int main()
{
    testversor();
    return 0;
}
Compilation with :
clang testVersor.cpp -Ipathto/versor/include/ -DVSR_PRECISION_DOUBLE -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o testVersor -fno-exceptions
The Full error stack is 3000 lines long.
Begins with :
cannot shadow-inline global
@_ZN3vsr11MultivectorINS_7algebraINS_6metricILi3ELi0ELi0ELb0EEEfEENS_5BasisIJLs3ELs5ELs6EEEEE2xyE = linkonce_odr dso_local global %"struct.vsr::Multivector.0" zeroinitializer, comdat, align 4 due to %call1 = call { <2 x float>, float } @_ZNK3vsr11MultivectorINS_7algebraINS_6metricILi3ELi0ELi0ELb0EEEfEENS_5BasisIJLs1ELs2ELs4EEEEE6rotateINS5_IJLs3ELs5ELs6EEEEEES7_RKNS0_IS4_T_EE(%"struct.vsr::Multivector"* nonnull %v, %"struct.vsr::Multivector.0"* nonnull align 4 dereferenceable(12) %ref.tmp)``
and ends with :
@_ZN3vsr11MultivectorINS_7algebraINS_6metricILi3ELi0ELi0ELb0EEEfEENS_5BasisIJLs3ELs5ELs6EEEEE2xyE = linkonce_odr dso_local global %"struct.vsr::Multivector.0" zeroinitializer, comdat, align 4
clang-11: ../Enzyme/GradientUtils.cpp:1909: llvm::Value* GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<>&): Assertion `0 && "cannot compute with global variable that doesn't have " "marked shadow global"' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0. Program arguments: /home/username/usrlocal/bin/clang-11 -cc1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -main-file-name testVersor.cpp -mrelocation-model static -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /home/username/usrlocal/lib/clang/11.1.0 -I /home/username/versor/include/ -D VSR_PRECISION_DOUBLE -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /home/username/usrlocal/lib/clang/11.1.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -fdeprecated-macro -fdebug-compilation-dir /home/username/testenzyme -ferror-limit 19 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -load /usr/local/lib/ClangEnzyme-11.so -faddrsig -o /tmp/testVersor-7a3fdd.o -x c++ testVersor.cpp
1. <eof> parser at end of file
2. Per-module optimization passes
3. Running pass 'Enzyme Pass' on module 'testVersor.cpp'.
#0 0x0000563b94cf498a llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/home/username/usrlocal/bin/clang-11+0x1d0f98a)
#1 0x0000563b94cf2654 llvm::sys::RunSignalHandlers() (/home/username/usrlocal/bin/clang-11+0x1d0d654)
#2 0x0000563b94cf27a3 SignalHandler(int) (/home/username/usrlocal/bin/clang-11+0x1d0d7a3)
#3 0x00007f4f46611980 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12980)
#4 0x00007f4f452c2fb7 raise /build/glibc-S9d2JN/glibc-2.27/signal/../sysdeps/unix/sysv/linux/raise.c:51:0
#5 0x00007f4f452c4921 abort /build/glibc-S9d2JN/glibc-2.27/stdlib/abort.c:81:0
#6 0x00007f4f452b448a __assert_fail_base /build/glibc-S9d2JN/glibc-2.27/assert/assert.c:89:0
#7 0x00007f4f452b4502 (/lib/x86_64-linux-gnu/libc.so.6+0x30502)
#8 0x00007f4f44e784fc GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/usr/local/lib/ClangEnzyme-11.so+0x5284fc)
#9 0x00007f4f44e79228 GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/usr/local/lib/ClangEnzyme-11.so+0x529228)
#10 0x00007f4f44db10a0 DiffeGradientUtils::addToInvertedPtrDiffe(llvm::Value*, llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&, llvm::MaybeAlign) (/usr/local/lib/ClangEnzyme-11.so+0x4610a0)
#11 0x00007f4f44df5a78 AdjointGenerator<AugmentedReturn const*>::visitLoadInst(llvm::LoadInst&) (/usr/local/lib/ClangEnzyme-11.so+0x4a5a78)
#12 0x00007f4f44dd6e23 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visitLoad(llvm::LoadInst&) (/usr/local/lib/ClangEnzyme-11.so+0x486e23)
#13 0x00007f4f44dc9309 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction&) (/usr/local/lib/ClangEnzyme-11.so+0x479309)
#14 0x00007f4f44dbde3d llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction*) (/usr/local/lib/ClangEnzyme-11.so+0x46de3d)
#15 0x00007f4f44d9fae3 CreatePrimalAndGradient(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, bool, bool, llvm::Type*, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, AugmentedReturn const*, bool, bool, bool) (/usr/local/lib/ClangEnzyme-11.so+0x44fae3)
#16 0x00007f4f44d79378 bool (anonymous namespace)::Enzyme::HandleAutoDiff<llvm::CallInst>(llvm::CallInst*, llvm::TargetLibraryInfo&, llvm::AAResults&, bool) (/usr/local/lib/ClangEnzyme-11.so+0x429378)
#17 0x00007f4f44d754d2 (anonymous namespace)::Enzyme::lowerEnzymeCalls(llvm::Function&, bool, bool&, std::set<llvm::Function*, std::less<llvm::Function*>, std::allocator<llvm::Function*> >&) (/usr/local/lib/ClangEnzyme-11.so+0x4254d2)
#18 0x00007f4f44d75dfb (anonymous namespace)::Enzyme::runOnModule(llvm::Module&) (/usr/local/lib/ClangEnzyme-11.so+0x425dfb)
#19 0x0000563b946b4a81 llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/username/usrlocal/bin/clang-11+0x16cfa81)
#20 0x0000563b94f74c54 (anonymous namespace)::EmitAssemblyHelper::EmitAssembly(clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/username/usrlocal/bin/clang-11+0x1f8fc54)
#21 0x0000563b94f766f4 clang::EmitBackendOutput(clang::DiagnosticsEngine&, clang::HeaderSearchOptions const&, clang::CodeGenOptions const&, clang::TargetOptions const&, clang::LangOptions const&, llvm::DataLayout const&, llvm::Module*, clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/username/usrlocal/bin/clang-11+0x1f916f4)
#22 0x0000563b95b1edf5 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/home/username/usrlocal/bin/clang-11+0x2b39df5)
#23 0x0000563b966a2fe9 clang::ParseAST(clang::Sema&, bool, bool) (/home/username/usrlocal/bin/clang-11+0x36bdfe9)
#24 0x0000563b95b1efa8 clang::CodeGenAction::ExecuteAction() (/home/username/usrlocal/bin/clang-11+0x2b39fa8)
#25 0x0000563b95506d39 clang::FrontendAction::Execute() (/home/username/usrlocal/bin/clang-11+0x2521d39)
#26 0x0000563b954c167a clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/home/username/usrlocal/bin/clang-11+0x24dc67a)
#27 0x0000563b955d1486 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/home/username/usrlocal/bin/clang-11+0x25ec486)
#28 0x0000563b93bf20fc cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/home/username/usrlocal/bin/clang-11+0xc0d0fc)
#29 0x0000563b93bed479 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) (/home/username/usrlocal/bin/clang-11+0xc08479)
#30 0x0000563b93b70fd4 main (/home/username/usrlocal/bin/clang-11+0xb8bfd4)
#31 0x00007f4f452a5bf7 __libc_start_main /build/glibc-S9d2JN/glibc-2.27/csu/../csu/libc-start.c:344:0
#32 0x0000563b93becc4a _start (/home/username/usrlocal/bin/clang-11+0xc07c4a)
clang-11: error: unable to execute command: Aborted (core dumped)
clang-11: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 11.1.0
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/local/bin
clang-11: note: diagnostic msg:
https://github.com/tensor-compiler/taco
I want to make taco's tensor computation with auto-diff.
Is it possible?
It takes a while to check out this repo, which is quite large. Uncompressed the repo size is 2 GB, which makes it a bit slow to build dockerfiles with. To cut down on the initial download for users, it would be nice to distribute tarballs with either
This last option is orders of magnitude smaller, so it would be awesome. With a small amount of configuration this GitHub repo can produce such an archive for every git tag. Just add this to the .gitattributes:
/LICENSE export-ignore
/Readme.md export-ignore
/clang export-ignore
/clang-tools-extra export-ignore
/compiler-rt export-ignore
/contrib export-ignore
/debuginfo-tests export-ignore
/libcxx export-ignore
/libcxxabi export-ignore
/libunwind export-ignore
/lld export-ignore
/lldb export-ignore
/llvm export-ignore
/openmp export-ignore
/polly export-ignore
/tests export-ignore
/enzyme/benchmarks export-ignore
and push a tag to github
git tag v0.1.0
git push --tags origin
The benchmarks build would also need to be made an optional part of the CMakeLists
for this to work.
Because they're both based on LLVM, I was wondering what would be involved in integrating Numba with Enzyme, so that I could decorate a python function and get (optimized) gradients. I poked around the documentation a little but didn't see anything relevant.
We should add some information on this to the website
Hello,
I'm trying to sort a vector of integers inside the function I want to differentiate.
It should be a no operation in the backward pass.
But the compilation hangs.
Can you advise for the proper way to tell enzyme to just ignore some variables or function ?
Thanks
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <random>
#include <math.h>
#include <vector>
#include <algorithm>
using namespace std;
// Copies the n cell ids into a scratch vector and sorts it; the sorted data
// is not used afterwards. Returns parts[0].
double foo( double* __restrict__ parts,int n, int* cellId)
{
    std::vector<int> scratch(n);
    std::copy(cellId, cellId + n, scratch.begin());
    std::sort(scratch.begin(), scratch.end());
    return parts[0];
}
int enzyme_dup;
int enzyme_out;
int enzyme_const;
typedef double (*f_ptr)(double *,int,int*);
extern double __enzyme_autodiff(f_ptr,
int, double *, double *,
int, int,
int, int*);
// Driver: fills n*d random doubles, builds identity cell ids, differentiates
// foo with Enzyme and prints the first 100 entries of the shadow array.
int main() {
    srand(42);
    std::mt19937 rng(42); // fixed seed
    std::uniform_real_distribution<> coord(0, 10);

    int n = 100000;
    int d = 3;
    const int total = n*d;
    double* x = new double[total];
    double* d_x = new double[total];
    for (int k = 0; k < total; ++k) {
        x[k] = coord(rng);
        d_x[k] = 0.0; // shadow starts at zero
    }

    int* cellId = new int[n];
    for (int k = 0; k < n; ++k)
        cellId[k] = k;

    printf("before autodiff\n");
    __enzyme_autodiff(foo,
                      enzyme_dup, x, d_x,
                      enzyme_const, n,
                      enzyme_const, cellId);

    for (int k = 0; k < 100; ++k)
        printf("dx[%d] = %f\n", k, d_x[k]);
}
Compilation command :
clang test2.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-7.so -O2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o test2
There have been some updates to the memory behavior (specifically user-provided mallocs inside code being differentiated). We should investigate and ensure that these and relevant shadows do not memory leak.
The cache does not have a potential leak, as it maintains the property that every cache allocation is freed at the corresponding location in the reverse pass.
Error: opt: /efs/home/tfk/Enzyme-plugin/enzyme/Enzyme/Enzyme.cpp:2700: llvm::Value* GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<>&): Assertion `0 && "cannot find deal with ptr that isnt arg"' failed.
IR
; ModuleID = 'segfault.c'
source_filename = "segfault.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@.str = private unnamed_addr constant [36 x i8] c"hello! %f, res2 %f, da: %f, db: %f\0A\00", align 1
; Function Attrs: noinline nounwind optnone uwtable
; Loads the floats pointed to by %a and %b and returns *a when *a > *b
; (ordered compare, fcmp ogt), otherwise *b. The result is passed through the
; %retval stack slot.
define dso_local float @man_max(float* %a, float* %b) #0 {
entry:
%retval = alloca float, align 4
%a.addr = alloca float*, align 8
%b.addr = alloca float*, align 8
store float* %a, float** %a.addr, align 8
store float* %b, float** %b.addr, align 8
%0 = load float*, float** %a.addr, align 8
%1 = load float, float* %0, align 4
%2 = load float*, float** %b.addr, align 8
%3 = load float, float* %2, align 4
%cmp = fcmp ogt float %1, %3
br i1 %cmp, label %if.then, label %if.else
; *a is larger: reload it and stash it as the return value.
if.then: ; preds = %entry
%4 = load float*, float** %a.addr, align 8
%5 = load float, float* %4, align 4
store float %5, float* %retval, align 4
br label %return
; otherwise return *b.
if.else: ; preds = %entry
%6 = load float*, float** %b.addr, align 8
%7 = load float, float* %6, align 4
store float %7, float* %retval, align 4
br label %return
return: ; preds = %if.else, %if.then
%8 = load float, float* %retval, align 4
ret float %8
}
; Function Attrs: noinline nounwind optnone uwtable
; Calls @man_max(%a, %b) and stores the result through %ret.
; This is the function passed to __enzyme_autodiff in @main.
define dso_local void @compute_max(float* %a, float* %b, float* %ret) #0 {
entry:
%a.addr = alloca float*, align 8
%b.addr = alloca float*, align 8
%ret.addr = alloca float*, align 8
store float* %a, float** %a.addr, align 8
store float* %b, float** %b.addr, align 8
store float* %ret, float** %ret.addr, align 8
%0 = load float*, float** %a.addr, align 8
%1 = load float*, float** %b.addr, align 8
%call = call float @man_max(float* %0, float* %1)
%2 = load float*, float** %ret.addr, align 8
store float %call, float* %2, align 4
ret void
}
; Function Attrs: noinline norecurse optnone uwtable
; Driver: sets a = 2, b = 3, da = db = 0, ret = 0, dret = 1, runs @compute_max
; once directly, then calls __enzyme_autodiff on it with each pointer followed
; by its shadow, and finally prints ret (twice), da and db.
define dso_local i32 @main(i32 %argc, i8** %argv) #1 {
entry:
%retval = alloca i32, align 4
%argc.addr = alloca i32, align 4
%argv.addr = alloca i8**, align 8
%a = alloca float, align 4
%b = alloca float, align 4
%da = alloca float, align 4
%db = alloca float, align 4
%ret = alloca float, align 4
%dret = alloca float, align 4
store i32 0, i32* %retval, align 4
store i32 %argc, i32* %argc.addr, align 4
store i8** %argv, i8*** %argv.addr, align 8
store float 2.000000e+00, float* %a, align 4
store float 3.000000e+00, float* %b, align 4
store float 0.000000e+00, float* %da, align 4
store float 0.000000e+00, float* %db, align 4
store float 0.000000e+00, float* %ret, align 4
store float 1.000000e+00, float* %dret, align 4
call void @compute_max(float* %a, float* %b, float* %ret)
; Arguments are (primal, shadow) pairs for a, b and ret.
%0 = call double (...) @__enzyme_autodiff.f64(void (float*, float*, float*)* @compute_max, float* %a, float* %da, float* %b, float* %db, float* %ret, float* %dret)
; printf is variadic, so each float is widened to double first.
%1 = load float, float* %ret, align 4
%conv = fpext float %1 to double
%2 = load float, float* %ret, align 4
%conv1 = fpext float %2 to double
%3 = load float, float* %da, align 4
%conv2 = fpext float %3 to double
%4 = load float, float* %db, align 4
%conv3 = fpext float %4 to double
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str, i32 0, i32 0), double %conv, double %conv1, double %conv2, double %conv3)
ret i32 0
}
declare double @__enzyme_autodiff.f64(...)
declare dso_local i32 @printf(i8*, ...) #2
attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 7.1.0 "}
Hello,
I'm trying to build a proto-neural-network with Enzyme, i.e. two successive matrix-vector products.
I tried to keep the code as simple and minimalist as possible.
The code runs fine but when I pass -Rpass=enzyme it indicates that it's caching and recomputing whereas it shouldn't need any memory allocation, as I'm preallocating the intermediate buffers, nor recomputation as I'm preserving the intermediate layers.
I have put restrict everywhere I can, but what am I doing wrong ?
Thanks
bugDense.cpp
#include <iostream>
using namespace std;
extern int enzyme_dup;
extern int enzyme_dupnoneed;
extern int enzyme_out;
extern int enzyme_const;
void __enzyme_autodiff(...);
// Sets v[0..n-1] to 0.0; does nothing when n <= 0.
inline void zero( double*__restrict__ v, int n)
{
    int idx = 0;
    while( idx < n )
    {
        v[idx] = 0.0;
        ++idx;
    }
}
// Matrix-vector product: out = A * x, where A is n-by-m in row-major order,
// x has m entries and out has n entries. out is cleared first and each entry
// is then accumulated in place, column by column.
void dense( double*__restrict__ A, double* __restrict__ x, double* __restrict__ out, int n, int m)
{
    zero( out, n );
    for( int row = 0 ; row < n ; ++row )
    {
        for( int col = 0 ; col < m ; ++col )
        {
            out[row] += A[row*m+col] * x[col];
        }
    }
}
// Fills v[0..n-1] with 1, 2, ..., n; does nothing when n <= 0.
inline void rangep1( double*__restrict__ v, int n)
{
    int idx = 0;
    while( idx < n )
    {
        v[idx] = idx+1;
        ++idx;
    }
}
// Returns x multiplied by itself.
template<typename T>
T sq( T x)
{
    T squared = x*x;
    return squared;
}
// Prints the n entries of x one per line, followed by an empty line.
inline void printVector( double*__restrict__ x, int n )
{
    int idx = 0;
    while( idx < n )
    {
        std::cout << x[idx] << std::endl;
        ++idx;
    }
    std::cout << std::endl;
}
// Prints the row-major n1-by-n2 matrix A, one row per line, with a single
// space after every entry.
inline void printMatrix( double*__restrict__ A, int n1, int n2 )
{
    for( int row = 0 ; row < n1 ; ++row )
    {
        for( int col = 0 ; col < n2 ; ++col )
            std::cout << A[row*n2+col] << " ";
        std::cout << std::endl;
    }
}
// Trainable parameters of the two-layer example: two heap-allocated
// row-major matrices.
// Note the argument order: (featDim, d).
// NOTE(review): no destructor is declared, so A and B are never freed by this
// class.
class Fun2Params
{
public:
// Allocates A (featDim*d doubles) and B (featDim*featDim doubles) and fills
// each with 1, 2, 3, ... via rangep1.
Fun2Params(int featDim, int d)
{
A = new double[featDim*d];
B = new double[featDim*featDim];
rangep1(A,featDim*d);
rangep1(B,featDim*featDim);
}
double* __restrict__ A; // featDim*d entries
double* __restrict__ B; // featDim*featDim entries
};
// Preallocated buffers for the two intermediate layer outputs.
// NOTE(review): no destructor is declared, so y0 and y1 are never freed by
// this class.
class Fun2Memory
{
public:
// Allocates y0 and y1 with featDim doubles each and zero-fills both.
Fun2Memory(int featDim)
{
y0 = new double[featDim];
y1 = new double[featDim];
zero(y0,featDim);
zero(y1,featDim);
}
double* __restrict__ y0; // output of the first dense layer (featDim entries)
double* __restrict__ y1; // output of the second dense layer (featDim entries)
};
// Non-differentiated inputs of the example: the dimensions and the input
// vector p.
// NOTE(review): no destructor is declared, so p is never freed by this class.
class Fun2
{
public:
// Stores the dimensions and allocates p with d doubles set to 1, 2, ..., d.
Fun2(int featDim, int d):featDim(featDim),d(d)
{
p = new double[d];
rangep1(p,d);
}
double* __restrict__ p; // input vector, d entries
int featDim; // number of rows of both layers
int d; // length of the input vector p
};
// Two successive dense layers followed by a sum of squares:
//   y0 = A * p,  y1 = B * y0,  *out = sum_i ( y0[i]^2 + y1[i]^2 ).
// x holds the matrices, y the preallocated layer outputs, parameters the
// dimensions and the input vector.
void structuredFun2 (Fun2Params* __restrict__ x, Fun2Memory* __restrict__ y, double* __restrict__ out ,Fun2* __restrict__ parameters )
{
    const int d = parameters->d;
    const int featDim = parameters->featDim;
    printf("featDim %d\n", featDim);

    dense( x->A, parameters->p, y->y0, featDim, d );
    dense( x->B, y->y0, y->y1, featDim, featDim );

    double energy = 0.0;
    for( int i = 0 ; i < featDim ; ++i )
    {
        energy += sq(y->y0[i]);
        energy += sq(y->y1[i]);
    }
    *out = energy;
}
void testFun2()
{
int d = 2;
int featDim = 6;
Fun2Params fp(d,featDim);
Fun2Params dfp(d,featDim);
Fun2Memory fm(featDim);
Fun2Memory dfm(featDim);
Fun2 fun2(featDim,d);
double dout = 1.0;
double out=0.0;
__enzyme_autodiff(structuredFun2,
enzyme_dup, &fp,&dfp,
enzyme_dup, &fm ,&dfm,
enzyme_dup,&out,&dout,
enzyme_const, fun2);
cout << "out " << endl;
cout << out << endl;
cout << "dfp.A " << endl;
printMatrix( dfp.A,featDim,d);
cout << "dfp.B" << endl;
printMatrix( dfp.B,featDim,featDim);
cout << endl;
}
// Entry point: prints a banner and runs the two-layer test.
int main(int argc, char** argv )
{
    std::cout << "bugDense " << std::endl;
    testFun2();
    return 0;
}
Compilation with :
clang bugDense.cpp -lstdc++ -lm -fno-exceptions -Rpass=enzyme -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o bugDense
Output :
remark: Load may need caching %arrayidx9.promoted.i = load double, double* %arrayidx9.i, align 8, !tbaa !44, !alias.scope !46, !noalias !38 due to store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load may need caching %9 = load double, double* %arrayidx.i, align 8, !dbg !53, !tbaa !44, !alias.scope !54, !noalias !55 due to store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
out[i] += A[i*m+j] *x[j];
^
bugDense.cpp:21:31: remark: Load may need caching %10 = load double, double* %arrayidx6.i, align 8, !dbg !56, !tbaa !44, !alias.scope !57, !noalias !58 due to store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
out[i] += A[i*m+j] *x[j];
^
remark: Load may need caching %arrayidx9.promoted.i45 = load double, double* %arrayidx9.i44, align 8, !tbaa !44, !alias.scope !80, !noalias !75 due to store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load may need caching %15 = load double, double* %arrayidx.i53, align 8, !dbg !89, !tbaa !44, !alias.scope !90, !noalias !91 due to store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
out[i] += A[i*m+j] *x[j];
^
bugDense.cpp:21:31: remark: Load may need caching %16 = load double, double* %arrayidx6.i54, align 8, !dbg !92, !tbaa !44, !alias.scope !93, !noalias !94 due to store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
out[i] += A[i*m+j] *x[j];
^
bugDense.cpp:21:31: remark: Load must be recomputed %10 = load double, double* %arrayidx6.i, align 8, !dbg !56, !tbaa !44, !alias.scope !57, !noalias !58 in reverse_invertfor.body4.i due to store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Caching instruction %13 = load double, double* %arrayidx6.i, align 8, !dbg !55, !tbaa !44, !alias.scope !56, !noalias !57 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Load must be recomputed %16 = load double, double* %arrayidx6.i54, align 8, !dbg !92, !tbaa !44, !alias.scope !93, !noalias !94 in reverse_invertfor.body4.i59 due to store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Caching instruction %31 = load double, double* %arrayidx6.i54, align 8, !dbg !93, !tbaa !45, !alias.scope !94, !noalias !95 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load must be recomputed %15 = load double, double* %arrayidx.i53, align 8, !dbg !89, !tbaa !44, !alias.scope !90, !noalias !91 in reverse_invertfor.body4.i59 due to store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
out[i] += A[i*m+j] *x[j];
^
bugDense.cpp:21:21: remark: Caching instruction %35 = load double, double* %arrayidx.i53, align 8, !dbg !91, !tbaa !45, !alias.scope !92, !noalias !93 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugDense
featDim 6
out
5.72725e+08
dfp.A
7.83671e+06 1.56734e+07
9.69974e+06 1.93995e+07
1.76338e+07 3.52676e+07
2.8622e+07 5.7244e+07
4.65207e+07 9.30413e+07
7.4193e+07 1.48386e+08
dfp.B
1.19345e+06 2.62559e+06 4.05773e+06 5.48987e+06 6.92201e+06 8.35415e+06
2.6271e+06 5.77963e+06 8.93216e+06 1.20847e+07 1.52372e+07 1.83897e+07
4.0644e+06 8.94168e+06 1.38191e+07 1.86968e+07 2.35748e+07 2.84511e+07
5.48659e+06 1.20907e+07 1.86488e+07 2.52308e+07 7.83671e+06 9.69973e+06
1.76338e+07 2.8622e+07 4.65206e+07 7.4193e+07 4.01854e+07 4.84996e+07
238690 525420 812880 1.09699e+06 1.3857e+06 1.71612e+06
I tried this simple input sin2.ll
:
; ModuleID = '<stdin>'
source_filename = "<stdin>"
; Function Attrs: norecurse nounwind readnone
; Returns %x * %x.
define double @tester(double %x) #0 {
entry:
%0 = fmul double %x, %x
ret double %0
}
; Passes @tester and %x to __enzyme_autodiff and returns the call's result.
define double @test_derivative(double %x) local_unnamed_addr {
entry:
%0 = tail call double (double (double)*, ...) @__enzyme_autodiff(double (double)* nonnull @tester, double %x)
ret double %0
}
declare double @__enzyme_autodiff(double (double)*, ...) local_unnamed_addr
attributes #0 = { norecurse nounwind readnone }
and run it through like this:
opt-6.0 < sin2.ll -load ../../build/Enzyme/LLVMEnzyme-6.so -enzyme -enzyme_preopt=false -O3 -S
with the result:
; ModuleID = '<stdin>'
source_filename = "<stdin>"
; Function Attrs: norecurse nounwind readnone
define double @tester(double %x) local_unnamed_addr #0 {
entry:
%0 = fmul double %x, %x
ret double %0
}
; Function Attrs: norecurse nounwind readnone
define double @test_derivative(double %x) local_unnamed_addr #0 {
entry:
%factor.i = fmul fast double %x, 2.000000e+00
ret double %factor.i
}
; Function Attrs: norecurse nounwind readnone
define double @preprocess_tester(double %x) local_unnamed_addr #0 {
entry:
%0 = fmul double %x, %x
ret double %0
}
attributes #0 = { norecurse nounwind readnone }
I can see that it left @tester intact, which returns x^2. Then it optimized out the @__enzyme_autodiff call to just return 2*x.
But why did it create the @preprocess_tester function?
Hello,
I tried to follow : https://enzyme.mit.edu/Installation/
I am on ubuntu 18.04.
cmake --version
cmake version 3.19.4
gcc --version
gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
g++ --version
g++ (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
I had to do the following to make sure cmake uses the right compiler (otherwise it uses gcc-4.9 and fails when it needs C++17):
export CC=/usr/bin/gcc
export CXX=/usr/bin/g++
LLVM was compiled successfully with :
cd ~/Enzyme
mkdir build && cd build
cmake -G Ninja ../llvm -DLLVM_TARGETS_TO_BUILD="host" -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_PLUGINS=ON -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON
ninja
I even ran sudo ninja install for good measure.
~/Enzyme/enzyme/build$ cmake -G Ninja .. -DLLVM_DIR=../../build/lib/cmake/llvm
LLVM_SHLIBEXT=.so
found llvm dir /home/username/Enzyme/build/lib/cmake/llvm
found llvm lit /home/username/Enzyme/enzyme/build
CMAKE_PREFIX_PATH /home/username/Enzyme/build/lib/cmake/llvm
-- Linker detection: GNU ld
LLVM_INSTALL_PREFIX:
LLVM_INCLUDE_DIRS: /home/username/Enzyme/llvm/include;/home/username/Enzyme/build/include
found llvm definitions -D_GNU_SOURCE -D_DEBUG -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS
found llvm version 7
first llvm include directory/home/username/Enzyme/llvm/include
found bench flags: -I/home/username/Enzyme/enzyme/build/benchmarks/adept2/src/adept2/include -I/home/username/Enzyme/enzyme/build/benchmarks/tapenade/src/tapenade
-- Configuring done
-- Generating done
-- Build files have been written to: /home/username/Enzyme/enzyme/build
~/Enzyme/enzyme/build$ ninja
[19/43] Linking CXX shared library Enzyme/libEnzyme-7.so
FAILED: Enzyme/libEnzyme-7.so
: && /usr/bin/g++ -fPIC -Wall -fPIC -fno-rtti -shared -Wl,-soname,libEnzyme-7.so -o Enzyme/libEnzyme-7.so Enzyme/CMakeFiles/Enzyme-7.dir/ActivityAnalysis.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/CApi.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/CacheUtility.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/Enzyme.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/EnzymeLogic.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/FunctionUtils.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/GradientUtils.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/MustExitScalarEvolution.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/Utils.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/SCEV/ScalarEvolutionExpander.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/TypeAnalysis/TypeTree.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/TypeAnalysis/TypeAnalysis.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/TypeAnalysis/TypeAnalysisPrinter.cpp.o -lLLVM && :
/usr/bin/ld: cannot find -lLLVM
collect2: error: ld returned 1 exit status
[28/43] Building CXX object Enzyme/CMa.../ClangEnzyme-7.dir/FunctionUtils.cpp.o
ninja: build stopped: subcommand failed.
It seems the linker can't find the LLVM libraries: there is no libLLVM.a, but there are plenty of LLVM libraries in both
~/Enzyme/build/lib and /usr/local/lib/
~/Enzyme/build/lib$ ls libLLVM*
ls /usr/local/lib/libLLVM*
libLLVMAggressiveInstCombine.a libLLVMMC.a
libLLVMAnalysis.a libLLVMMCDisassembler.a
libLLVMAsmParser.a libLLVMMCJIT.a
libLLVMAsmPrinter.a libLLVMMCParser.a
libLLVMBinaryFormat.a libLLVMMIRParser.a
libLLVMBitReader.a libLLVMObjCARCOpts.a
libLLVMBitWriter.a libLLVMObject.a
libLLVMCFIVerify.a libLLVMObjectYAML.a
libLLVMCodeGen.a libLLVMOption.a
libLLVMCore.a libLLVMOrcJIT.a
libLLVMCoroutines.a libLLVMPasses.a
libLLVMCoverage.a libLLVMProfileData.a
libLLVMDebugInfoCodeView.a libLLVMRuntimeDyld.a
libLLVMDebugInfoDWARF.a libLLVMScalarOpts.a
libLLVMDebugInfoMSF.a libLLVMSelectionDAG.a
libLLVMDebugInfoPDB.a libLLVMSupport.a
libLLVMDemangle.a libLLVMSymbolize.a
libLLVMDlltoolDriver.a libLLVMTableGen.a
libLLVMExecutionEngine.a libLLVMTarget.a
libLLVMExegesis.a libLLVMTestingSupport.a
libLLVMExegesisX86.a libLLVMTransformUtils.a
libLLVMFuzzMutate.a libLLVMVectorize.a
libLLVMGlobalISel.a libLLVMWindowsManifest.a
libLLVMInstCombine.a libLLVMX86AsmParser.a
libLLVMInstrumentation.a libLLVMX86AsmPrinter.a
libLLVMInterpreter.a libLLVMX86CodeGen.a
libLLVMipo.a libLLVMX86Desc.a
libLLVMIRReader.a libLLVMX86Disassembler.a
libLLVMLibDriver.a libLLVMX86Info.a
libLLVMLineEditor.a libLLVMX86Utils.a
libLLVMLinker.a libLLVMXRay.a
libLLVMLTO.a
Can you please advise?
Thanks
Hello,
I tried to use the Posit representation for floating-point numbers.
I grabbed a header-only library that provides it as a drop-in replacement for double:
git clone https://github.com/stillwater-sc/universal
testPosit.cpp
#include <iostream>
using namespace std;
// https://github.com/stillwater-sc/universal
#pragma clang diagnostic ignored "-Wc++17-extensions"
#include <universal/number/posit/posit.hpp>
extern int enzyme_dup;
extern int enzyme_dupnoneed;
extern int enzyme_out;
extern int enzyme_const;
void __enzyme_autodiff(...);
// Returns x multiplied by itself, using T's own operator*.
template <typename T>
T sq(T x)
{
    T squared = x * x;
    return squared;
}
// Reads only x[0], passes (x[0] - 1.0) to sq(), and stores the result in *out.
template<typename T>
void fun1( T* x, T* out )
{
*out = sq(x[0] - 1.0);
}
// Placeholder kernel: returns the product of a and b.
template <typename Real>
Real MyKernel(const Real& a, const Real& b)
{
    // Replace this product with your kernel computation.
    Real product = a * b;
    return product;
}
constexpr double pi = 3.14159265358979323846;
using Real = sw::universal::posit<32,2>;
// Driver for the posit reproducer: prints a MyKernel product, evaluates fun1
// directly, then hands fun1<Real> to __enzyme_autodiff and prints out and dx.
int main(int argc, char** argv )
{
cout << "testMemoryAllocator "<< endl;
Real a = sqrt(2);
Real b = pi;
std::cout << "Result: " << MyKernel(a, b) << std::endl;
Real x = 3.0;
Real out = 0.0;
{
// Plain (non-differentiated) evaluation of fun1 at x.
out = 0.0;
fun1( &x, &out );
cout << "out " << endl;
cout << out << endl;
}
{
// Differentiated call: dx starts at 0 and gout at 1. This `out` shadows the
// outer one. NOTE(review): each enzyme_dup presumably marks the pointer pair
// that follows it as value + derivative storage; confirm against Enzyme docs.
Real dx = 0.0;
Real gout = 1.0;
Real out = 0.0;
__enzyme_autodiff( fun1<Real>, enzyme_dup, &x, &dx,
enzyme_dup,&out,&gout );
cout << "out " << endl;
cout << out << endl;
cout << "dx " << endl;
cout << dx << endl;
}
}
clang testPosit.cpp -I/home/username/universal/include -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o testPosit
Here is the compilation log :
errorPosit.log
Thanks
Are integer types supported? I only get the expected answer for floats or doubles.
Correct answer with type 'float':
#include <stdio.h>
#include <stdint.h>
#define DTYPE float
extern DTYPE __enzyme_autodiff(void*, DTYPE);
// Returns x multiplied by itself, computed in DTYPE.
DTYPE square(DTYPE x) {
  const DTYPE squared = x * x;
  return squared;
}
// Hands square and x to __enzyme_autodiff and returns the value it produces.
DTYPE dsquare(DTYPE x) {
  const DTYPE derivative = __enzyme_autodiff(square, x);
  return derivative;
}
// Prints square(i) and dsquare(i) for i = 1, 2, 3, 4.
int main() {
  DTYPE i = 1;
  while (i < 5) {
    printf("square(%f)=%f, dsquare(%f)=%f\n", (float) i, (float) square(i), (float) i, (float) dsquare(i));
    i++;
  }
}
square(1.000000)=1.000000, dsquare(1.000000)=2.000000
square(2.000000)=4.000000, dsquare(2.000000)=4.000000
square(3.000000)=9.000000, dsquare(3.000000)=6.000000
square(4.000000)=16.000000, dsquare(4.000000)=8.000000
Incorrect answer with type 'int':
#include <stdio.h>
#include <stdint.h>
#define DTYPE int
extern DTYPE __enzyme_autodiff(void*, DTYPE);
// Returns x multiplied by itself, computed in DTYPE.
DTYPE square(DTYPE x) {
  const DTYPE squared = x * x;
  return squared;
}
// Hands square and x to __enzyme_autodiff and returns the value it produces.
DTYPE dsquare(DTYPE x) {
  const DTYPE derivative = __enzyme_autodiff(square, x);
  return derivative;
}
// Prints square(i) and dsquare(i) for i = 1, 2, 3, 4.
int main() {
  DTYPE i = 1;
  while (i < 5) {
    printf("square(%f)=%f, dsquare(%f)=%f\n", (float) i, (float) square(i), (float) i, (float) dsquare(i));
    i++;
  }
}
square(1.000000)=1.000000, dsquare(1.000000)=0.000000
square(2.000000)=4.000000, dsquare(2.000000)=0.000000
square(3.000000)=9.000000, dsquare(3.000000)=0.000000
square(4.000000)=16.000000, dsquare(4.000000)=0.000000
Hello,
I was expecting the following to run without any allocations or tape usage, but it crashes.
bugStackAlloc.cpp
#include <iostream>
using namespace std;
extern int enzyme_dup;
extern int enzyme_dupnoneed;
extern int enzyme_out;
extern int enzyme_const;
void __enzyme_autodiff(...);
template<typename T>
void comp( double* __restrict__ x, double* __restrict__ out);
const int nbiter = 100000000;
const int d = 30;
// Fun1 variant: fills a d-element stack buffer with x[0] * i, then adds the
// squared difference of consecutive entries (buf[i] - buf[i+1]) to *out for
// i = 1 .. d-2. Only x[0] is read.
class Fun1{};template<> void comp<Fun1>( double* __restrict__ x, double* __restrict__ out)
{
double buf[d];
double xinp = x[0];
for( int i = 0 ; i < d ; i++)
{
buf[i] = xinp * i;
}
// Starts at 1 and stops at d-2 so that buf[i+1] stays inside the buffer.
for( int i =1 ; i < d-1 ; i++)
{
*out += (buf[i]-buf[i+1] )*(buf[i]-buf[i+1] ) ;
}
}
// Fun2 variant: calls comp<Fun1> nbiter times on the same x and out, so *out
// accumulates across iterations.
class Fun2{};template<> void comp<Fun2>( double* __restrict__ x, double*__restrict__ out)
{
for( int j = 0; j < nbiter ; j++)
{
comp<Fun1>(x,out);
}
}
// Fun3 variant: the Fun1 computation written inline inside the nbiter loop.
// Each iteration declares its own d-element buffer, sums the squared
// differences into a local `temp`, and adds temp to *out once per iteration.
class Fun3{};template<> void comp<Fun3>( double*__restrict__ x, double* __restrict__ out)
{
double xinp = x[0];
for( int j = 0; j < nbiter ; j++)
{
double buf[d];
double temp = 0.0;
for( int i = 0 ; i < d ; i++)
{
buf[i] = xinp * i;
}
// Same index range as in comp<Fun1>: buf[i+1] never exceeds buf[d-1].
for( int i =1 ; i < d-1 ; i++)
{
temp += (buf[i]-buf[i+1] ) * (buf[i]-buf[i+1] ) ;
}
*out += temp;
}
}
// Runs comp<T> once directly and once through __enzyme_autodiff, printing the
// results of each run to cout.
template< typename T>
void demo( )
{
double x = 3.0;
double out = 0.0;
{
// Plain evaluation of comp<T> at x.
out = 0.0;
comp<T>( &x, &out );
cout << "out without enzyme" << endl;
cout << out << endl;
}
{
// Differentiated call: dx starts at 0 and gout at 1. This `out` shadows the
// outer one. NOTE(review): each enzyme_dup presumably marks the pointer pair
// that follows it as value + derivative storage; confirm against Enzyme docs.
double dx = 0.0;
double gout = 1.0;
double out = 0.0;
__enzyme_autodiff( comp<T>, enzyme_dup, &x, &dx,
enzyme_dup,&out,&gout);
cout << "out with enzyme" << endl;
cout << out << endl;
cout << "dx with enzyme" << endl;
cout << dx << endl;
}
}
// Runs demo<> for Fun1, Fun2 and Fun3 in turn. In the log quoted in this
// report, the Fun1 case prints its results and the process is "Killed" during
// the Fun2 case, so the Fun3 case is never reached in that log.
int main(int argc, char** argv )
{
cout << "testMemoryAllocator "<< endl;
cout << "demofun1() " << endl;
demo<Fun1>();
cout << "demofun2() " << endl;
demo<Fun2>(); // Fails
cout << "demofun3() " << endl;
demo<Fun3>(); // Fails
}
clang bugStackAlloc.cpp -lstdc++ -lm -fno-exceptions -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o bugStackAlloc
testMemoryAllocator
demofun1()
out without enzyme
252
out with enzyme
252
dx with enzyme
168
demofun2()
out without enzyme
2.52e+10
Killed
It compiles fine and runs fine until it gets killed due to excessive memory usage.
I tried to use complex numbers with functions like std::abs and std::arg (it works with .real() and .imag()).
This makes the compiler crash with the error:
clang-11: ../Enzyme/EnzymeLogic.cpp:1181: const AugmentedReturn& CreateAugmentedPrimal(llvm::Function*, DIFFE_TYPE, const std::vector<DIFFE_TYPE>&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, const FnTypeInfo&, std::map<llvm::Argument*, bool>, bool, bool, bool, bool): Assertion `0 && "attempting to differentiate function without definition"' failed.
I don't know exactly where the standard complex library is defined, or whether or not it is header-only, but it seems Enzyme can't get its source code.
complex.cpp
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <complex> // std::complex, std::abs, std::arg
int enzyme_dup;
int enzyme_out;
int enzyme_const;
void __enzyme_autodiff(...);
using namespace std;
// Returns theta * theta where theta = std::abs(c), i.e. the squared magnitude
// of c. In the module dump quoted in this report, the std::abs call appears as
// a call to the external function @cabs, which has a declaration but no body.
double h( const complex<double>& c )
{
// Alternative definition of theta kept for experimentation (real/imag parts).
//double theta = c.real()+3*c.imag();
double theta = std::abs(c);
// Alternative definition of theta kept for experimentation (argument of c).
//double theta = arg(c);
return theta * theta;
}
// Real-valued counterpart of h: returns std::abs(c) squared for a double.
double h2( const double& c )
{
double theta = std::abs(c);
return theta * theta;
}
// Driver for the complex-number reproducer: prints the polar form and h() of
// 3+4i, then requests derivatives of h2 (real input) and h (complex input)
// through __enzyme_autodiff and prints the complex gradient dc.
int main ()
{
std::complex<double> mycomplex (3.0,4.0);
std::cout << "The polar form of " << mycomplex;
std::cout << " is " << abs(mycomplex) << "*e^i*" << arg(mycomplex) << "rad\n";
std::cout << "energy of particle is : " << h(mycomplex) << endl;
// dc is the zero-initialised storage handed to Enzyme alongside mycomplex.
std::complex<double> dc(0.0,0.0);
double x = -3.0;
double dx = 0.0;
//Works with real number
__enzyme_autodiff(&h2, enzyme_dup, &x,&dx);
//Compilation fails for complex number
//"attempting to differentiate function without definition"' failed.
__enzyme_autodiff(&h, enzyme_dup, &mycomplex,&dc);
std::cout << "grad energy of particle is : " << dc << endl;
return 0;
}
Compilation with :
clang complex.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o complex -fno-exceptions
Full Error message :
mod: ; ModuleID = 'complex.cpp'
source_filename = "complex.cpp"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%"class.std::ios_base::Init" = type { i8 }
%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_ios" }
%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* }
%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, %"struct.std::ios_base::_Words"*, %"class.std::locale" }
%"struct.std::ios_base::_Callback_list" = type { %"struct.std::ios_base::_Callback_list"*, void (i32, %"class.std::ios_base"*, i32)*, i32, i32 }
%"struct.std::ios_base::_Words" = type { i8*, i64 }
%"class.std::locale" = type { %"class.std::locale::_Impl"* }
%"class.std::locale::_Impl" = type { i32, %"class.std::locale::facet"**, i64, %"class.std::locale::facet"**, i8** }
%"class.std::locale::facet" = type <{ i32 (...)**, i32, [4 x i8] }>
%"class.std::basic_streambuf" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"class.std::locale" }
%"class.std::ctype" = type <{ %"class.std::locale::facet.base", [4 x i8], %struct.__locale_struct*, i8, [7 x i8], i32*, i32*, i16*, i8, [256 x i8], [256 x i8], i8, [6 x i8] }>
%"class.std::locale::facet.base" = type <{ i32 (...)**, i32 }>
%struct.__locale_struct = type { [13 x %struct.__locale_data*], i16*, i32*, i32*, [13 x i8*] }
%struct.__locale_data = type opaque
%"class.std::num_put" = type { %"class.std::locale::facet.base", [4 x i8] }
%"class.std::num_get" = type { %"class.std::locale::facet.base", [4 x i8] }
%"struct.std::complex" = type { { double, double } }
@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1
@__dso_handle = external hidden global i8
@enzyme_dup = dso_local local_unnamed_addr global i32 0, align 4
@enzyme_out = dso_local local_unnamed_addr global i32 0, align 4
@enzyme_const = dso_local local_unnamed_addr global i32 0, align 4
@_ZSt4cout = external dso_local global %"class.std::basic_ostream", align 8
@.str = private unnamed_addr constant [19 x i8] c"The polar form of \00", align 1
@.str.1 = private unnamed_addr constant [5 x i8] c" is \00", align 1
@.str.2 = private unnamed_addr constant [6 x i8] c"*e^i*\00", align 1
@.str.3 = private unnamed_addr constant [5 x i8] c"rad\0A\00", align 1
@.str.4 = private unnamed_addr constant [25 x i8] c"energy of particle is : \00", align 1
@.str.5 = private unnamed_addr constant [30 x i8] c"grad energy of particle is : \00", align 1
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_complex.cpp, i8* null }]
declare dso_local void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) unnamed_addr #0
; Function Attrs: nounwind
declare dso_local void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) unnamed_addr #1
; Function Attrs: nofree nounwind
declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) local_unnamed_addr #2
; Function Attrs: nofree nounwind uwtable
define dso_local double @_Z1hRKSt7complexIdE(%"struct.std::complex"* nocapture nonnull readonly align 8 dereferenceable(16) %c) #3 {
entry:
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %c, i64 0, i32 0, i32 0
%_M_value.real.i.i = load double, double* %_M_value.realp.i.i, align 8
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %c, i64 0, i32 0, i32 1
%_M_value.imag.i.i = load double, double* %_M_value.imagp.i.i, align 8
%call.i.i = tail call double @cabs(double %_M_value.real.i.i, double %_M_value.imag.i.i) #11
%mul = fmul double %call.i.i, %call.i.i
ret double %mul
}
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #4
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #4
; Function Attrs: nounwind readonly uwtable
define dso_local double @_Z2h2RKd(double* nocapture nonnull readonly align 8 dereferenceable(8) %c) #5 {
entry:
%0 = load double, double* %c, align 8, !tbaa !2
%mul = fmul double %0, %0
ret double %mul
}
; Function Attrs: norecurse nounwind uwtable
define dso_local i32 @main() local_unnamed_addr #6 {
entry:
%mycomplex = alloca %"struct.std::complex", align 8
%dc = alloca %"struct.std::complex", align 8
%x = alloca double, align 8
%dx = alloca double, align 8
%0 = bitcast %"struct.std::complex"* %mycomplex to i8*
call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %0) #11
%_M_value.realp.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %mycomplex, i64 0, i32 0, i32 0
%_M_value.imagp.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %mycomplex, i64 0, i32 0, i32 1
store double 3.000000e+00, double* %_M_value.realp.i, align 8
store double 4.000000e+00, double* %_M_value.imagp.i, align 8
%call1.i17 = tail call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) @_ZSt4cout, i8* nonnull getelementptr inbounds ([19 x i8], [19 x i8]* @.str, i64 0, i64 0), i64 18) #11
%call1 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZStlsIdcSt11char_traitsIcEERSt13basic_ostreamIT0_T1_ES6_RKSt7complexIT_E(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) @_ZSt4cout, %"struct.std::complex"* nonnull align 8 dereferenceable(16) %mycomplex) #11
%call1.i19 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) @_ZSt4cout, i8* nonnull getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i64 4) #11
%_M_value.real.i.i = load double, double* %_M_value.realp.i, align 8
%_M_value.imag.i.i = load double, double* %_M_value.imagp.i, align 8
%call.i.i20 = call double @cabs(double %_M_value.real.i.i, double %_M_value.imag.i.i) #11
%call.i21 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull @_ZSt4cout, double %call.i.i20) #11
%call1.i23 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) %call.i21, i8* nonnull getelementptr inbounds ([6 x i8], [6 x i8]* @.str.2, i64 0, i64 0), i64 5) #11
%_M_value.real.i.i25 = load double, double* %_M_value.realp.i, align 8
%_M_value.imag.i.i27 = load double, double* %_M_value.imagp.i, align 8
%call.i.i28 = call double @carg(double %_M_value.real.i.i25, double %_M_value.imag.i.i27) #11
%call.i29 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %call.i21, double %call.i.i28) #11
%call1.i31 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) %call.i29, i8* nonnull getelementptr inbounds ([5 x i8], [5 x i8]* @.str.3, i64 0, i64 0), i64 4) #11
%call1.i33 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) @_ZSt4cout, i8* nonnull getelementptr inbounds ([25 x i8], [25 x i8]* @.str.4, i64 0, i64 0), i64 24) #11
%_M_value.real.i.i.i = load double, double* %_M_value.realp.i, align 8
%_M_value.imag.i.i.i = load double, double* %_M_value.imagp.i, align 8
%call.i.i.i = call double @cabs(double %_M_value.real.i.i.i, double %_M_value.imag.i.i.i) #11
%mul.i = fmul double %call.i.i.i, %call.i.i.i
%call.i34 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull @_ZSt4cout, double %mul.i) #11
%1 = bitcast %"class.std::basic_ostream"* %call.i34 to i8**
%vtable.i36 = load i8*, i8** %1, align 8, !tbaa !6
%vbase.offset.ptr.i37 = getelementptr i8, i8* %vtable.i36, i64 -24
%2 = bitcast i8* %vbase.offset.ptr.i37 to i64*
%vbase.offset.i38 = load i64, i64* %2, align 8
%3 = bitcast %"class.std::basic_ostream"* %call.i34 to i8*
%add.ptr.i39 = getelementptr inbounds i8, i8* %3, i64 %vbase.offset.i38
%_M_ctype.i50 = getelementptr inbounds i8, i8* %add.ptr.i39, i64 240
%4 = bitcast i8* %_M_ctype.i50 to %"class.std::ctype"**
%5 = load %"class.std::ctype"*, %"class.std::ctype"** %4, align 8, !tbaa !8
%tobool.not.i65 = icmp eq %"class.std::ctype"* %5, null
br i1 %tobool.not.i65, label %if.then.i66, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit68
if.then.i66: ; preds = %entry
call void @_ZSt16__throw_bad_castv() #12
unreachable
_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit68: ; preds = %entry
%_M_widen_ok.i52 = getelementptr inbounds %"class.std::ctype", %"class.std::ctype"* %5, i64 0, i32 8
%6 = load i8, i8* %_M_widen_ok.i52, align 8, !tbaa !12
%tobool.not.i53 = icmp eq i8 %6, 0
br i1 %tobool.not.i53, label %if.end.i59, label %if.then.i55
if.then.i55: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit68
%arrayidx.i54 = getelementptr inbounds %"class.std::ctype", %"class.std::ctype"* %5, i64 0, i32 9, i64 10
%7 = load i8, i8* %arrayidx.i54, align 1, !tbaa !14
br label %_ZNKSt5ctypeIcE5widenEc.exit61
if.end.i59: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit68
call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* nonnull %5) #11
%8 = bitcast %"class.std::ctype"* %5 to i8 (%"class.std::ctype"*, i8)***
%vtable.i56 = load i8 (%"class.std::ctype"*, i8)**, i8 (%"class.std::ctype"*, i8)*** %8, align 8, !tbaa !6
%vfn.i57 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)*, i8 (%"class.std::ctype"*, i8)** %vtable.i56, i64 6
%9 = load i8 (%"class.std::ctype"*, i8)*, i8 (%"class.std::ctype"*, i8)** %vfn.i57, align 8
%call.i58 = call signext i8 %9(%"class.std::ctype"* nonnull %5, i8 signext 10) #11
br label %_ZNKSt5ctypeIcE5widenEc.exit61
_ZNKSt5ctypeIcE5widenEc.exit61: ; preds = %if.then.i55, %if.end.i59
%retval.0.i60 = phi i8 [ %7, %if.then.i55 ], [ %call.i58, %if.end.i59 ]
%call1.i41 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* nonnull %call.i34, i8 signext %retval.0.i60) #11
%call.i42 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* nonnull %call1.i41) #11
%10 = bitcast %"struct.std::complex"* %dc to i8*
call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %10) #11
%11 = bitcast double* %x to i8*
call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(16) %10, i8 0, i64 16, i1 false)
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %11) #11
store double -3.000000e+00, double* %x, align 8, !tbaa !2
%12 = bitcast double* %dx to i8*
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %12) #11
store double 0.000000e+00, double* %dx, align 8, !tbaa !2
%13 = load i32, i32* @enzyme_dup, align 4, !tbaa !15
%14 = load double, double* %x, align 8, !tbaa !2
%15 = fadd fast double %14, %14
%16 = load double, double* %dx, align 8
%17 = fadd fast double %16, %15
store double %17, double* %dx, align 8
%18 = load i32, i32* @enzyme_dup, align 4, !tbaa !15
call void (...) @_Z17__enzyme_autodiffz(double (%"struct.std::complex"*)* nonnull @_Z1hRKSt7complexIdE, i32 %18, %"struct.std::complex"* nonnull %mycomplex, %"struct.std::complex"* nonnull %dc) #11
%call1.i46 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) @_ZSt4cout, i8* nonnull getelementptr inbounds ([30 x i8], [30 x i8]* @.str.5, i64 0, i64 0), i64 29) #11
%call14 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZStlsIdcSt11char_traitsIcEERSt13basic_ostreamIT0_T1_ES6_RKSt7complexIT_E(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) @_ZSt4cout, %"struct.std::complex"* nonnull align 8 dereferenceable(16) %dc) #11
%19 = bitcast %"class.std::basic_ostream"* %call14 to i8**
%vtable.i = load i8*, i8** %19, align 8, !tbaa !6
%vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24
%20 = bitcast i8* %vbase.offset.ptr.i to i64*
%vbase.offset.i = load i64, i64* %20, align 8
%21 = bitcast %"class.std::basic_ostream"* %call14 to i8*
%add.ptr.i = getelementptr inbounds i8, i8* %21, i64 %vbase.offset.i
%_M_ctype.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 240
%22 = bitcast i8* %_M_ctype.i to %"class.std::ctype"**
%23 = load %"class.std::ctype"*, %"class.std::ctype"** %22, align 8, !tbaa !8
%tobool.not.i62 = icmp eq %"class.std::ctype"* %23, null
br i1 %tobool.not.i62, label %if.then.i63, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
if.then.i63: ; preds = %_ZNKSt5ctypeIcE5widenEc.exit61
call void @_ZSt16__throw_bad_castv() #12
unreachable
_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit: ; preds = %_ZNKSt5ctypeIcE5widenEc.exit61
%_M_widen_ok.i = getelementptr inbounds %"class.std::ctype", %"class.std::ctype"* %23, i64 0, i32 8
%24 = load i8, i8* %_M_widen_ok.i, align 8, !tbaa !12
%tobool.not.i = icmp eq i8 %24, 0
br i1 %tobool.not.i, label %if.end.i, label %if.then.i
if.then.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
%arrayidx.i = getelementptr inbounds %"class.std::ctype", %"class.std::ctype"* %23, i64 0, i32 9, i64 10
%25 = load i8, i8* %arrayidx.i, align 1, !tbaa !14
br label %_ZNKSt5ctypeIcE5widenEc.exit
if.end.i: ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* nonnull %23) #11
%26 = bitcast %"class.std::ctype"* %23 to i8 (%"class.std::ctype"*, i8)***
%vtable.i48 = load i8 (%"class.std::ctype"*, i8)**, i8 (%"class.std::ctype"*, i8)*** %26, align 8, !tbaa !6
%vfn.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)*, i8 (%"class.std::ctype"*, i8)** %vtable.i48, i64 6
%27 = load i8 (%"class.std::ctype"*, i8)*, i8 (%"class.std::ctype"*, i8)** %vfn.i, align 8
%call.i49 = call signext i8 %27(%"class.std::ctype"* nonnull %23, i8 signext 10) #11
br label %_ZNKSt5ctypeIcE5widenEc.exit
_ZNKSt5ctypeIcE5widenEc.exit: ; preds = %if.then.i, %if.end.i
%retval.0.i = phi i8 [ %25, %if.then.i ], [ %call.i49, %if.end.i ]
%call1.i = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* nonnull %call14, i8 signext %retval.0.i) #11
%call.i = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* nonnull %call1.i) #11
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %12) #11
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %11) #11
call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %10) #11
call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %0) #11
ret i32 0
}
declare dso_local nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZStlsIdcSt11char_traitsIcEERSt13basic_ostreamIT0_T1_ES6_RKSt7complexIT_E(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8), %"struct.std::complex"* nonnull align 8 dereferenceable(16)) local_unnamed_addr #0
declare dso_local void @_Z17__enzyme_autodiffz(...) local_unnamed_addr #0
; Function Attrs: nofree nounwind
declare dso_local double @cabs(double, double) local_unnamed_addr #7
declare dso_local nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8), i8*, i64) local_unnamed_addr #0
declare dso_local nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) local_unnamed_addr #0
; Function Attrs: nounwind
declare dso_local double @carg(double, double) local_unnamed_addr #1
declare dso_local nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"*, i8 signext) local_unnamed_addr #0
declare dso_local nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"*) local_unnamed_addr #0
; Function Attrs: noreturn
declare dso_local void @_ZSt16__throw_bad_castv() local_unnamed_addr #8
declare dso_local void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"*) local_unnamed_addr #0
; Function Attrs: nounwind uwtable
define internal void @_GLOBAL__sub_I_complex.cpp() #9 section ".text.startup" {
entry:
tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* nonnull @_ZStL8__ioinit) #11
%0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init", %"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* nonnull @__dso_handle) #11
ret void
}
; Function Attrs: argmemonly nounwind willreturn writeonly
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #10
; Function Attrs: nounwind readonly uwtable
define dso_local double @preprocess__Z2h2RKd(double* nocapture nonnull readonly align 8 dereferenceable(8) %c) #5 {
entry:
%0 = load double, double* %c, align 8, !tbaa !2
%mul = fmul double %0, %0
ret double %mul
}
; Function Attrs: nounwind uwtable
define internal void @diffe_Z2h2RKd(double* nocapture nonnull readonly align 8 dereferenceable(8) %c, double* nocapture %"c'", double %differeturn) #9 {
entry:
%0 = load double, double* %c, align 8, !tbaa !2
%m0diffe = fmul fast double %differeturn, %0
%1 = fadd fast double %m0diffe, %m0diffe
%2 = load double, double* %"c'", align 8
%3 = fadd fast double %2, %1
store double %3, double* %"c'", align 8
ret void
}
; Function Attrs: nofree nounwind uwtable
define dso_local double @preprocess__Z1hRKSt7complexIdE(%"struct.std::complex"* nocapture nonnull readonly align 8 dereferenceable(16) %c) #3 {
entry:
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %c, i64 0, i32 0, i32 0
%_M_value.real.i.i = load double, double* %_M_value.realp.i.i, align 8
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %c, i64 0, i32 0, i32 1
%_M_value.imag.i.i = load double, double* %_M_value.imagp.i.i, align 8
%call.i.i = tail call double @cabs(double %_M_value.real.i.i, double %_M_value.imag.i.i) #11
%mul = fmul double %call.i.i, %call.i.i
ret double %mul
}
; Function Attrs: nofree nounwind uwtable
define internal void @diffe_Z1hRKSt7complexIdE(%"struct.std::complex"* nocapture nonnull readonly align 8 dereferenceable(16) %c, %"struct.std::complex"* nocapture %"c'", double %differeturn) #3 {
entry:
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %c, i64 0, i32 0, i32 0
%_M_value.real.i.i = load double, double* %_M_value.realp.i.i, align 8
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %c, i64 0, i32 0, i32 1
%_M_value.imag.i.i = load double, double* %_M_value.imagp.i.i, align 8
%call.i.i = tail call double @cabs(double %_M_value.real.i.i, double %_M_value.imag.i.i) #11
%mul_replacementA = phi double
br label %invertentry
allocsForInversion: ; No predecessors!
%"mul'de" = alloca double, align 8
store double 0.000000e+00, double* %"mul'de", align 8
%"call.i.i'de" = alloca double, align 8
store double 0.000000e+00, double* %"call.i.i'de", align 8
invertentry: ; preds = %entry
store double %differeturn, double* %"mul'de", align 8
%0 = load double, double* %"mul'de", align 8
%m0diffecall.i.i = fmul fast double %0, %call.i.i
%m1diffecall.i.i = fmul fast double %0, %call.i.i
store double 0.000000e+00, double* %"mul'de", align 8
%1 = load double, double* %"call.i.i'de", align 8
%2 = fadd fast double %1, %m0diffecall.i.i
store double %2, double* %"call.i.i'de", align 8
%3 = load double, double* %"call.i.i'de", align 8
%4 = fadd fast double %3, %m1diffecall.i.i
store double %4, double* %"call.i.i'de", align 8
}
attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nofree nounwind }
attributes #3 = { nofree nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { argmemonly nounwind willreturn }
attributes #5 = { nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #6 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #7 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #8 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #9 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #10 = { argmemonly nounwind willreturn writeonly }
attributes #11 = { nounwind }
attributes #12 = { noreturn nounwind }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 11.1.0"}
!2 = !{!3, !3, i64 0}
!3 = !{!"double", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C++ TBAA"}
!6 = !{!7, !7, i64 0}
!7 = !{!"vtable pointer", !5, i64 0}
!8 = !{!9, !10, i64 240}
!9 = !{!"_ZTSSt9basic_iosIcSt11char_traitsIcEE", !10, i64 216, !4, i64 224, !11, i64 225, !10, i64 232, !10, i64 240, !10, i64 248, !10, i64 256}
!10 = !{!"any pointer", !4, i64 0}
!11 = !{!"bool", !4, i64 0}
!12 = !{!13, !4, i64 56}
!13 = !{!"_ZTSSt5ctypeIcE", !10, i64 16, !11, i64 24, !10, i64 32, !10, i64 40, !10, i64 48, !4, i64 56, !4, i64 57, !4, i64 313, !4, i64 569}
!14 = !{!4, !4, i64 0}
!15 = !{!16, !16, i64 0}
!16 = !{!"int", !4, i64 0}
; Function Attrs: nofree nounwind
declare dso_local double @cabs(double, double) local_unnamed_addr #7
clang-11: ../Enzyme/EnzymeLogic.cpp:1181: const AugmentedReturn& CreateAugmentedPrimal(llvm::Function*, DIFFE_TYPE, const std::vector<DIFFE_TYPE>&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, const FnTypeInfo&, std::map<llvm::Argument*, bool>, bool, bool, bool, bool): Assertion `0 && "attempting to differentiate function without definition"' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0. Program arguments: /home/username/usrlocal/bin/clang-11 -cc1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -main-file-name complex.cpp -mrelocation-model static -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /home/username/usrlocal/lib/clang/11.1.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /home/username/usrlocal/lib/clang/11.1.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -fdeprecated-macro -fdebug-compilation-dir /home/username/testenzyme -ferror-limit 19 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -load /usr/local/lib/ClangEnzyme-11.so -faddrsig -o /tmp/complex-7d9a21.o -x c++ complex.cpp
1. <eof> parser at end of file
2. Per-module optimization passes
3. Running pass 'Enzyme Pass' on module 'complex.cpp'.
#0 0x0000557a0430998a llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/home/username/usrlocal/bin/clang-11+0x1d0f98a)
#1 0x0000557a04307654 llvm::sys::RunSignalHandlers() (/home/username/usrlocal/bin/clang-11+0x1d0d654)
#2 0x0000557a043077a3 SignalHandler(int) (/home/username/usrlocal/bin/clang-11+0x1d0d7a3)
#3 0x00007f1bd04f3980 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12980)
#4 0x00007f1bcf1a4fb7 raise /build/glibc-S9d2JN/glibc-2.27/signal/../sysdeps/unix/sysv/linux/raise.c:51:0
#5 0x00007f1bcf1a6921 abort /build/glibc-S9d2JN/glibc-2.27/stdlib/abort.c:81:0
#6 0x00007f1bcf19648a __assert_fail_base /build/glibc-S9d2JN/glibc-2.27/assert/assert.c:89:0
#7 0x00007f1bcf196502 (/lib/x86_64-linux-gnu/libc.so.6+0x30502)
#8 0x00007f1bcec77ef4 CreateAugmentedPrimal(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, bool, bool, bool, bool) (/usr/local/lib/ClangEnzyme-11.so+0x445ef4)
#9 0x00007f1bcece1aeb AdjointGenerator<AugmentedReturn const*>::visitCallInst(llvm::CallInst&) (/usr/local/lib/ClangEnzyme-11.so+0x4afaeb)
#10 0x00007f1bcecd8a2b llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::delegateCallInst(llvm::CallInst&) (/usr/local/lib/ClangEnzyme-11.so+0x4a6a2b)
#11 0x00007f1bcecb91b3 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visitCall(llvm::CallInst&) (/usr/local/lib/ClangEnzyme-11.so+0x4871b3)
#12 0x00007f1bcecab549 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction&) (/usr/local/lib/ClangEnzyme-11.so+0x479549)
#13 0x00007f1bcec9fe3d llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction*) (/usr/local/lib/ClangEnzyme-11.so+0x46de3d)
#14 0x00007f1bcec81ae3 CreatePrimalAndGradient(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, bool, bool, llvm::Type*, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, AugmentedReturn const*, bool, bool, bool) (/usr/local/lib/ClangEnzyme-11.so+0x44fae3)
#15 0x00007f1bcec5b378 bool (anonymous namespace)::Enzyme::HandleAutoDiff<llvm::CallInst>(llvm::CallInst*, llvm::TargetLibraryInfo&, llvm::AAResults&, bool) (/usr/local/lib/ClangEnzyme-11.so+0x429378)
#16 0x00007f1bcec574d2 (anonymous namespace)::Enzyme::lowerEnzymeCalls(llvm::Function&, bool, bool&, std::set<llvm::Function*, std::less<llvm::Function*>, std::allocator<llvm::Function*> >&) (/usr/local/lib/ClangEnzyme-11.so+0x4254d2)
#17 0x00007f1bcec57dfb (anonymous namespace)::Enzyme::runOnModule(llvm::Module&) (/usr/local/lib/ClangEnzyme-11.so+0x425dfb)
#18 0x0000557a03cc9a81 llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/username/usrlocal/bin/clang-11+0x16cfa81)
#19 0x0000557a04589c54 (anonymous namespace)::EmitAssemblyHelper::EmitAssembly(clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/username/usrlocal/bin/clang-11+0x1f8fc54)
#20 0x0000557a0458b6f4 clang::EmitBackendOutput(clang::DiagnosticsEngine&, clang::HeaderSearchOptions const&, clang::CodeGenOptions const&, clang::TargetOptions const&, clang::LangOptions const&, llvm::DataLayout const&, llvm::Module*, clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/username/usrlocal/bin/clang-11+0x1f916f4)
#21 0x0000557a05133df5 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/home/username/usrlocal/bin/clang-11+0x2b39df5)
#22 0x0000557a05cb7fe9 clang::ParseAST(clang::Sema&, bool, bool) (/home/username/usrlocal/bin/clang-11+0x36bdfe9)
#23 0x0000557a05133fa8 clang::CodeGenAction::ExecuteAction() (/home/username/usrlocal/bin/clang-11+0x2b39fa8)
#24 0x0000557a04b1bd39 clang::FrontendAction::Execute() (/home/username/usrlocal/bin/clang-11+0x2521d39)
#25 0x0000557a04ad667a clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/home/username/usrlocal/bin/clang-11+0x24dc67a)
#26 0x0000557a04be6486 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/home/username/usrlocal/bin/clang-11+0x25ec486)
#27 0x0000557a032070fc cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/home/username/usrlocal/bin/clang-11+0xc0d0fc)
#28 0x0000557a03202479 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) (/home/username/usrlocal/bin/clang-11+0xc08479)
#29 0x0000557a03185fd4 main (/home/username/usrlocal/bin/clang-11+0xb8bfd4)
#30 0x00007f1bcf187bf7 __libc_start_main /build/glibc-S9d2JN/glibc-2.27/csu/../csu/libc-start.c:344:0
#31 0x0000557a03201c4a _start (/home/username/usrlocal/bin/clang-11+0xc07c4a)
clang-11: error: unable to execute command: Aborted (core dumped)
clang-11: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 11.1.0
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/local/bin
clang-11: note: diagnostic msg:
********************
PLEASE ATTACH THE FOLLOWING FILES TO THE BUG REPORT:
Preprocessed source(s) and associated run script(s) are located at:
clang-11: note: diagnostic msg: /tmp/complex-8fd898.cpp
clang-11: note: diagnostic msg: /tmp/complex-8fd898.sh
clang-11: note: diagnostic msg:
********************
The Yggdrasil audit points out that we currently don't set the SONAME of the shared library, and this causes issues when developing Enzyme.jl on a dev build of Enzyme. It can be set manually with:
patchelf --set-soname /path/to/LLVMEnzyme-9.so LLVMEnzyme-9.so
Hello,
Compilation fails when the differentiated function calls printf with a string of length 1 and that function is reached through a wrapper.
Clang version 11.1.0
Enzyme: a recent build from a few days ago (commit ae09f3d).
bugprintf.cpp
#include <iostream>
using namespace std;
// Marker global: its value is passed in front of each (pointer, pointer) argument
// pair in the __enzyme_autodiff call below.
int enzyme_dup;
// Variadic declaration of the autodiff entry point; no definition appears in this file.
void __enzyme_autodiff(...);
// Forward declaration of the generic compute entry point.
template <typename T>
void compute( double* x, double* out );
// Primary template: simply forwards to the static member T::comp(x, out).
// An explicit specialization for Fun2 is provided later in this file.
template< typename T>
void compute( double* x, double* out)
{
T::comp(x,out);
}
// Invokes __enzyme_autodiff on compute<T>, passing (x, res) and (&out, &dout)
// each preceded by the enzyme_dup marker.
// dout is initialised to 1.0 and out to 0.0 before the call.
// NOTE(review): presumably res receives the derivative of out with respect to x,
// per Enzyme's enzyme_dup convention — confirm against the Enzyme documentation.
template<typename T>
void compute_d(double* x, double* res)
{
double out = 0.0;
double dout = 1.0;
__enzyme_autodiff(compute<T >,
enzyme_dup, x,res,
enzyme_dup,&out,&dout);
}
// Thin wrapper class: its static comp() forwards to compute_d<T>, so the
// __enzyme_autodiff call is reached through this extra template layer.
template<typename T>
class D
{
public:
D()
{}
// Forwards (v, out) to compute_d<T>.
static void comp(double* v, double* out )
{
compute_d< T >(v,out);
}
};
class Fun2{};
// Explicit specialization of compute for Fun2: prints two strings, then
// stores 3 * x[0] * x[0] into *out. This is the function handed to
// __enzyme_autodiff via compute_d<Fun2>.
template<>
void compute<Fun2>( double* x, double* out )
{
printf("a");// a single-character string makes the compiler crash
printf("ab");// but a multi-character printf works fine
*out = 3*x[0] * x[0] ;
}
// Driver for the reproducer: allocates a one-element input with x[0] = 5.0,
// runs D<Fun2>::comp(x, &g) and prints g.
int main(int argc, char** argv )
{
cout<<"bug printf "<<endl;
const int d = 1;
double* x = new double[d] ;
x[0] = 5.0;
// The call below makes the compiler crash when the single-character printf is present in compute<Fun2>
double g=0.0;
D<Fun2>::comp(x, &g);
cout << "g " << g << endl ;
}
Compilation with :
clang bugprintf.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o bugprintf -fno-exceptions
Here is the output of the compilation : (as a file to keep the thread readable)
bugprintf.log
Hi, how can I get the torch_enzyme and tf_enzyme described in the paper? It seems that they have not been open-sourced?
I had to do some gymnastics with the linker to find enzyme in the system search path. I don't know much about the LLVM build system, but
if (UNIX)
set(CMAKE_SHARED_LIBRARY_PREFIX "lib")
endif()
had no effect.
The instruction
W[indj] += wjk;
makes the compiler crash.
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <random>
#include <math.h>
#include <vector>
#include <algorithm>
using namespace std;
// Cell index over n particles, filled by buildIndex().
struct Index
{
int* cellId;     // cellId[i]: cell id of particle i (its first coordinate converted to int)
int* start;      // start[c]: position in argsorted where the c-th cell begins
int* cellSize;   // cellSize[c]: number of particles in the c-th cell
int size;        // number of distinct cells (valid entries in start/cellSize)
int* argsorted;  // particle indices ordered by (cell id, particle index)
int n;           // number of particles
} ;
// Builds a cell index over the n particles stored in `parts`.
//
// parts  : flat coordinate array with stride 3; the first coordinate of
//          particle i (parts[3*i]), converted to int, is used as its cell id.
// index  : output; all four arrays are allocated here with new[] (length n)
//          and are never freed by this function.
//
// After the call, index.argsorted lists the particle indices sorted by
// (cell id, particle index); for each distinct cell c in [0, index.size),
// index.start[c] is its first position in argsorted and index.cellSize[c]
// its number of particles.
//
// A new cell is detected by comparing with the previous sorted entry instead
// of a -1 sentinel, so a genuine cell id of -1 no longer causes a write to
// index.cellSize[-1].
void buildIndex( Index& index , double * parts, int n )
{
    const int d = 3;
    index.n = n;
    index.cellId = new int[n];
    index.start = new int[n];
    index.cellSize = new int[n]; //Max Size is n but the end may be unused
    index.argsorted = new int[n];

    // (cell id, particle index) pairs; sorting them groups particles by cell.
    std::vector<std::pair<int,int> > v(n);
    for( int i = 0 ; i < n ; i++)
    {
        const int id = parts[d*i];
        index.cellId[i] = id;
        v[i].first = id;
        v[i].second = i;
    }
    std::sort( v.begin(), v.end() );

    int cur = -1; // index of the cell currently being filled
    for( int i = 0 ; i < n ; i++)
    {
        index.argsorted[i] = v[i].second;
        if( i > 0 && v[i].first == v[i-1].first )
        {
            // Same cell as the previous sorted entry: extend it.
            index.cellSize[cur]++;
        }
        else
        {
            // First entry overall, or the cell id changed: open a new cell.
            cur ++;
            index.cellSize[cur] = 1;
            index.start[cur] = i;
        }
    }
    index.size = cur+1;
}
// Function handed to __enzyme_autodiff in main().
//
// parts : flat coordinate array with stride d = 3.
// n     : number of particles; sizes the local arrays F (n*d) and W (n).
// index : cell index (see buildIndex) giving, per cell, the particle indices it contains.
//
// In this variant only the W accumulation is active: for every cell and every
// ordered pair (j, k) of particles in that cell, wjk = 1 + squared distance is
// added to W[indj]. The F accumulation, the normalisation and the energy sum
// are commented out, so `out` is never modified and the function returns 0.
double foo( double* __restrict__ parts,int n, Index* __restrict__ index)
{
double out = 0;
const int d = 3;
// Variable-length local arrays, zeroed by the loop below.
double F[n*d];
double W[n];
for( int i = 0 ; i < n ; i++)
{
for( int j = 0 ; j < d ; j++)
{
F[i*d+j] = 0.0;
}
W[i] = 0.0;
}
// i: cell, j/k: positions inside the cell.
for( int i = 0 ; i < index->size ; i++)
{
for( int j = 0 ; j < index->cellSize[i] ; j++ )
{
for( int k = 0 ; k < index->cellSize[i] ; k++ )
{
// Map in-cell positions to particle indices.
int indj = index->argsorted[index->start[i]+j];
int indk = index->argsorted[index->start[i]+k];
// djk: squared distance between particles indj and indk.
double djk = 0;
for( int l = 0 ; l < d ; l++)
{
double temp;
temp = parts[indj * d +l ]- parts[indk * d +l ];
djk += temp*temp;
}
//out += djk;
double wjk = 1.0+djk; // strictly positive
/*
for( int l = 0 ; l < d ; l++)
{
F[indj*d+l] += wjk * parts[indk*d+l];
}*/
// This accumulation is the statement reported to crash the compiler.
W[indj] += wjk;
}
}
}
/*
//Normalize the field value
for( int i = 0 ; i < n ; i++)
{
for( int j = 0 ; j < d ; j++)
{
F[i*d+j] /= W[i*d+j];
}
}
*/
/*
//Compute the energy
for( int i = 0 ; i < n ; i++)
{
double e = 0.0;
for( int j = 0 ; j < d ; j++)
{
out += F[i*d+j]*F[i*d+j];
}
}
*/
//delete[] F;
//delete[] W;
return out;
}
// Marker globals: their values are passed in front of arguments in the
// __enzyme_autodiff call in main() (enzyme_dup before the x/d_x pair,
// enzyme_const before n and before &index). enzyme_out is not used in this file.
int enzyme_dup;
int enzyme_out;
int enzyme_const;
// Signature of the function being differentiated (matches foo).
typedef double (*f_ptr)(double *,int,Index*);
// Autodiff entry point, declared only; the parameter list mirrors the call in
// main(): marker, x, d_x, marker, n, marker, index.
extern double __enzyme_autodiff(f_ptr,
int, double *, double *,
int, int,
int, Index*);
// Driver for the reproducer: fills n = 100000 particles (3 coordinates each)
// with uniform random values in [0, 10) from a fixed-seed mt19937, builds the
// cell index, prints the first 100 cell ids, runs __enzyme_autodiff on foo and
// prints the first 100 rows of d_x.
int main() {
std::mt19937 e2(42);
std::uniform_real_distribution<> dist(0, 10);
int n = 100000;
int d = 3;
double* x = new double[n*d];
// d_x is passed next to x after enzyme_dup; it is zeroed before the call.
double* d_x = new double[n*d];
for( int i = 0 ; i < n*d ; i++)
{
x[i] = dist(e2);
d_x[i] = 0.0;
}
Index index;
buildIndex(index, x, n);
for( int i = 0 ; i < 100 ; i++)
{
printf("cellId[%d] = %d\n ",i, index.cellId[i]);
}
printf("before autodiff\n");
// The return value of __enzyme_autodiff is ignored here.
__enzyme_autodiff(foo,
enzyme_dup, x, d_x,
enzyme_const, n,
enzyme_const, &index);
//printf("%f \n", y);
for( int i = 0 ; i < 100 ; i++)
{
printf("dx[%d] = [%f, %f, %f]\n ",i, d_x[d*i],d_x[d*i+1],d_x[d*i+2]);
}
}
compiled with :
clang test2.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-7.so -O2 -o test2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -fno-exceptions
While deleting: i32 %_unwrap12
An asserting value handle still pointed to this value!
UNREACHABLE executed at /home/username/Enzyme/llvm/lib/IR/Value.cpp:887!
Stack dump:
0. Program arguments: /home/username/usrlocal/bin/clang-7 -cc1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -main-file-name test2.cpp -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -resource-dir /home/username/usrlocal/lib/clang/7.1.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /home/username/usrlocal/lib/clang/7.1.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -fdeprecated-macro -fdebug-compilation-dir /home/username/testenzyme -ferror-limit 19 -fmessage-length 80 -fno-unroll-loops -fobjc-runtime=gcc -fdiagnostics-show-option -fcolor-diagnostics -load /usr/local/lib/ClangEnzyme-7.so -o /tmp/test2-6e834d.o -x c++ test2.cpp -faddrsig
1. <eof> parser at end of file
2. Per-module optimization passes
3. Running pass 'Enzyme Pass' on module 'test2.cpp'.
#0 0x000055d999b1537a llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/home/username/usrlocal/bin/clang-7+0x170637a)
#1 0x000055d999b137d4 llvm::sys::RunSignalHandlers() (/home/username/usrlocal/bin/clang-7+0x17047d4)
#2 0x000055d999b13912 SignalHandler(int) (/home/username/usrlocal/bin/clang-7+0x1704912)
#3 0x00007f2cc6eab980 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12980)
#4 0x00007f2cc5b5cfb7 gsignal /build/glibc-S9d2JN/glibc-2.27/signal/../sysdeps/unix/sysv/linux/raise.c:51:0
#5 0x00007f2cc5b5e921 abort /build/glibc-S9d2JN/glibc-2.27/stdlib/abort.c:81:0
#6 0x000055d999ab8e1a (/home/username/usrlocal/bin/clang-7+0x16a9e1a)
#7 0x000055d999678d22 llvm::ValueHandleBase::ValueIsDeleted(llvm::Value*) (/home/username/usrlocal/bin/clang-7+0x1269d22)
#8 0x000055d99967993d llvm::Value::~Value() (/home/username/usrlocal/bin/clang-7+0x126a93d)
#9 0x000055d999679a20 llvm::Value::deleteValue() (/home/username/usrlocal/bin/clang-7+0x126aa20)
#10 0x000055d999604d24 llvm::Instruction::eraseFromParent() (/home/username/usrlocal/bin/clang-7+0x11f5d24)
#11 0x000055d99991d3ec llvm::GVN::processBlock(llvm::BasicBlock*) (/home/username/usrlocal/bin/clang-7+0x150e3ec)
#12 0x000055d99991d87f llvm::GVN::iterateOnFunction(llvm::Function&) (/home/username/usrlocal/bin/clang-7+0x150e87f)
#13 0x000055d99991da2f llvm::GVN::runImpl(llvm::Function&, llvm::AssumptionCache&, llvm::DominatorTree&, llvm::TargetLibraryInfo const&, llvm::AAResults&, llvm::MemoryDependenceResults*, llvm::LoopInfo*, llvm::OptimizationRemarkEmitter*) (/home/username/usrlocal/bin/clang-7+0x150ea2f)
#14 0x000055d99991e5ea llvm::GVN::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/username/usrlocal/bin/clang-7+0x150f5ea)
#15 0x00007f2cc56de0d5 optimizeIntermediate(GradientUtils*, bool, llvm::Function*) (/usr/local/lib/ClangEnzyme-7.so+0x4db0d5)
#16 0x00007f2cc563c857 CreatePrimalAndGradient(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, bool, bool, llvm::Type*, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, AugmentedReturn const*, bool, bool, bool) (/usr/local/lib/ClangEnzyme-7.so+0x439857)
#17 0x00007f2cc5618065 bool HandleAutoDiff<llvm::CallInst>(llvm::CallInst*, llvm::TargetLibraryInfo&, llvm::AAResults&, bool) (/usr/local/lib/ClangEnzyme-7.so+0x415065)
#18 0x00007f2cc56103ad (anonymous namespace)::Enzyme::lowerEnzymeCalls(llvm::Function&, bool, bool&, std::set<llvm::Function*, std::less<llvm::Function*>, std::allocator<llvm::Function*> >&) (/usr/local/lib/ClangEnzyme-7.so+0x40d3ad)
#19 0x00007f2cc5610e12 (anonymous namespace)::Enzyme::runOnModule(llvm::Module&) (/usr/local/lib/ClangEnzyme-7.so+0x40de12)
#20 0x000055d999636a68 llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/username/usrlocal/bin/clang-7+0x1227a68)
#21 0x000055d999d1e34c clang::EmitBackendOutput(clang::DiagnosticsEngine&, clang::HeaderSearchOptions const&, clang::CodeGenOptions const&, clang::TargetOptions const&, clang::LangOptions const&, llvm::DataLayout const&, llvm::Module*, clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/username/usrlocal/bin/clang-7+0x190f34c)
#22 0x000055d99a50e3b8 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/home/username/usrlocal/bin/clang-7+0x20ff3b8)
#23 0x000055d99add1e49 clang::ParseAST(clang::Sema&, bool, bool) (/home/username/usrlocal/bin/clang-7+0x29c2e49)
#24 0x000055d99a50cff8 clang::CodeGenAction::ExecuteAction() (/home/username/usrlocal/bin/clang-7+0x20fdff8)
#25 0x000055d99a18a01e clang::FrontendAction::Execute() (/home/username/usrlocal/bin/clang-7+0x1d7b01e)
#26 0x000055d99a14f32e clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/home/username/usrlocal/bin/clang-7+0x1d4032e)
#27 0x000055d99a22d00b clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/home/username/usrlocal/bin/clang-7+0x1e1e00b)
#28 0x000055d998d4beb8 cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/home/username/usrlocal/bin/clang-7+0x93ceb8)
#29 0x000055d998cfde2d main (/home/username/usrlocal/bin/clang-7+0x8eee2d)
#30 0x00007f2cc5b3fbf7 __libc_start_main /build/glibc-S9d2JN/glibc-2.27/csu/../csu/libc-start.c:344:0
#31 0x000055d998d47c1a _start (/home/username/usrlocal/bin/clang-7+0x938c1a)
clang-7: error: unable to execute command: Aborted (core dumped)
clang-7: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 7.1.0
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/local/bin
clang-7: note: diagnostic msg: PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
clang-7: note: diagnostic msg:
********************
PLEASE ATTACH THE FOLLOWING FILES TO THE BUG REPORT:
Preprocessed source(s) and associated run script(s) are located at:
clang-7: note: diagnostic msg: /tmp/test2-3778d1.cpp
clang-7: note: diagnostic msg: /tmp/test2-3778d1.sh
clang-7: note: diagnostic msg:
The preprocessed source mentioned in the diagnostic message, /tmp/test2-3778d1.cpp, is 3.7 MB; I can attach it if necessary.
//#include "XSbench_header.cuh"
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <stdint.h>
#include <string.h>
// Structures
// One point of a nuclide grid. Only energy and total_xs are kept in this
// reduced version; the remaining cross-section fields are commented out.
typedef struct{
double energy;
double total_xs;
//double elastic_xs;
//double absorbtion_xs;
//double fission_xs;
} NuclideGridPoint;
// Run parameters passed by value to the lookup kernel.
typedef struct{
long n_isotopes;   // number of isotopes; used as a row stride into index_data
long n_gridpoints; // grid points per isotope; used as a row stride into nuclide_grids
int grid_type; // 0: Unionized Grid (default) 1: Nuclide Grid
} Inputs;
// Bundle of the simulation arrays together with their lengths.
typedef struct{
int * num_nucs; // Length = length_num_nucs;
int * mats; // Length = length_mats
double * unionized_energy_array; // Length = length_unionized_energy_array
int * index_grid; // Length = length_index_grid
NuclideGridPoint * nuclide_grid; // Length = length_nuclide_grid
// Passed right after nuclide_grid in the enzyme_dup argument pair of the
// __enzyme_autodiff call; presumably the same length as nuclide_grid — TODO confirm.
NuclideGridPoint * d_nuclide_grid;
int length_num_nucs;
int length_mats;
int length_unionized_energy_array;
long length_index_grid;
int length_nuclide_grid;
int max_num_nucs;
} SimulationData;
// Grid types
#define UNIONIZED 0
#define NUCLIDE 1
#define HASH 2
// Simulation types
#define HISTORY_BASED 1
#define EVENT_BASED 2
// Binary Mode Type
#define NONE 0
#define READ 1
#define WRITE 2
// Starting Seed
#define STARTING_SEED 1070
// Three-way comparator over doubles with the (const void*, const void*)
// signature expected by qsort/bsearch.
// Returns -1 if *a < *b, 1 if *a > *b, and 0 otherwise.
int double_compare(const void * a, const void * b)
{
    const double lhs = *static_cast<const double *>(a);
    const double rhs = *static_cast<const double *>(b);

    if( lhs < rhs )
        return -1;
    return ( lhs > rhs ) ? 1 : 0;
}
int NGP_compare(const void * a, const void * b)
{
NuclideGridPoint A = *((NuclideGridPoint *) a);
NuclideGridPoint B = *((NuclideGridPoint *) b);
if( A.energy > B.energy )
return 1;
else if( A.energy < B.energy )
return -1;
else
return 0;
}
// num_nucs represents the number of nuclides that each material contains
// Allocates (with malloc) and returns a 12-entry array holding the number of
// nuclides in each material. Entry 0 (fuel) is 34 when n_isotopes == 68 and
// 321 otherwise; entries 1..11 are fixed. The caller owns the returned buffer.
int * load_num_nucs(long n_isotopes)
{
    // Nuclide counts of materials 1..11 (everything except fuel).
    static const int non_fuel_counts[11] = { 5, 4, 4, 27, 21, 21, 21, 21, 21, 9, 9 };

    int * counts = (int*)malloc(12*sizeof(int));

    // Material 0 is the fuel: 34 nuclides for the small problem, 321 for the large one.
    counts[0] = ( n_isotopes == 68 ) ? 34 : 321;

    for( int m = 1; m < 12; m++ )
        counts[m] = non_fuel_counts[m - 1];

    return counts;
}
// Assigns an array of nuclide ID's to each material
// Builds the flattened 12 x (*max_num_nucs) table of nuclide ids per material.
//
// num_nucs      : 12 nuclide counts, one per material.
// n_isotopes    : 68 selects the small fuel list, anything else the large one.
// max_num_nucs  : output; set to the largest entry of num_nucs (the row stride).
//
// Returns a malloc'ed buffer owned by the caller. Row m starts at
// m * (*max_num_nucs) and only its first num_nucs[m] entries are written.
int * load_mats( int * num_nucs, long n_isotopes, int * max_num_nucs )
{
    const int num_mats = 12;

    // The row stride is the largest nuclide count over all materials.
    *max_num_nucs = 0;
    for( int m = 0; m < num_mats; m++ )
    {
        if( num_nucs[m] > *max_num_nucs )
            *max_num_nucs = num_nucs[m];
    }
    const int stride = *max_num_nucs;

    int * mats = (int *) malloc( num_mats * stride * sizeof(int) );

    // Fuel, small problem: 34 nuclides.
    const int fuel_small[34] = { 58, 59, 60, 61, 40, 42, 43, 44, 45, 46, 1, 2, 3, 7,
                                 8, 9, 10, 29, 57, 47, 48, 0, 62, 15, 33, 34, 52, 53,
                                 54, 55, 56, 18, 23, 41 };

    // Fuel, large problem: the same 34 nuclides followed by ids 68, 69, ... up to 321 entries.
    int fuel_large[321];
    for( int i = 0; i < 34; i++ )
        fuel_large[i] = fuel_small[i];
    for( int i = 34; i < 321; i++ )
        fuel_large[i] = 34 + i;

    // Non-fuel materials; several materials share the same nuclide list.
    static const int cladding[]      = { 63, 64, 65, 66, 67 };
    static const int borated_water[] = { 24, 41, 4, 5 }; // cold and hot borated water
    static const int rpv[]           = { 19, 20, 21, 22, 35, 36, 37, 38, 39, 25, 27, 28, 29,
                                         30, 31, 32, 26, 49, 50, 51, 11, 12, 13, 14, 6, 16,
                                         17 };
    static const int reflector[]     = { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
                                         49, 50, 51, 11, 12, 13, 14 }; // reflectors, plates, nozzles
    static const int fa_end[]        = { 24, 41, 4, 5, 63, 64, 65, 66, 67 }; // top / bottom of FA's

    // Source list for each material, in material order.
    const int * const rows[12] = {
        ( n_isotopes == 68 ) ? fuel_small : fuel_large, // 0: fuel
        cladding,                                       // 1: cladding
        borated_water,                                  // 2: cold borated water
        borated_water,                                  // 3: hot borated water
        rpv,                                            // 4: RPV
        reflector,                                      // 5: lower radial reflector
        reflector,                                      // 6: top reflector / plate
        reflector,                                      // 7: bottom plate
        reflector,                                      // 8: bottom nozzle
        reflector,                                      // 9: top nozzle
        fa_end,                                         // 10: top of FA's
        fa_end                                          // 11: bottom FA's
    };

    for( int m = 0; m < num_mats; m++ )
        memcpy( mats + stride * m, rows[m], num_nucs[m] * sizeof(int) );

    return mats;
}
// Advances the linear congruential generator state in *seed by one step
// (seed = (a * seed + c) mod 2^63) and returns the new state divided by 2^63.
double LCG_random_double(uint64_t * seed)
{
    // LCG parameters
    const uint64_t modulus    = 9223372036854775808ULL; // 2^63
    const uint64_t multiplier = 2806196910506780709ULL;
    const uint64_t increment  = 1ULL;

    const uint64_t next = (multiplier * (*seed) + increment) % modulus;
    *seed = next;
    return static_cast<double>(next) / static_cast<double>(modulus);
}
// Variadic declaration of the autodiff entry point: a type-erased function
// pointer followed by an arbitrary argument pack. No definition appears in this file.
template<typename... Args>
//__device__
void __enzyme_autodiff(void*, Args...);
// Marker globals passed in front of arguments in __enzyme_autodiff calls.
int enzyme_dup, enzyme_const, enzyme_active;
// picks a material based on a probabilistic distribution
// Picks a material index in [0, 12) from one LCG_random_double(seed) draw.
//
// For each candidate i the draw is compared against the sum of the volume
// fractions of materials i down to 1 (material 0 is never part of the sum);
// the first i whose sum exceeds the draw is returned. If none does, 0 is returned.
int pick_mat( uint64_t * seed )
{
    // Fractions (by volume) of each material in the core.
    static const double fractions[12] = {
        0.140, // fuel
        0.052, // cladding
        0.275, // cold, borated water
        0.134, // hot, borated water
        0.154, // RPV
        0.064, // Lower, radial reflector
        0.066, // Upper reflector / top plate
        0.055, // bottom plate
        0.008, // bottom nozzle
        0.015, // top nozzle
        0.025, // top of fuel assemblies
        0.013  // bottom of fuel assemblies
    };

    const double roll = LCG_random_double(seed);

    for( int mat = 0; mat < 12; ++mat )
    {
        // Sum from fractions[mat] downwards to fractions[1]; the order is kept
        // so the floating-point result matches term for term.
        double cumulative = 0;
        int j = mat;
        while( j > 0 )
        {
            cumulative += fractions[j];
            --j;
        }
        if( roll < cumulative )
            return mat;
    }
    return 0;
}
// Returns the LCG state reached from `seed` after n steps of
// seed = (a * seed + c) mod 2^63, using square-and-multiply on the
// (multiplier, increment) pair instead of stepping n times.
// Intermediate products wrap modulo 2^64; the final result is reduced modulo 2^63.
uint64_t fast_forward_LCG(uint64_t seed, uint64_t n)
{
    // LCG parameters
    const uint64_t modulus = 9223372036854775808ULL; // 2^63
    uint64_t mult = 2806196910506780709ULL;
    uint64_t inc  = 1ULL;

    // Accumulated transform: state -> mult_acc * state + inc_acc.
    uint64_t mult_acc = 1;
    uint64_t inc_acc  = 0;

    for( uint64_t remaining = n % modulus; remaining > 0; remaining >>= 1 )
    {
        if( remaining & 1 )
        {
            mult_acc *= mult;
            inc_acc = inc_acc * mult + inc;
        }
        // Square the current transform for the next bit.
        inc *= (mult + 1);
        mult *= mult;
    }

    return (mult_acc * seed + inc_acc) % modulus;
}
// Calculates the microscopic cross section for a given nuclide & energy
__attribute__((always_inline))
void calculate_micro_xs( int nuc, long n_isotopes,
long n_gridpoints,
int * __restrict__ index_data,
NuclideGridPoint * __restrict__ nuclide_grids,
double * __restrict__ xs_vector, int grid_type){
// Variables
long idx = 420020;
NuclideGridPoint * low;
// If using only the nuclide grid, we must perform a binary search
// to find the energy location in this particular nuclide's grid.
if( grid_type == NUCLIDE )
{
{
long lowerLimit = 0;
long upperLimit = n_gridpoints-1;
long examinationPoint;
long length = upperLimit - lowerLimit;
for (int j=0; j<10; j++)
{
examinationPoint = lowerLimit + (length / 2);
if( nuclide_grids[nuc*n_gridpoints + examinationPoint].energy > 0 )
upperLimit = examinationPoint;
else
lowerLimit = examinationPoint;
length = upperLimit - lowerLimit;
}
idx = lowerLimit;
}
// pull ptr from nuclide grid and check to ensure that
// we're not reading off the end of the nuclide's grid
if( idx == n_gridpoints - 1 )
low = &nuclide_grids[nuc*n_gridpoints + 1];
else
low = &nuclide_grids[nuc*n_gridpoints + idx];
}
else {
// pull ptr from energy grid and check to ensure that
// we're not reading off the end of the nuclide's grid
if( index_data[idx * n_isotopes + nuc] == n_gridpoints - 1 )
low = &nuclide_grids[nuc*n_gridpoints + 2];
else
low = &nuclide_grids[nuc*n_gridpoints + index_data[idx * n_isotopes + nuc]];
}
// Total XS
xs_vector[0] = 1 / (1.0 - low->energy);
}
// Accumulates microscopic cross sections into macro_xs_vector.
// This is the function passed to __enzyme_autodiff in xs_lookup_kernel_baselineLocal.
//
// For the first two entries of `mats`, calculate_micro_xs is evaluated for that
// nuclide and its single result is added to macro_xs_vector[0], [1] and [2].
// Note that only macro_xs_vector[0] is reset here; elements 1 and 2 accumulate
// on top of whatever the caller stored in them.
void calculate_macro_xs( long n_isotopes,
long n_gridpoints,
int * __restrict__ index_data,
NuclideGridPoint * __restrict__ nuclide_grids,
int * __restrict__ mats,
double * __restrict__ macro_xs_vector, int grid_type){
// cleans out macro_xs_vector
macro_xs_vector[0] = 0;
for( int j = 0; j < 2; j++ )
{
// Scalar receiving the single value written by calculate_micro_xs.
double xs_vector;
int nuc = mats[j];
calculate_micro_xs( nuc, n_isotopes,
n_gridpoints, index_data,
nuclide_grids, &xs_vector, grid_type);
for( int k = 0; k < 3; k++ ) {
macro_xs_vector[k] += xs_vector;
//printf("xs_vector[k=%d] j=%d %f\n", k, j, xs_vector[k]);
//printf("xs_vector[k=%d] j=%d %f\n", k, j, 1.0);
}
//printf("mid\n");
}
}
// Runs a single lookup (i == 0) through __enzyme_autodiff on calculate_macro_xs
// and prints the five entries of macro_xs_vector afterwards.
//
// in  - simulation inputs; n_isotopes, n_gridpoints and grid_type are read
// GSD - simulation data; index_grid, nuclide_grid, d_nuclide_grid and mats are read
void xs_lookup_kernel_baselineLocal(Inputs in, SimulationData GSD )
{
// The lookup ID. Used to set the seed, and to store the verification value
const int i = 0;
// Set the initial seed value
uint64_t seed = STARTING_SEED;
// Forward seed to lookup index (we need 2 samples per lookup)
seed = fast_forward_LCG(seed, 2*i);
// Randomly pick an energy and material for the particle
// NOTE(review): p_energy is not used again in this function.
double p_energy = LCG_random_double(&seed);
// All five primal outputs start at zero.
double macro_xs_vector[5] = {0};
// Shadow of the output: element 0 is 1.0 and the remaining four are zero,
// so only the derivative of macro_xs_vector[0] is seeded.
double d_macro_xs_vector[5] = {1.0};
// Perform macroscopic Cross Section Lookup
// NOTE(review): the disabled call below passes 8 arguments (it ends with
// GSD.max_num_nucs) while calculate_macro_xs takes 7 parameters, so it would
// not compile if re-enabled as written.
#if 0
calculate_macro_xs(
in.n_isotopes, // Total number of isotopes in simulation
in.n_gridpoints, // Number of gridpoints per isotope in simulation
GSD.index_grid, // Flattened 2-D grid holding indices into nuclide grid for each unionized energy level
GSD.nuclide_grid, // Flattened 2-D grid holding energy levels and XS_data for all nuclides in simulation
GSD.mats, // Flattened 2-D array with nuclide indices defining composition of each type of material
macro_xs_vector, // 1-D array with result of the macroscopic cross section (5 different reaction channels)
in.grid_type, // Lookup type (nuclide, hash, or unionized)
GSD.max_num_nucs // Maximum number of nuclides present in any material
);
#else
// Differentiate calculate_macro_xs: the nuclide grid and the output vector are
// passed with their shadow buffers (enzyme_dup); everything else is enzyme_const.
__enzyme_autodiff((void*)calculate_macro_xs,
enzyme_const, in.n_isotopes, // Total number of isotopes in simulation
enzyme_const, in.n_gridpoints, // Number of gridpoints per isotope in simulation
enzyme_const, GSD.index_grid, // Flattened 2-D grid holding indices into nuclide grid for each unionized energy level
//enzyme_const, GSD.nuclide_grid, // Flattened 2-D grid holding energy levels and XS_data for all nuclides in simulation
enzyme_dup, GSD.nuclide_grid, GSD.d_nuclide_grid, // Flattened 2-D grid holding energy levels and XS_data for all nuclides in simulation
enzyme_const, GSD.mats, // Flattened 2-D array with nuclide indices defining composition of each type of material
//enzyme_const, macro_xs_vector, // 1-D array with result of the macroscopic cross section (5 different reaction channels)
enzyme_dup, macro_xs_vector, d_macro_xs_vector,// 1-D array with result of the macroscopic cross section (5 different reaction channels)
enzyme_const, in.grid_type // Lookup type (nuclide, hash, or unionized)
);
#endif
// i is the constant 0, so this always prints.
if (i == 0) {
for(int j=0; j<5; j++)
printf("macro_xs_vector[%d]=%f\n", j, macro_xs_vector[j]);
}
}
// Allocates and fills the simulation data used by the lookup kernel: the
// nuclide grid (plus a zeroed shadow copy), and, for the UNIONIZED grid type,
// the unionized energy array and the index grid. Finally loads the material
// tables. Returns the populated SimulationData by value.
//
// in   - inputs; n_isotopes, n_gridpoints and grid_type are read
// mype - not referenced in this function body
SimulationData grid_init_do_not_profile( Inputs in, int mype )
{
// Structure to hold all allocated simulation data arrays
SimulationData SD;
// Keep track of how much data we're allocating
// NOTE(review): nbytes is only accumulated; it is never read in this function.
size_t nbytes = 0;
// Set the initial seed value
uint64_t seed = 42;
// First, we need to initialize our nuclide grid. This comes in the form
// of a flattened 2D array that holds all the information we need to define
// the cross sections for all isotopes in the simulation.
// The grid is composed of "NuclideGridPoint" structures, which hold the
// energy level of the grid point and all associated XS data at that level.
// An array of structures (AOS) is used instead of
// a structure of arrays, as the grid points themselves are accessed in
// a random order, but all cross section interaction channels and the
// energy level are read whenever the gridpoint is accessed, meaning the
// AOS is more cache efficient.
// Initialize Nuclide Grid
SD.length_nuclide_grid = in.n_isotopes * in.n_gridpoints;
SD.nuclide_grid = (NuclideGridPoint *) malloc( SD.length_nuclide_grid * sizeof(NuclideGridPoint));
// Zero-filled buffer of the same size; the lookup kernel passes it to
// __enzyme_autodiff as the shadow of nuclide_grid.
// NOTE(review): unlike nuclide_grid, this allocation is not NULL-checked.
SD.d_nuclide_grid = (NuclideGridPoint *) calloc( SD.length_nuclide_grid , sizeof(NuclideGridPoint));
assert(SD.nuclide_grid != NULL);
nbytes += SD.length_nuclide_grid * sizeof(NuclideGridPoint);
// Only the energy field is filled; the other channels are commented out.
for( int i = 0; i < SD.length_nuclide_grid; i++ )
{
SD.nuclide_grid[i].energy = LCG_random_double(&seed);
//SD.nuclide_grid[i].total_xs = LCG_random_double(&seed);
//SD.nuclide_grid[i].elastic_xs = LCG_random_double(&seed);
//SD.nuclide_grid[i].absorbtion_xs = LCG_random_double(&seed);
//SD.nuclide_grid[i].fission_xs = LCG_random_double(&seed);
}
// Sort so that each nuclide has data stored in ascending energy order.
for( int i = 0; i < in.n_isotopes; i++ )
qsort( &SD.nuclide_grid[i*in.n_gridpoints], in.n_gridpoints, sizeof(NuclideGridPoint), NGP_compare);
// The unionized energy array and the index grid are built only for the
// UNIONIZED grid type; for other grid types those SD fields are not assigned here.
if( in.grid_type == UNIONIZED )
{
// Allocate space to hold the union of all nuclide energy data
SD.length_unionized_energy_array = in.n_isotopes * in.n_gridpoints;
SD.unionized_energy_array = (double *) malloc( SD.length_unionized_energy_array * sizeof(double));
assert(SD.unionized_energy_array != NULL );
nbytes += SD.length_unionized_energy_array * sizeof(double);
// Copy energy data over from the nuclide energy grid
for( int i = 0; i < SD.length_unionized_energy_array; i++ )
SD.unionized_energy_array[i] = SD.nuclide_grid[i].energy;
// Sort unionized energy array
qsort( SD.unionized_energy_array, SD.length_unionized_energy_array, sizeof(double), double_compare);
// Allocate space to hold the acceleration grid indices
SD.length_index_grid = SD.length_unionized_energy_array * in.n_isotopes;
SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int));
assert(SD.index_grid != NULL);
nbytes += SD.length_index_grid * sizeof(int);
// Generates the double indexing grid
// idx_low[i]     - current lower grid index for isotope i (starts at 0 via calloc)
// energy_high[i] - energy of the grid point just above idx_low[i] for isotope i
int * idx_low = (int *) calloc( in.n_isotopes, sizeof(int));
assert(idx_low != NULL );
double * energy_high = (double *) malloc( in.n_isotopes * sizeof(double));
assert(energy_high != NULL );
for( int i = 0; i < in.n_isotopes; i++ )
energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + 1].energy;
// Walk the sorted unionized energies once; for each isotope, advance idx_low
// when the unionized energy reaches energy_high, but never past n_gridpoints - 2.
for( long e = 0; e < SD.length_unionized_energy_array; e++ )
{
double unionized_energy = SD.unionized_energy_array[e];
for( long i = 0; i < in.n_isotopes; i++ )
{
if( unionized_energy < energy_high[i] )
SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
else if( idx_low[i] == in.n_gridpoints - 2 )
SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
else
{
idx_low[i]++;
SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + idx_low[i] + 1].energy;
}
}
}
// The two scratch arrays are released; the SD buffers stay owned by the caller.
free(idx_low);
free(energy_high);
}
////////////////////////////////////////////////////////////////////
// Initialize Materials and Concentrations
///////////////////////////////////////////////////////
// Set the number of nuclides in each material
SD.num_nucs = load_num_nucs(in.n_isotopes);
SD.length_num_nucs = 12; // There are always 12 materials in XSBench
// Initialize the flattened 2D grid of material data. The grid holds
// a list of nuclide indices for each of the 12 material types. The
// grid is allocated as a full square grid, even though not all
// materials have the same number of nuclides.
SD.mats = load_mats(SD.num_nucs, in.n_isotopes, &SD.max_num_nucs);
SD.length_mats = SD.length_num_nucs * SD.max_num_nucs;
return SD;
}
// Reproducer entry point: fixes the inputs (11303 grid points, 68 isotopes,
// unionized grid), builds the simulation data and runs the single
// differentiated lookup. argc and argv are not used.
int main( int argc, char* argv[] )
{
int mype = 0;
Inputs in;
// defaults to 11303 (corresponding to H-M Large benchmark)
in.n_gridpoints = 11303;
// default to unionized grid
in.grid_type = UNIONIZED;
in.n_isotopes = 68;
SimulationData SD;
SD = grid_init_do_not_profile( in, mype );
xs_lookup_kernel_baselineLocal( in, SD );
// NOTE(review): the buffers allocated by grid_init_do_not_profile are not
// freed before returning.
return 0;
}
Interestingly, the forward pass is correct when the printf is added back.
/mnt/sabrent/wmoses/llvm13/buildallfast/bin/clang++ -fno-experimental-new-pass-manager -std=c++11 -Xclang -load -Xclang /home/wmoses/git/Enzyme/enzyme/build13Fast/Enzyme/ClangEnzyme-13.so -O3 Main.cpp -o XSBench -lm
Edit: Working Solution at bottom
LLVM info: 11.0.1 (43ff75f2c3feef64f9d73328230d34dac8832a91), built from source with:
cmake ../llvm -DLLVM_TARGETS_TO_BUILD="host;NVPTX" -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_PLUGINS=ON -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DCMAKE_INSTALL_PREFIX=/home/yutong/local -DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_INSTALL_UTILS=ON
Enzyme info: fb2af2a
I'm trying to use enzyme to generate autodiff'd CUDA kernels. I'm invoking:
clang -c test3.cu -Xclang -load -Xclang /home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so -O2 -fno-vectorize -fno-unroll-loops --cuda-gpu-arch=sm_70 -fPIC
Running nm on the resulting test3.o shows an undefined symbol for the autodiff version:
U atexit
0000000000000000 d __cuda_fatbin_wrapper
0000000000000010 b __cuda_gpubin_handle
U cudaLaunchKernel
U cudaMalloc
U cudaMemcpy
0000000000000240 t __cuda_module_ctor
00000000000002b0 t __cuda_module_dtor
U __cudaPopCallConfiguration
U __cudaPushCallConfiguration
U __cudaRegisterFatBinary
U __cudaRegisterFatBinaryEnd
U __cudaRegisterFunction
U __cudaUnregisterFatBinary
0000000000000008 B enzyme_const
0000000000000000 B enzyme_dup
0000000000000004 B enzyme_out
0000000000000000 r .L.str
0000000000000007 r .L__unnamed_1
0000000000000070 T main
U printf
0000000000000000 T _Z18__device_stub__fooPdS_
U _Z32__device_stub____enzyme_autodiffPFvPdS_EiS_S_iS_S_
// test3.cu
#include <stdio.h>
// Kernel under differentiation: writes the square of x_in[0] to x_out[0].
// Only element 0 of each array is touched, and no thread index is used.
void __global__ foo(double* x_in, double *x_out) {
x_out[0] = x_in[0] * x_in[0];
}
int enzyme_dup;
int enzyme_out;
int enzyme_const;
typedef void (*f_ptr)(double*, double*);
extern void __global__ __enzyme_autodiff(f_ptr,
int, double*, double*,
int, double*, double*);
// Host driver: allocates four device scalars (x, d_x, y, d_y), uploads the
// host values, launches __enzyme_autodiff as a <<<1,1>>> kernel on foo, copies
// everything back and prints primal and shadow values.
// NOTE(review): __enzyme_autodiff is only declared (extern __global__) in this
// file; launching it from the host is what this report is about.
// NOTE(review): no CUDA call's return value is checked.
int main() {
double *x, *d_x, *y, *d_y; // all on the device
cudaMalloc(&x, sizeof(*x));
cudaMalloc(&d_x, sizeof(*d_x));
cudaMalloc(&y, sizeof(*y));
cudaMalloc(&d_y, sizeof(*d_y));
double host_x = 1.4;
double host_d_x = 0.0;
// host_y has no initializer; it is uploaded as-is below and overwritten by
// the device-to-host copy after the launch.
double host_y;
// Seed for the output shadow.
double host_d_y = 1.0;
cudaMemcpy(x, &host_x, sizeof(*x), cudaMemcpyHostToDevice);
cudaMemcpy(d_x, &host_d_x, sizeof(*d_x), cudaMemcpyHostToDevice);
cudaMemcpy(y, &host_y, sizeof(*y), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, &host_d_y, sizeof(*d_y), cudaMemcpyHostToDevice);
// Both arguments of foo are passed as (primal, shadow) pairs.
__enzyme_autodiff<<<1,1>>>(foo,
enzyme_dup, x, d_x,
enzyme_dup, y, d_y);
cudaMemcpy(&host_x, x, sizeof(*x), cudaMemcpyDeviceToHost);
cudaMemcpy(&host_d_x, d_x, sizeof(*d_x), cudaMemcpyDeviceToHost);
cudaMemcpy(&host_y, y, sizeof(*y), cudaMemcpyDeviceToHost);
cudaMemcpy(&host_d_y, d_y, sizeof(*d_y), cudaMemcpyDeviceToHost);
// First line: primal x and y; second line: shadows d_x and d_y.
printf("%f %f\n", host_x, host_y);
printf("%f %f\n", host_d_x, host_d_y);
}
; ModuleID = 'silent_failure.c'
source_filename = "silent_failure.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
$_ZSt4fabsf = comdat any
@.str = private unnamed_addr constant [36 x i8] c"hello! %f, res2 %f, da: %f, db: %f\0A\00", align 1
; compute_sumabs(a, b, ret): stores fabs(*a) + fabs(*b) into *ret.
; The body is unoptimized (optnone): each pointer argument is spilled to a
; stack slot and reloaded before use. fabs is reached through the
; @_ZSt4fabsf wrapper rather than by calling the intrinsic directly.
; Function Attrs: noinline optnone uwtable
define dso_local void @compute_sumabs(float* %a, float* %b, float* %ret) #0 {
entry:
%a.addr = alloca float*, align 8
%b.addr = alloca float*, align 8
%ret.addr = alloca float*, align 8
store float* %a, float** %a.addr, align 8
store float* %b, float** %b.addr, align 8
store float* %ret, float** %ret.addr, align 8
; %call = fabs(*a)
%0 = load float*, float** %a.addr, align 8
%1 = load float, float* %0, align 4
%call = call float @_ZSt4fabsf(float %1)
; %call1 = fabs(*b)
%2 = load float*, float** %b.addr, align 8
%3 = load float, float* %2, align 4
%call1 = call float @_ZSt4fabsf(float %3)
; *ret = %call + %call1
%add = fadd float %call, %call1
%4 = load float*, float** %ret.addr, align 8
store float %add, float* %4, align 4
ret void
}
; Thin linkonce_odr wrapper that forwards its float argument to the
; llvm.fabs.f32 intrinsic and returns the result.
; Function Attrs: noinline nounwind optnone uwtable
define linkonce_odr dso_local float @_ZSt4fabsf(float %__x) #1 comdat {
entry:
%__x.addr = alloca float, align 4
store float %__x, float* %__x.addr, align 4
%0 = load float, float* %__x.addr, align 4
%1 = call float @llvm.fabs.f32(float %0)
ret float %1
}
; Driver: sets a = 2, b = 3, da = db = 0, ret = 0, dret = 1, calls
; compute_sumabs once directly, then calls __enzyme_autodiff.f64 on it with the
; (a, da), (b, db), (ret, dret) pointer pairs, and finally prints
; ret, ret, da, db through printf.
; Function Attrs: noinline norecurse optnone uwtable
define dso_local i32 @main(i32 %argc, i8** %argv) #2 {
entry:
%retval = alloca i32, align 4
%argc.addr = alloca i32, align 4
%argv.addr = alloca i8**, align 8
%a = alloca float, align 4
%b = alloca float, align 4
%da = alloca float, align 4
%db = alloca float, align 4
%ret = alloca float, align 4
%dret = alloca float, align 4
store i32 0, i32* %retval, align 4
store i32 %argc, i32* %argc.addr, align 4
store i8** %argv, i8*** %argv.addr, align 8
store float 2.000000e+00, float* %a, align 4
store float 3.000000e+00, float* %b, align 4
store float 0.000000e+00, float* %da, align 4
store float 0.000000e+00, float* %db, align 4
store float 0.000000e+00, float* %ret, align 4
store float 1.000000e+00, float* %dret, align 4
; Plain primal evaluation first.
call void @compute_sumabs(float* %a, float* %b, float* %ret)
; Differentiation request; the double result %0 is not used afterwards.
%0 = call double (...) @__enzyme_autodiff.f64(void (float*, float*, float*)* @compute_sumabs, float* %a, float* %da, float* %b, float* %db, float* %ret, float* %dret)
; *ret is loaded twice, so the first two printed values are the same number.
%1 = load float, float* %ret, align 4
%conv = fpext float %1 to double
%2 = load float, float* %ret, align 4
%conv1 = fpext float %2 to double
%3 = load float, float* %da, align 4
%conv2 = fpext float %3 to double
%4 = load float, float* %db, align 4
%conv3 = fpext float %4 to double
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str, i32 0, i32 0), double %conv, double %conv1, double %conv2, double %conv3)
ret i32 0
}
declare double @__enzyme_autodiff.f64(...)
declare dso_local i32 @printf(i8*, ...) #3
; Function Attrs: nounwind readnone speculatable
declare float @llvm.fabs.f32(float) #4
attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind readnone speculatable }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 7.1.0 "}
We need a plan for automatically handling global variables without explicitly giving annotations. Can we summarize the challenges involved in handling globals here?
Here are a few preliminary thoughts that come to mind:
There are a few categories of global use (unsure if list is complete):
(a) globals that are constant for entire duration of program.
(b) globals that are precomputed during the program and constant for subsequent uses.
(c) globals acting as a scratch-space --- e.g. a static array of floats.
(d) globals for caching: e.g. a static lookup table.
(a),(b) can probably be identified via analysis; (c) can either be disallowed or handled by duplicating static storage; (d) is tricky because the data structure may be manipulated/initialized prior to the enzyme_autodiff call.
I am an incoming student at the Julia Lab and I am trying to do something with Enzyme. I learned from the Enzyme paper that one can define custom gradients for precompiled libraries, but I am not sure how to write the "augmented" and "gradient" functions for a custom gradient. For example, I can compile the following program
#include <iostream>
#include "cblas.h"
using namespace std;
// Computes the dot product of two 3-element vectors with cblas_ddot (unit
// strides) and prints it.
int main() {
int N = 3, incX = 1, incY = 1;
// _X and _Y are zero-initialised and not used below.
// NOTE(review): presumably intended as gradient buffers for X and Y -- confirm.
double X[3] = {1.0, 2.0, 3.0}, Y[3] = {4.0, 5.0, 6.0}, _X[3] = {0.0}, _Y[3] = {0.0};
double s = cblas_ddot(N, X, incX, Y, incY);
cout << s << endl;
return 0;
}
with clang++ blas.cpp -I/usr/local/opt/openblas/include -L/usr/local/opt/openblas/lib -lopenblas -o blas. What should I do next to get the gradient with respect to X?
We currently do not seem to handle differential returns that are pointer types, e.g. a call %x = float* foo() where %x is not a constant value. A test case for this is attached --- if you look at the test case, please spend a moment to verify that I set it up correctly, as it was extracted from a larger file generated from Eigen code. If the general fix isn't easy, we should discuss exactly what we want to do here (e.g. create a wrapper function for such calls that puts all of our differentiable calls into a common form where certain return values are additional arguments).
This should be asserting (and presently is a warning). Resolving this involves better interprocedural Activity Analysis, which will come in with the move to the Attributor.
I've been playing around with Enzyme. Enzyme seems to be able to find the derivative of the following function just fine
// Returns x unchanged when x >= 0; otherwise returns the product x * 0
// (kept as a multiplication on purpose, not folded to a literal).
double fun(double x) {
  return (x >= 0) ? x : x * 0;
}
However, when I write "return 0" instead of "return x * 0", the computed derivative is always zero. I assumed that the constant somehow confuses the library, but for the following function it finds the correct derivative:
// Returns x unchanged when x <= 1; otherwise returns the constant 1.
double fun(double x) {
  return (x <= 1) ? x : 1;
}
so, I guess it might be something to do with zero? I am not sure.
Hello, I'm still learning Enzyme.
I was expecting the following simple code to not require quadratic memory for the backward pass with enzyme
test2.cpp
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <random>
// Resets *x_out to 0.0 and then adds, for every ordered pair (row, col) of the
// n three-component points stored contiguously in `parts`, the squared
// difference of each coordinate. Accumulation goes directly through x_out,
// in row / col / coordinate order.
void foo(double* parts, double *x_out,int n) {
  int dim = 3;
  *x_out = 0.0;
  for (int row = 0; row < n; ++row) {
    for (int col = 0; col < n; ++col) {
      int c = 0;
      while (c < dim) {
        const double delta = parts[dim*row + c] - parts[dim*col + c];
        *x_out += delta * delta;
        ++c;
      }
    }
  }
}
int enzyme_dup;
int enzyme_out;
int enzyme_const;
typedef void (*f_ptr)(double*, double*,int);
extern void __enzyme_autodiff(f_ptr,
int, double*, double*,
int, double*, double*,
int, int);
// Driver for the reproducer: fills n three-component points with uniform
// random values in [0, 10), zeroes the shadow buffer, runs __enzyme_autodiff on
// foo with d_y = 1.0 as the output seed, then prints y and the first 100
// entries of the gradient d_x.
int main() {
  srand(42);
  std::mt19937 e2(42);
  std::uniform_real_distribution<> dist(0, 10);
  int n = 100000;
  int d = 3;
  double* x = new double[n*d];
  double* d_x = new double[n*d];
  for (int i = 0; i < n*d; i++) {
    x[i] = dist(e2);
    d_x[i] = 0.0;  // the shadow must start at zero; gradients are accumulated into it
  }
  double y;
  double d_y = 1.0;
  printf("before autodiff\n");
  // parts and x_out are passed as (primal, shadow) pairs; n is inactive.
  __enzyme_autodiff(foo,
                    enzyme_dup, x, d_x,
                    enzyme_dup, &y, &d_y,
                    enzyme_const, n);
  printf("%f \n", y);
  for (int i = 0; i < 100; i++) {
    printf("dx[%d] = %f\n",i, d_x[i]);
  }
  // Release the point and gradient buffers (previously never freed).
  delete[] d_x;
  delete[] x;
  return 0;
}
I used the one liner to compile, ( I also tried to emit O2 llvm and opt it but same results )
clang test2.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-7.so -o test2
It crashes with a segmentation fault, probably because it tries to allocate (or stack-allocate) a quadratic amount of memory:
before autodiff
Segmentation fault (core dumped)
Can you please advise ?
Thanks
Hi,
the implementation of unary minus for ForwardMode seems to be broken.
I'm using the current brew version (LLVM 12) on Linux and opt produces a segfault.
I traced the error to
https://github.com/wsmoses/Enzyme/blob/9be50337dbf98a6a0489e94660d15586466a4b34/enzyme/Enzyme/AdjointGenerator.h#L252
where it tries to get a ReverseBuilder which I suppose doesn't exist in ForwardMode.
But I'm not sure how to proceed from there.
Testcase to reproduce:
// clang++ fneg.cpp -S -emit-llvm -o input.ll -O2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops
// opt input.ll -load=/home/linuxbrew/.linuxbrew/lib/LLVMEnzyme-12.so -enzyme -o output.ll -S
// clang++ -c output.ll
extern double __enzyme_fwddiff(void*, double, double);
// Function under differentiation: returns the negation of x.
// The unary minus is the construct this report is about.
double foo(double x){
return -x;
}
// Requests the forward-mode derivative of foo at x via __enzyme_fwddiff,
// passing 1.0 as the third argument.
// NOTE(review): the 1.0 looks like the input tangent (seed) -- confirm against
// the Enzyme forward-mode calling convention.
double dfoo(double x){
return __enzyme_fwddiff((void*)foo, x, 1.0);
}
Best
Markus
Hi,
I've been trying to assess how to use Enzyme together with more complex codes (that use Eigen) and created a pretty simple C++ test program:
#include <iostream>
#include <Eigen/Dense>
using Eigen::VectorXd;
void __enzyme_autodiff(...);
// Returns the sum of squares of the entries of v.
// Note that v is taken by value, so each call receives its own copy.
double foo( VectorXd v )
{
double out = 0.0;
for( int i = 0 ; i < v.rows() ; i++ )
out += v(i) * v(i);
return out;
}
// Driver: builds v = (0, 1, 2) and a zeroed dv, requests the gradient of foo
// through __enzyme_autodiff (only when compiled with clang), then prints
// foo(v) and dv.
// NOTE(review): enzyme_dup is used below but has no declaration in the snippet
// as shown.
// NOTE(review): foo takes its VectorXd by value while the call passes the
// addresses &v and &dv -- confirm this matches the intended signature.
int main()
{
size_t n = 3;
VectorXd v(n);
VectorXd dv(n);
for( int i = 0; i < n ; i++)
{
v(i) = i;
dv(i) = 0.0;
}
#ifdef __clang__
__enzyme_autodiff(foo, enzyme_dup, &v, &dv);
#endif
std::cout << foo(v) << std::endl << std::endl;
std::cout << dv << std::endl;
return 0;
}
I've been compiling it with the following:
clang++ test.cpp -I/path/to/eigen -S -emit-llvm -o input.ll -O3 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops
opt input.ll -load=ClangEnzyme-11.dylib -enzyme -o output.ll -S
clang++ output.ll -O3 -o matrix
I've tried to base all of the above on other examples (some posted as issues here). Several of the other examples compile and link properly. Unfortunately, it seems that the autodiffed function for my example never makes it into the executable, because I get a missing symbol error on linking:
Undefined symbols for architecture x86_64:
"__enzyme_autodiff(...)", referenced from:
_main in output-c75457.o
ld: symbol(s) not found for architecture x86_64
That's kind of perplexing to me, because other examples I build don't seem to have this issue. I've also tried more explicit declarations of the autodiff functions, but no luck. Am I missing something fundamental here, or are there any pointers you could give me to resolve this? Thanks!
The following code runs fine for small n but crashes with a segfault caused by extra memory allocations.
The code works if the integer-typed cellId is stackAllocated.
It also works if I pass an allocated cell_id as an enzyme_const parameter
test2.cpp
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <random>
#include <math.h>
using namespace std;
// Bins each of the n three-component points in `parts` into an integer cell id
// (cell width 3.1 over the range [0, 10], strides 1, nh, nh*nh), then returns
// the sum of squared distances over all ordered pairs (i, j) whose cell ids are
// equal. The cell ids live in a temporary heap array that is released before
// returning.
double foo( double* __restrict__ parts,int n)
{
  const int d = 3;
  double total = 0.0;
  int* cells = new int[n];

  const double minx = 0.0;
  const double maxx = 10.0;
  const double h = 3.1;
  const int nh = ceil((maxx-minx) / h);

  // strides[k] = nh^k
  int strides[d];
  strides[0] = 1;
  for (int axis = 1; axis < d; ++axis) {
    strides[axis] = strides[axis-1]*nh;
  }

  // Pass 1: cell id of every point, one truncated term per coordinate.
  for (int p = 0; p < n; ++p) {
    int id = 0;
    for (int axis = 0; axis < d; ++axis) {
      id += static_cast<int>( (parts[p*d+axis] - minx)/h * strides[axis] );
    }
    cells[p] = id;
  }

  // Pass 2: squared distance for every ordered pair, kept only for equal cells.
  for (int p = 0; p < n; ++p) {
    for (int q = 0; q < n; ++q) {
      double dist2 = 0.0;
      for (int axis = 0; axis < d; ++axis) {
        const double delta = parts[d*p+axis] - parts[d*q+axis];
        dist2 += delta*delta;
      }
      if (cells[p] == cells[q]) {
        total += dist2;
      }
    }
  }

  delete[] cells;
  return total;
}
int enzyme_dup;
int enzyme_out;
int enzyme_const;
typedef double (*f_ptr)(double *,int);
extern double __enzyme_autodiff(f_ptr,
int, double *, double *,
int, int);
// Driver for the reproducer: fills n three-component points with uniform
// random values in [0, 10), zeroes the shadow buffer, runs __enzyme_autodiff on
// foo and prints the first 100 entries of the gradient d_x.
int main() {
  srand(42);
  std::mt19937 e2(42);
  std::uniform_real_distribution<> dist(0, 10);
  int n = 100000;
  int d = 3;
  double* x = new double[n*d];
  double* d_x = new double[n*d];
  for (int i = 0; i < n*d; i++) {
    x[i] = dist(e2);
    d_x[i] = 0.0;  // the shadow must start at zero; gradients are accumulated into it
  }
  // The unused `cellid` buffer that used to be allocated here was never read
  // or freed, so it has been removed.
  printf("before autodiff\n");
  // parts is passed as a (primal, shadow) pair; n is inactive.
  __enzyme_autodiff(foo,
                    enzyme_dup, x, d_x,
                    enzyme_const, n);
  //printf("%f \n", y);
  for (int i = 0; i < 100; i++) {
    printf("dx[%d] = %f\n",i, d_x[i]);
  }
  // Release the point and gradient buffers (previously never freed).
  delete[] d_x;
  delete[] x;
  return 0;
}
Compiled with :
clang test2.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-7.so -O2 -o test2
Let's add compile time regression tests.
I've collected some useful resources:
https://www.npopov.com/2020/05/10/Make-LLVM-fast-again.html
http://llvm-compile-time-tracker.com
https://green.lab.llvm.org/green/view/Compile%20Time/
https://lnt.readthedocs.io/_/downloads/en/latest/pdf/
https://github.com/llvm/llvm-lnt
https://github.com/llvm/llvm-test-suite/tree/main/CTMark
Hello everyone,
I just wanted to begin by thanking the developer community for your incredible work on Enzyme; it really does seem like a dream come true regarding ease of use and its direct integration with the LLVM build tools.
I would like to know if anyone has been using Enzyme for CFD applications, where, as in ML, access to gradients and/or directional derivatives is crucial. In fact, I was recently testing Tapenade and comparing its performance with Enzyme in reverse mode (on a simple FD scheme and on a more complex flux computation scheme), and I started to notice that if the number of operations and the data I/O stay relatively small to medium in size, Enzyme fares much better than the reverse-mode code generated by Tapenade (Fortran). However, when the functions become more complex and the number of DOF (degrees of freedom) increases, Enzyme starts lagging behind in performance, and the problem gets worse the more DOF are used.
Here are some code excerpts and results of the benchmarking tests (where I need to run enzyme on reverse mode on a flux computation) :
// Original (primal) function.
// For every cell i in [gh, n_cell-gh) computes five flux values from the left
// state (index i-1) and right state (index i) of rho, velx, vely, velz and
// temp, projected on the normalised surface vector (surfx, surfy, surfz)[i],
// and stores them in flux1[i]..flux5[i].
// Because index i-1 is read, gh must be at least 1 for the accesses to stay in range.
// NOTE(review): T is not defined in the excerpt shown.
static void f(const T* rho, const T* velx, const T* vely, const T* velz, const T* temp,
T* flux1, T* flux2, T* flux3, T* flux4, T* flux5,
const int n_cell, const int gh, const T* surfx, const T* surfy, const T* surfz )
{
constexpr double gam = 1.4;
constexpr double gam1 = gam-1.;
constexpr double gam1_1 = 1./gam1;
constexpr double rgaz = 237.;
for(int i=gh; i<n_cell-gh ; ++i)
{
// Surface vector, its norm, and the unit normal; the 1.e-32 floor guards the
// division when the norm is zero.
const double sc1 = surfx[i]; const double sc2 = surfy[i]; const double sc3 = surfz[i];
const double sn = std::sqrt(sc1*sc1 + sc2*sc2 + sc3*sc3);
const double invsn = 1./std::max(sn,1.e-32);
const double nx = sc1*invsn; const double ny = sc2*invsn; const double nz = sc3*invsn;
// wfl* = left state (cell i-1), wfr* = right state (cell i).
const auto wfl1 = rho[i-1]; const auto wfr1 = rho[i ];
const auto wfl2 = velx[i-1]; const auto wfr2 = velx[i ];
const auto wfl3 = vely[i-1]; const auto wfr3 = vely[i ];
const auto wfl4 = velz[i-1]; const auto wfr4 = velz[i ];
const auto wfl5 = temp[i-1]; const auto wfr5 = temp[i ];
// pm/pp: rho*temp*rgaz on the left/right; hm/hp: the matching enthalpy-like terms.
const auto pm = wfl1*wfl5*rgaz; auto pp = wfr1*wfr5*rgaz;
const auto hm = gam*gam1_1*wfl5*rgaz + 0.5*(wfl2*wfl2 + wfl3*wfl3 + wfl4*wfl4);
const auto hp = gam*gam1_1*wfr5*rgaz + 0.5*(wfr2*wfr2 + wfr3*wfr3 + wfr4*wfr4);
// fcd{x,y,z}N: sum of left and right contributions for flux N in each direction.
const auto fcdx1 = wfr1*wfr2 + wfl1*wfl2;
const auto fcdy1 = wfr1*wfr3 + wfl1*wfl3;
const auto fcdz1 = wfr1*wfr4 + wfl1*wfl4;
const auto fcdx2 = wfr1*wfr2*wfr2 + pp + wfl1*wfl2*wfl2 + pm;
const auto fcdy2 = wfr1*wfr2*wfr3 + wfl1*wfl2*wfl3;
const auto fcdz2 = wfr1*wfr2*wfr4 + wfl1*wfl2*wfl4;
const auto fcdx3 = fcdy2;
const auto fcdy3 = wfr1*wfr3*wfr3 + pp + wfl1*wfl3*wfl3 + pm;
const auto fcdz3 = wfr1*wfr3*wfr4 + wfl1*wfl3*wfl4;
const auto fcdx4 = fcdz2;
const auto fcdy4 = fcdz3;
const auto fcdz4 = wfr1*wfr4*wfr4 + pp + wfl1*wfl4*wfl4 + pm;
const auto fcdx5 = wfr2*wfr1*hp + wfl2*wfl1*hm;
const auto fcdy5 = wfr3*wfr1*hp + wfl3*wfl1*hm;
const auto fcdz5 = wfr4*wfr1*hp + wfl4*wfl1*hm;
// Each flux is half the left+right sum projected on the unit normal, scaled by sn.
flux1[i] = 0.5*sn*(fcdx1*nx + fcdy1*ny + fcdz1*nz);
flux2[i] = 0.5*sn*(fcdx2*nx + fcdy2*ny + fcdz2*nz);
flux3[i] = 0.5*sn*(fcdx3*nx + fcdy3*ny + fcdz3*nz);
flux4[i] = 0.5*sn*(fcdx4*nx + fcdy4*ny + fcdz4*nz);
flux5[i] = 0.5*sn*(fcdx5*nx + fcdy5*ny + fcdz5*nz);
}
}
// Differentiated function.
// Wrapper that asks Enzyme for the reverse-mode derivative of f. The five
// state arrays are passed with their shadows (*_b) as enzyme_dup, the five
// flux arrays with their shadows as enzyme_dupnoneed, and n_cell, gh and the
// surface arrays as enzyme_const.
// NOTE(review): flux1..flux5 are declared const T* here while f takes them as
// T* outputs -- confirm this converts as intended at the call.
static void df(const T *rho, T *rho_b, const T* velx, T* velx_b, const T* vely, T* vely_b,
const T* velz, T* velz_b, const T* temp, T* temp_b,
const T* flux1, T* flux1_b, const T* flux2, T* flux2_b, const T* flux3, T* flux3_b,
const T* flux4, T* flux4_b, const T* flux5, T* flux5_b, const int n_cell, const int gh,
const T* surfx, const T* surfy, const T* surfz )
{
__enzyme_autodiff(f,
enzyme_dup, rho,
rho_b,
enzyme_dup, velx,
velx_b,
enzyme_dup, vely,
vely_b,
enzyme_dup, velz,
velz_b,
enzyme_dup, temp,
temp_b,
enzyme_dupnoneed, flux1,
flux1_b,
enzyme_dupnoneed, flux2,
flux2_b,
enzyme_dupnoneed, flux3,
flux3_b,
enzyme_dupnoneed, flux4,
flux4_b,
enzyme_dupnoneed, flux5,
flux5_b,
enzyme_const, n_cell, enzyme_const, gh,
enzyme_const, surfx, enzyme_const, surfy, enzyme_const, surfz);
}
where the results are stored in the shadow arrays for the primitive variables (rho_b, velx_b, ..., temp_b). The results are the same as for Tapenade, but performance stalls as the number of DOF gets bigger:
Running ./out
Run on (48 X 2900 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x24)
L1 Instruction 32 KiB (x24)
L2 Unified 256 KiB (x24)
L3 Unified 30720 KiB (x2)
Load Average: 2.13, 7.63, 11.21
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
------------------------------------------------------------------------------------
tapenade_roe_flux/1024 144 us 143 us 4194 rho_b[n_cell/2]=330.698T
enzyme_roe_flux/1024 113 us 113 us 6064 rho_b[n_cell/2]=330.698T
tapenade_roe_flux/2048 258 us 257 us 2737 rho_b[n_cell/2]=7.82098P
enzyme_roe_flux/2048 279 us 279 us 2637 rho_b[n_cell/2]=7.82098P
tapenade_roe_flux/4096 522 us 522 us 1000 rho_b[n_cell/2]=206.173P
enzyme_roe_flux/4096 612 us 612 us 1176 rho_b[n_cell/2]=206.173P
tapenade_roe_flux/8192 1103 us 1103 us 623 rho_b[n_cell/2]=5.89262E
enzyme_roe_flux/8192 1253 us 1253 us 597 rho_b[n_cell/2]=5.89262E
tapenade_roe_flux/16384 2304 us 2304 us 324 rho_b[n_cell/2]=177.29E
enzyme_roe_flux/16384 2890 us 2890 us 245 rho_b[n_cell/2]=177.29E
tapenade_roe_flux/32768 3984 us 3983 us 159 rho_b[n_cell/2]=5.49296Z
enzyme_roe_flux/32768 5185 us 5185 us 136 rho_b[n_cell/2]=5.49296Z
tapenade_roe_flux/65536 8126 us 8125 us 88 rho_b[n_cell/2]=172.89Z
enzyme_roe_flux/65536 12211 us 12205 us 57 rho_b[n_cell/2]=172.89Z
tapenade_roe_flux/131072 15859 us 15859 us 45 rho_b[n_cell/2]=5.48632Y
enzyme_roe_flux/131072 35868 us 35866 us 20 rho_b[n_cell/2]=5.48632Y
tapenade_roe_flux/262144 33950 us 33947 us 21 rho_b[n_cell/2]=174.824Y
enzyme_roe_flux/262144 76974 us 76970 us 9 rho_b[n_cell/2]=174.824Y
tapenade_roe_flux/524288 65500 us 65496 us 9 rho_b[n_cell/2]=5.58255
enzyme_roe_flux/524288 159281 us 159270 us 5 rho_b[n_cell/2]=5.58255
tapenade_roe_flux/1048576 151165 us 151153 us 4 rho_b[n_cell/2]=178.452
enzyme_roe_flux/1048576 287164 us 287144 us 3 rho_b[n_cell/2]=178.452
Regarding the compilation options/flags, I have been using what is suggested on the website:
export F90LAGS="-O2 -DNDEBUG" # For Tapenade code
export CCFLAGS2="-O2 -DNDEBUG"
export ENZYMEPM_AND_OPTIONS="-enzyme -enzyme-inline=1 -enzyme-smallbool=1 -enzyme-cache-never=1"
As you can see, I'm quite new at using enzyme and I surely do not know all the options and best coding practices that might help increase performance, so I'm really interested in any suggestions that you in the community might have!
Thanks in advance for any information.
Maybe bad idea?
Hi,
I have been following the "Getting started" instructions and noticed that one needs to explicitly specify the path to LLVM_EXTERNAL_LIT when calling cmake.
I think this line in the instructions:
cmake -G Ninja .. -DLLVM_DIR=/path/to/llvm/lib/cmake/llvm
should probably be:
cmake -G Ninja .. -DLLVM_DIR=/path/to/llvm/lib/cmake/llvm -DLLVM_EXTERNAL_LIT=/path/to/llvm/lib/cmake/llvm/bin/llvm-lit
Otherwise the test check-enzyme later will fail with an error:
[0/1] Running enzyme regression tests
/bin/sh: line 1: <MY_BUILD_DIR>: Is a directory
Best,
Misha
There are instances in which we need a shadow pointer from a function for which we can create a combined forward/reverse pass (see #27). However, right now, if that is the case, we are conservative and fall back to individual forward/reverse passes. We should make sure that uses of the inverted pointer are moved to the right location when using the combined variant (and re-enable it in these cases).
Here is how I build Enzyme. How do I execute tests?
$ cd enzyme
$ mkdir build
$ cd build
$ cmake ..
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
LLVM_SHLIBEXT=.so
found llvm dir /home/ondrej/repos/Enzyme/enzyme/build
found llvm lit /home/ondrej/repos/Enzyme/enzyme/build
CMAKE_PREFIX_PATH /home/ondrej/repos/Enzyme/enzyme/build
-- Linker detection: GNU ld
found llvm include directory here: /usr/lib/llvm-6.0/include
found llvm definitions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS
found llvm version 6
first llvm include directory/usr/lib/llvm-6.0/include
found enzyme sources ActiveVariable.cppEnzyme.cppEnzymeLogic.cppFunctionUtils.cppGradientUtils.cppTypeAnalysis.cppUtils.cppSCEV/ScalarEvolutionExpander.cpp
-- Configuring done
-- Generating done
-- Build files have been written to: /home/ondrej/repos/Enzyme/enzyme/build
$ make -j4
Scanning dependencies of target intrinsics_gen
[ 0%] Built target intrinsics_gen
Scanning dependencies of target LLVMEnzyme-6
[ 11%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/ActiveVariable.cpp.o
[ 22%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/Enzyme.cpp.o
[ 33%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/EnzymeLogic.cpp.o
[ 44%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/FunctionUtils.cpp.o
/home/ondrej/repos/Enzyme/enzyme/Enzyme/ActiveVariable.cpp: In function ‘void addCallRemovingCycle(std::vector<llvm::CallInst*>&, llvm::CallInst*)’:
/home/ondrej/repos/Enzyme/enzyme/Enzyme/ActiveVariable.cpp:186:41: warning: comparison between signed and unsigned integer expressions [-Wsign-compare]
if (newtrace.size()-1-j == i) break;
~~~~~~~~~~~~~~~~~~~~^~~~
[ 55%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/GradientUtils.cpp.o
[ 66%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/TypeAnalysis.cpp.o
/home/ondrej/repos/Enzyme/enzyme/Enzyme/TypeAnalysis.cpp: In member function ‘ValueData ValueData::KeepForCast(const llvm::DataLayout&, llvm::Type*, llvm::Type*) const’:
/home/ondrej/repos/Enzyme/enzyme/Enzyme/TypeAnalysis.cpp:75:50: warning: comparison between signed and unsigned integer expressions [-Wsign-compare]
if (pair.first[0] != -1 && pair.first[0] < tosize) {
[ 77%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/Utils.cpp.o
[ 88%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/SCEV/ScalarEvolutionExpander.cpp.o
[100%] Linking CXX shared module LLVMEnzyme-6.so
[100%] Built target LLVMEnzyme-6
$ ctest
*********************************
No test configuration file found!
*********************************
Usage
ctest [options]
$ make check-enzyme-integration
[ 0%] Built target intrinsics_gen
[ 90%] Built target LLVMEnzyme-6
Scanning dependencies of target check-enzyme-integration
[100%] Running enzyme integration tests
/bin/sh: 1: ../../: Permission denied
test/Integration/CMakeFiles/check-enzyme-integration.dir/build.make:57: recipe for target 'test/Integration/CMakeFiles/check-enzyme-integration' failed
make[3]: *** [test/Integration/CMakeFiles/check-enzyme-integration] Error 126
CMakeFiles/Makefile2:296: recipe for target 'test/Integration/CMakeFiles/check-enzyme-integration.dir/all' failed
make[2]: *** [test/Integration/CMakeFiles/check-enzyme-integration.dir/all] Error 2
CMakeFiles/Makefile2:303: recipe for target 'test/Integration/CMakeFiles/check-enzyme-integration.dir/rule' failed
make[1]: *** [test/Integration/CMakeFiles/check-enzyme-integration.dir/rule] Error 2
Makefile:203: recipe for target 'check-enzyme-integration' failed
make: *** [check-enzyme-integration] Error 2
I am going to start work on the refined logic for Active/Inactive detection, as we discussed previously. As issues/questions come up (while we're working asynchronously), I'll write them here.
Hello,
I'm trying to differentiate code which uses some function pointers. I've used function pointers with Enzyme before and it worked, but in this example it segfaults (maybe because of the "Caching instruction" remark reported for fp[i](euler[i],R); see the output below).
When I manually unroll it works fine.
(Additionally I have extracted this code from a bigger routine where the code is differentiated twice and there the compiler fails to compile because it complains about a bad number of parameters passed to enzyme, that I will create a separate issue for if solving this issue doesn't fix it).
bugFunctionPointer.cpp
#include <iostream>
#include <math.h>
using namespace std;
extern int enzyme_dup;
extern int enzyme_dupnoneed;
extern int enzyme_out;
extern int enzyme_const;
void __enzyme_autodiff(...);
// Copies the first n entries of v into out, one element at a time.
// Nothing is written when n is zero or negative.
inline void assign( double*__restrict__ v, double*__restrict__ out, int n)
{
    int idx = 0;
    while( idx < n )
    {
        out[idx] = v[idx];
        ++idx;
    }
}
// Dense matrix-vector product out = A * v.
// A holds an n-by-m matrix with entry (i, j) stored at A[i*m + j];
// v supplies m entries and out receives n entries.
void matvprod( double*__restrict__ A, double * __restrict__ v, double* __restrict__ out, int n, int m )
{
    for( int row = 0 ; row < n ; ++row )
    {
        const int rowStart = row * m;   // index of the first entry of this row in A
        out[row] = 0.0;                 // reset, then accumulate directly in out
        for( int col = 0 ; col < m ; ++col )
        {
            out[row] += A[rowStart + col] * v[col];
        }
    }
}
// Fills out[0..8] with the nine entries, listed row by row, of
//   [ 1      0          0      ]
//   [ 0   cos(ang)   sin(ang)  ]
//   [ 0  -sin(ang)   cos(ang)  ]
void Rx( double ang, double * __restrict__ out)
{
    const double c = cos(ang);
    const double s = sin(ang);
    out[0] = 1.0; out[1] = 0.0; out[2] = 0.0;
    out[3] = 0.0; out[4] = c;   out[5] = s;
    out[6] = 0.0; out[7] = -s;  out[8] = c;
}
// Fills out[0..8] with the nine entries, listed row by row, of
//   [ cos(ang)  0  -sin(ang) ]
//   [    0      1      0     ]
//   [ sin(ang)  0   cos(ang) ]
void Ry( double ang, double * __restrict__ out)
{
    const double c = cos(ang);
    const double s = sin(ang);
    out[0] = c;   out[1] = 0.0; out[2] = -s;
    out[3] = 0.0; out[4] = 1.0; out[5] = 0.0;
    out[6] = s;   out[7] = 0.0; out[8] = c;
}
// Fills out[0..8] with the nine entries, listed row by row, of
//   [  cos(ang)  sin(ang)  0 ]
//   [ -sin(ang)  cos(ang)  0 ]
//   [     0         0      1 ]
void Rz( double ang, double * __restrict__ out)
{
    const double c = cos(ang);
    const double s = sin(ang);
    out[0] = c;   out[1] = s;   out[2] = 0.0;
    out[3] = -s;  out[4] = c;   out[5] = 0.0;
    out[6] = 0.0; out[7] = 0.0; out[8] = 1.0;
}
typedef void (*rotf)(double,double* __restrict__ );
// Applies three successive 3x3 matrix products to the vector v and leaves the
// result in out. Step i builds a matrix from euler[i] by calling fp[i]
// (Rz, then Ry, then Rx), multiplies the current out by it, and stores the
// product back into out.
//   euler : three angles, one per step
//   v     : input vector (3 entries), copied into out before the first step
//   out   : result vector (3 entries), overwritten
void eulerRotate( double* __restrict__ euler, double* __restrict__ v, double* __restrict__ out)
{
const int dim = 3;
// Matrix builders are selected through a function-pointer table; the report
// states that the indirect call fp[i] inside the loop is what triggers the crash.
rotf fp[] = {Rz,Ry,Rx};
// Start from out = v.
assign(v,out,dim);
// R holds the current 3x3 matrix; tempv holds the product before it is copied back.
double R[dim*dim]={0.0};
double tempv[dim]= {0.0};
//Manually unrolled works
/*
fp[0](euler[0],R);
matvprod(R,out,tempv,dim,dim );
assign(tempv,out,dim);
fp[1](euler[1],R);
matvprod(R,out,tempv,dim,dim );
assign(tempv,out,dim);
fp[2](euler[2],R);
matvprod(R,out,tempv,dim,dim );
assign(tempv,out,dim);
*/
for( int i = 0 ; i < dim ; i++)
{
//Works if we replace fp[i] by either Rx, Ry or Rz
// The printf calls trace progress so the failing step is visible in the output.
printf("i = %d before fp \n ", i);
// Build the i-th matrix into R.
fp[i](euler[i],R);
printf("i = %d before matvprod \n", i);
// tempv = R * out
matvprod(R,out,tempv,dim,dim );
printf("i = %d before assign \n", i);
// out = tempv
assign(tempv,out,dim);
}
}
// Runs eulerRotate once directly and once through __enzyme_autodiff, printing
// the output vector after each run and the accumulated deuler buffer at the end.
void testEulerRotate( )
{
    // Each argument is paired with a companion d* buffer handed to
    // __enzyme_autodiff via enzyme_dup; dout starts with 1 in its first entry.
    double euler[3]  = {1.0, 0.0, 0.0};
    double deuler[3] = {0.0, 0.0, 0.0};
    double v[3]      = {1.0, 1.0, 1.0};
    double dv[3]     = {0.0, 0.0, 0.0};
    double out[3]    = {0.0, 0.0, 0.0};
    double dout[3]   = {1.0, 0.0, 0.0};

    // Prints a label line followed by the three values of x on one line.
    const auto show = [](const char* label, const double* x)
    {
        cout << label << endl;
        cout << x[0] << " " << x[1] << " " << x[2] << endl;
    };

    eulerRotate(euler, v, out);
    cout << "forward pass work without enzyme" << endl;
    show("out ", out);

    cout << "with enzyme : " << endl;
    __enzyme_autodiff(eulerRotate,
                      enzyme_dup, euler, deuler,
                      enzyme_dup, v, dv,
                      enzyme_dup, out, dout);
    show("out ", out);
    show("deuler ", deuler);
}
// Entry point of the reproduction: prints the test name and runs the single
// test. argc and argv are not used.
int main(int argc, char** argv )
{
cout << "testEulerRotate" << endl;
testEulerRotate(); //SegFault in enzyme
return 0;
}
Compilation with :
clang bugFunctionPointer.cpp -lstdc++ -lm -fno-exceptions -Rpass=enzyme -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-12.so -O2 -o bugFunctionPointer
Output :
bugFunctionPointer.cpp:24:17: remark: Load may need caching %4 = load double, double* %arraydecay, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
out[i] += A[i*m+j]*v[j];
^
bugFunctionPointer.cpp:24:26: remark: Load may need caching %5 = load double, double* %out, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 due to store double %add12.i.2, double* %out, align 8, !dbg !45, !alias.scope !47 [-Rpass=enzyme]
out[i] += A[i*m+j]*v[j];
^
bugFunctionPointer.cpp:24:17: remark: Load may need caching %6 = load double, double* %arrayidx6.i.1, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
out[i] += A[i*m+j]*v[j];
^
bugFunctionPointer.cpp:24:26: remark: Load may need caching %7 = load double, double* %arrayidx8.i.1, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 due to store double %add12.i.2.1, double* %arrayidx8.i.1, align 8, !dbg !45, !alias.scope !47 [-Rpass=enzyme]
out[i] += A[i*m+j]*v[j];
^
bugFunctionPointer.cpp:24:17: remark: Load may need caching %8 = load double, double* %arrayidx6.i.2, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
out[i] += A[i*m+j]*v[j];
^
bugFunctionPointer.cpp:24:26: remark: Load may need caching %9 = load double, double* %arrayidx8.i.2, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 due to store double %add12.i.2.2, double* %arrayidx8.i.2, align 8, !dbg !45, !alias.scope !47 [-Rpass=enzyme]
out[i] += A[i*m+j]*v[j];
^
bugFunctionPointer.cpp:24:17: remark: Load may need caching %10 = load double, double* %arrayidx6.i.122, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
out[i] += A[i*m+j]*v[j];
^
bugFunctionPointer.cpp:24:17: remark: Load may need caching %11 = load double, double* %arrayidx6.i.1.1, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load may need caching %12 = load double, double* %arrayidx6.i.2.1, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load may need caching %13 = load double, double* %arrayidx6.i.225, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load may need caching %14 = load double, double* %arrayidx6.i.1.2, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load may need caching %15 = load double, double* %arrayidx6.i.2.2, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:26: remark: Load must be recomputed %9 = load double, double* %arrayidx8.i.2, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 in reverse_invertfor.body due to store double %add12.i.2.2, double* %arrayidx8.i.2, align 8, !dbg !45, !alias.scope !47 [-Rpass=enzyme]
out[i] += A[i*m+j]*v[j];
^
bugFunctionPointer.cpp:24:26: remark: Caching instruction %9 = load double, double* %arrayidx8.i.2, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed %15 = load double, double* %arrayidx6.i.2.2, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
out[i] += A[i*m+j]*v[j];
^
bugFunctionPointer.cpp:24:17: remark: Caching instruction %15 = load double, double* %arrayidx6.i.2.2, align 16, !dbg !17, !tbaa !30, !alias.scope !33, !noalias !39 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:26: remark: Load must be recomputed %7 = load double, double* %arrayidx8.i.1, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 in reverse_invertfor.body due to store double %add12.i.2.1, double* %arrayidx8.i.1, align 8, !dbg !45, !alias.scope !47 [-Rpass=enzyme]
out[i] += A[i*m+j]*v[j];
^
bugFunctionPointer.cpp:24:26: remark: Caching instruction %7 = load double, double* %arrayidx8.i.1, align 8, !dbg !42, !tbaa !31, !alias.scope !38, !noalias !43 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed %14 = load double, double* %arrayidx6.i.1.2, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
out[i] += A[i*m+j]*v[j];
^
bugFunctionPointer.cpp:24:17: remark: Caching instruction %14 = load double, double* %arrayidx6.i.1.2, align 8, !dbg !17, !tbaa !32, !alias.scope !35, !noalias !41 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:26: remark: Load must be recomputed %5 = load double, double* %out, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 in reverse_invertfor.body due to store double %add12.i.2, double* %out, align 8, !dbg !45, !alias.scope !47 [-Rpass=enzyme]
out[i] += A[i*m+j]*v[j];
^
bugFunctionPointer.cpp:24:26: remark: Caching instruction %5 = load double, double* %out, align 8, !dbg !44, !tbaa !33, !alias.scope !40, !noalias !45 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed %13 = load double, double* %arrayidx6.i.225, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
out[i] += A[i*m+j]*v[j];
^
bugFunctionPointer.cpp:24:17: remark: Caching instruction %13 = load double, double* %arrayidx6.i.225, align 16, !dbg !17, !tbaa !34, !alias.scope !37, !noalias !43 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed %12 = load double, double* %arrayidx6.i.2.1, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Caching instruction %12 = load double, double* %arrayidx6.i.2.1, align 8, !dbg !17, !tbaa !35, !alias.scope !38, !noalias !44 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed %11 = load double, double* %arrayidx6.i.1.1, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Caching instruction %11 = load double, double* %arrayidx6.i.1.1, align 16, !dbg !17, !tbaa !36, !alias.scope !39, !noalias !45 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed %10 = load double, double* %arrayidx6.i.122, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Caching instruction %10 = load double, double* %arrayidx6.i.122, align 8, !dbg !17, !tbaa !37, !alias.scope !40, !noalias !46 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed %8 = load double, double* %arrayidx6.i.2, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Caching instruction %8 = load double, double* %arrayidx6.i.2, align 16, !dbg !17, !tbaa !38, !alias.scope !41, !noalias !47 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed %6 = load double, double* %arrayidx6.i.1, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Caching instruction %6 = load double, double* %arrayidx6.i.1, align 8, !dbg !17, !tbaa !39, !alias.scope !42, !noalias !48 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed %4 = load double, double* %arraydecay, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Caching instruction %4 = load double, double* %arraydecay, align 16, !dbg !17, !tbaa !40, !alias.scope !43, !noalias !49 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:85:7: remark: Caching instruction %subcache = extractvalue { i8* } %_augmented, 0, !dbg !35 legalRecompute: 1 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
fp[i](euler[i],R);
^
testEulerRotate
i = 0 before fp
i = 0 before matvprod
i = 0 before assign
i = 1 before fp
i = 1 before matvprod
i = 1 before assign
i = 2 before fp
i = 2 before matvprod
i = 2 before assign
forward pass work without enzyme
out
1.38177 -0.301169 1
with enzyme :
i = 0 before fp
Segmentation fault (core dumped)
After we have preliminary support for forward mode, we might need to think about doing chunking.
@ChrisRackauckas had some references around challenges.
Hello,
I tried to use Eigen3 with Enzyme (not sure if it's planned to be supported), and even though it works in the simple cases, it seems to hang the compilation in some more advanced cases, such as matrix inversion, matrix-vector solve, and matrix exponentiation.
In https://enzyme.mit.edu/getting_started/CallingConvention/ you describe how to add some custom gradients, but it doesn't seem straight-forward to add them from c++.
For example for the adjoint of the inverse of the matrix
d K^-1/dp = - K^-1 * dK/dp * K^-1 (https://math.stackexchange.com/questions/1471825/derivative-of-the-inverse-of-a-matrix)
Which may be easier and faster to compute, and more numerically stable than the automatically derived one.
Can you please advise ?
Thanks
Here is my test file, where compilation hangs when some of the __enzyme_autodiff lines are present.
testmatrix.cpp
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <random>
#include <math.h>
#include <vector>
#include <algorithm>
#include <Eigen/Dense>
#include <unsupported/Eigen/MatrixFunctions>
using Eigen::MatrixXd;
using namespace std;
using namespace Eigen;
int enzyme_dup;
int enzyme_out;
int enzyme_const;
void __enzyme_autodiff(...);
// Returns the sum of the squared entries of a T-row column vector
// (the squared Euclidean norm; no square root is taken).
template<int T>
double normVector( const Matrix<double,T,1>& m )
{
    double sumSq = 0.0;
    for( int row = 0 ; row < m.rows() ; ++row )
    {
        sumSq += m(row,0) * m(row,0);
    }
    return sumSq;
}
// Returns the sum of the squared entries of a fixed-size T-by-T row-major
// matrix (the squared Frobenius norm; no square root is taken).
// Entries are visited row by row.
template<int T>
double normMatrix( const Matrix<double,T,T,RowMajor>& m )
{
    double sumSq = 0.0;
    for( int row = 0 ; row < m.rows() ; ++row )
    {
        for( int col = 0 ; col < m.cols() ; ++col )
        {
            sumSq += m(row,col) * m(row,col);
        }
    }
    return sumSq;
}
// Returns the sum of the squared entries of a runtime-sized matrix
// (the squared Frobenius norm; no square root is taken).
// Entries are visited row by row.
double normMatrixXd( const MatrixXd& m )
{
    double sumSq = 0.0;
    for( int row = 0 ; row < m.rows() ; ++row )
    {
        for( int col = 0 ; col < m.cols() ; ++col )
        {
            sumSq += m(row,col) * m(row,col);
        }
    }
    return sumSq;
}
template<int T>
double normInverseMatrix( const Matrix<double,T,T,RowMajor>& m )
{
return normMatrix<T>(m.inverse());
}
double normInverseMatrixXd(const MatrixXd& m )
{
MatrixXd inv = m.inverse();
return normMatrixXd(inv);
}
template<int T>
double normSolveMatrix( const Matrix<double,T,T,RowMajor>& m)
{
Matrix<double,T,1> v;
for( int i = 0 ; i < m.cols() ; i++)
{
v(i,0) = i;
}
Matrix<double,T,1> sol = m.fullPivLu().solve(v);
return normVector<T>(sol);
}
double normSolveMatrixXd( const MatrixXd& m)
{
MatrixXd v(m.cols(),1);
for( int i = 0 ; i < m.cols() ; i++)
{
v(i,0) = i;
}
return normMatrixXd(m.fullPivLu().solve(v));
}
double normExpMatrixXd( const MatrixXd& m)
{
return normMatrixXd(m.exp());
}
template<int T>
double normExpMatrix( const Matrix<double,T,T,RowMajor>& m )
{
return normMatrix<T>(m.exp());
}
// Fixed-size test driver: fills a T-by-T row-major matrix with
// m(i,j) = (i+j)*(i+j), zeroes a same-sized buffer dm, prints a few reference
// values, then calls __enzyme_autodiff on each norm* function with (&m, &dm)
// and prints dm once at the end.
template< int T>
void testMatrix()
{
    Matrix<double,T,T,RowMajor> m;
    Matrix<double,T,T,RowMajor> dm;
    for( int row = 0 ; row < T ; ++row )
    {
        for( int col = 0 ; col < T ; ++col )
        {
            const int s = row + col;
            m(row,col) = s * s;
            dm(row,col) = 0.0;
        }
    }

    // Reference values computed without Enzyme.
    std::cout <<"m : " << std::endl << m << std::endl;
    std::cout <<"m.inverse() : " << std::endl << m.inverse() << std::endl;
    std::cout << "normSolveMatrix "<< std::endl << normSolveMatrix<T>(m) << std::endl;
    std::cout << "normExpMatrix "<< std::endl << normExpMatrix(m) << std::endl;

    // All four calls share the same dm buffer.
    __enzyme_autodiff(normMatrix<T>, enzyme_dup, &m,&dm); // Works
    __enzyme_autodiff(normInverseMatrix<T>, enzyme_dup, &m,&dm);//Hangs compilation
    __enzyme_autodiff(normSolveMatrix<T>, enzyme_dup, &m,&dm);//Hangs compilation
    __enzyme_autodiff(normExpMatrix<T>, enzyme_dup, &m,&dm);//Hangs compilation
    std::cout << dm << std::endl;
}
// Runtime-sized test driver: fills a T-by-T MatrixXd with
// m(i,j) = (i+j)*(i+j), zeroes a same-sized buffer dm, prints a few reference
// values, then calls __enzyme_autodiff on each norm*Xd function with (&m, &dm)
// and prints dm once at the end.
//   T : number of rows and columns of the test matrix
void testMatrixXd( int T )
{
    MatrixXd m(T,T);
    for( int i = 0; i < T ; i++)
    {
        for( int j = 0 ; j < T ;j++)
        {
            m(i,j) = (i+j)*(i+j);
        }
    }
    MatrixXd dm(T,T);
    for( int i = 0; i < T ; i++)
    {
        for( int j = 0 ; j < T ;j++)
        {
            dm(i,j) = 0.0;
        }
    }
    std::cout <<"m : " << std::endl;
    std::cout << m << std::endl;
    std::cout <<"m.inverse() : " << std::endl;
    std::cout << m.inverse() << std::endl;
    // The norm*Xd helpers take `const MatrixXd&`, so the matrix itself is passed
    // here (passing its address does not convert to MatrixXd); only the
    // __enzyme_autodiff calls below receive addresses.
    std::cout << "normSolveMatrix "<< std::endl;
    std::cout << normSolveMatrixXd(m) << std::endl;
    std::cout << "normExpMatrix "<< std::endl;
    std::cout << normExpMatrixXd(m) << std::endl;
    // All four calls share the same dm buffer.
    __enzyme_autodiff(normMatrixXd, enzyme_dup, &m,&dm); // Works
    __enzyme_autodiff(normInverseMatrixXd, enzyme_dup, &m,&dm); //Hangs compilation
    __enzyme_autodiff(normSolveMatrixXd, enzyme_dup, &m,&dm); //Hangs compilation
    __enzyme_autodiff(normExpMatrixXd, enzyme_dup, &m,&dm); //Hangs compilation
    std::cout << dm << std::endl;
}
// Entry point: runs the fixed-size tests for sizes 3, 4 and 5, then the
// runtime-sized tests for the same sizes.
int main()
{
testMatrix<3>();
testMatrix<4>();
testMatrix<5>(); //There are no more formulas for matrix inversion in eigen when n = 5
testMatrixXd(3);
testMatrixXd(4);
testMatrixXd(5);
return 0;
}
Compilation (provided that you have Eigen3 installed, e.g. on Ubuntu via apt-get install libeigen3-dev):
clang testmatrix.cpp -I/usr/include/eigen3/ -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o testMatrix -fno-exceptions
I also tried adding the flag -mllvm -enzyme-max-type-offset=20, which helped in the past when compilation was hanging, but it had no effect this time.
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.