enzymead / enzyme Goto Github PK

High-performance automatic differentiation of LLVM and MLIR.

License: Other

CMake 0.46% C++ 39.79% Python 0.11% C 1.84% LLVM 56.70% Shell 0.04% Makefile 0.08% Julia 0.05% MLIR 0.70% Fortran 0.04% Starlark 0.18%

ad automatic-differentiation c clang compiler cpp deep-learning derivative differentiable-programming enzyme gradient high-performance llvm llvm-enzyme machine-learning pytorch rust scientific-computing simulation tensorflow

enzyme's People

Contributors

Stargazers

Watchers

Forkers

sailfish009 tiberiusferreira zeta1999 hanblee bytesnake silky liudyboy jxzhangjhu tubbz-alt wenming2014 proteneer stjordanis lw00245 dendisuhubdy femtomc iomeone saqibmamoon mbrukman emperoryp7 carlocab reikdas ewenwan hchrobo grimmmyshini ludgerpaehler syaikhipin chrislupp vchuravy jaydown windborne anandijain cookieli ai-and-ml un110076 nestordemeure sfu-arch ebarsoum deep-learning-engineering vguerra tgymnich anirudhacharya aneax tide999 leticia-maria ahmedshakill theo-lw igeorge0503 ryanstoner1 leilaghaffari nirhar rayegun thewilsonator swilliamson7 jerin-thirdai umatin rnaimehaom enzymead ofekshochat yuenxq the-spellchecker miladhakimi ris-bali vtjnash chenglong92 stepasite roastduck motabbara pragmatwice tthsqe12 topazus williamjameshandley clasp-developers avhz ianna samuelpmishllnl erick-xanadu igor-stoppa milescranmer gaurav-arya gmh5225 hongqing-work martinjm97 jgreener64 maxaehle rmoyard devmotion garvitgupta08 skewballfox matinraayai brugarolas

enzyme's Issues

Quadratic memory usage (mk iii)

Hello,

The following instruction
F[indj*d+l] += wjk * parts[indk*d+l];

makes the code need quadratic memory in the backward pass.

#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <random>
#include <math.h>
#include <vector>
#include <algorithm>


using namespace std;


// Cell index over a particle set: particles are bucketed by an integer cell id
// and every bucket ("cell") is a contiguous run inside `argsorted`.
// All arrays are allocated by buildIndex with `new[]`; nothing here frees them.
struct Index
{
    int* cellId;     // cellId[i]: cell id of particle i (n entries)
    int* start;      // start[c]: offset in argsorted where cell c begins (first `size` entries valid)
    int* cellSize;   // cellSize[c]: number of particles in cell c (first `size` entries valid)
    int size;        // number of distinct cells
    int* argsorted;  // particle indices ordered by (cell id, particle index)
    int n;           // number of particles
} ;


// Builds `index` for `n` particles stored as consecutive coordinate triples in
// `parts`. The cell id of particle i is its first coordinate truncated to int.
//
// Any pointers previously held by `index` are overwritten without being freed.
//
// A new cell is detected by comparing each sorted entry with its predecessor
// rather than with a -1 sentinel: with the sentinel, a genuine cell id of -1
// (e.g. a coordinate of -1.5) on the first entry was treated as "same cell as
// before" and incremented cellSize[-1], writing out of bounds.
void buildIndex( Index& index , double * parts, int n )
{
    const int d = 3;
    index.n = n;
    index.cellId = new int[n];
    index.start = new int[n];
    index.cellSize = new int[n]; //Max Size is n but the end may be unused
    index.argsorted = new int[n];

    // (cell id, particle index) pairs; sorting them groups particles by cell
    // and keeps particle indices ascending inside each cell.
    std::vector<std::pair<int,int> > v(n);
    for( int i = 0 ; i < n ; i++)
    {
        const int id = static_cast<int>( parts[d*i] );
        index.cellId[i] = id;
        v[i].first = id;
        v[i].second = i;
    }

    std::sort( v.begin(), v.end() );

    int cur = -1; // index of the cell currently being filled; -1 until the first entry
    for( int i = 0 ; i < n ; i++)
    {
        index.argsorted[i] = v[i].second;
        if( i > 0 && v[i].first == v[i-1].first )
        {
            index.cellSize[cur]++;
        }
        else
        {
            cur ++;
            index.cellSize[cur] = 1;
            index.start[cur] = i;
        }
    }
    index.size = cur+1;

}


// Kernel handed to __enzyme_autodiff in main (reproducer for the reported
// memory usage in the backward pass).
//
// parts : coordinates, read as triples parts[i*d + l] with d == 3
// n     : number of particles; sizes the local arrays F (n*d) and W (n)
// index : cell index; for each cell i, the particles are
//         argsorted[start[i] .. start[i] + cellSize[i])
//
// For every ordered pair (j, k) of particles in the same cell it computes the
// squared distance djk and accumulates wjk * parts[k] into F[j].
//
// Returns `out`, which is never modified by the active code (every statement
// that would update it is commented out), so the function returns 0.
//
// NOTE: F and W are variable-length arrays, a compiler extension in C++.
double foo( double* __restrict__ parts,int n, Index* __restrict__ index)
{
     double out = 0;
     const int d = 3;

     // Per-particle accumulator, d components each.
     double F[n*d];

     // Per-particle weight sum; zeroed below but otherwise unused in the active code.
     double W[n];

     for( int i = 0 ; i < n ; i++)
     {
         for( int j = 0 ; j < d ; j++)
         {
             F[i*d+j] = 0.0;
         }
         W[i] = 0.0;
     }

     // Loop over cells, then over all ordered pairs (j, k) inside the cell.
     for( int i = 0 ; i < index->size ; i++)
     {
         for( int j = 0 ; j < index->cellSize[i] ; j++ )
         {
             for( int k = 0 ; k < index->cellSize[i] ; k++ )
             {
                 // Translate in-cell positions to particle indices.
                 int indj = index->argsorted[index->start[i]+j];
                 int indk = index->argsorted[index->start[i]+k];

                 // Squared Euclidean distance between particles indj and indk.
                 double djk = 0;
                 for( int l = 0 ; l < d ; l++)
                 {
                     double temp;
                     temp = parts[indj * d +l ]- parts[indk * d +l ];
                     djk += temp*temp;
                 }
                 //out += djk;

                 double wjk = 1.0+djk; // strictly positive

                 // This is the statement the report singles out.
                 for( int l = 0 ; l < d ; l++)
                 {
                     F[indj*d+l] += wjk * parts[indk*d+l];
                 }

                 //W[indj] += wjk;

            }
         }
     }

     /*
    //Normalize the field value
    for( int i = 0 ; i < n ; i++)
    {
        for( int j = 0 ; j < d ; j++)
        {
            F[i*d+j] /= W[i*d+j];
        }
    }
*/
/*
    //Compute the energy
    for( int i = 0 ; i < n ; i++)
    {
        double e = 0.0;
        for( int j = 0 ; j < d ; j++)
        {
            out += F[i*d+j]*F[i*d+j];
        }
    }
*/

     //delete[] F;
     //delete[] W;

     return out;
}


// Tag globals: in main, one of these is passed in front of each real argument
// of __enzyme_autodiff.
// NOTE(review): presumably Enzyme's activity annotations (duplicated / output /
// constant) — confirm against the Enzyme documentation.
int enzyme_dup;
int enzyme_out;
int enzyme_const;

// Pointer type matching the signature of foo.
typedef double (*f_ptr)(double *,int,Index*);

// Declaration only; no definition is visible in this snippet.
// Argument layout as used in main: function, then (tag, x, d_x), (tag, n),
// (tag, index).
extern double __enzyme_autodiff(f_ptr,
    int, double *, double *,
    int, int,
    int, Index*);


// Driver for the reproducer: fills 100000 particles with pseudo-random
// coordinates in [0, 10), builds the cell index, prints the first 100 cell ids,
// runs __enzyme_autodiff on foo and prints the first 100 entries of d_x.
int main()
{
    const int particleCount = 100000;
    const int dim = 3;
    const int coordCount = particleCount * dim;

    std::mt19937 rng(42);
    std::uniform_real_distribution<> coordDist(0, 10);

    // x holds the coordinates, d_x is the zero-initialised buffer passed alongside it.
    double* x = new double[coordCount];
    double* d_x = new double[coordCount];
    int filled = 0;
    while( filled < coordCount )
    {
        x[filled] = coordDist(rng);
        d_x[filled] = 0.0;
        ++filled;
    }

    Index index;
    buildIndex(index, x, particleCount);

    const int shown = 100;
    for( int p = 0 ; p < shown ; ++p )
        printf("cellId[%d] = %d\n ", p, index.cellId[p]);

    printf("before autodiff\n");
    __enzyme_autodiff(foo,
        enzyme_dup, x, d_x,
        enzyme_const, particleCount,
        enzyme_const, &index);

    //printf("%f \n", y);
    for( int p = 0 ; p < shown ; ++p )
    {
        const double* grad = d_x + dim * p;
        printf("dx[%d] = [%f, %f, %f]\n ", p, grad[0], grad[1], grad[2]);
    }

    return 0;
}

Compiled with :
clang test2.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-7.so -O2 -o test2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -fno-exceptions

First feedback. Examples for AD community?

Hi,

I cloned, built and played around with Enzyme and I am very happy so far! :)
I want to point out that enzyme is not only calculating gradients, but also implements the adjoint semantics (this is great news for me!).

So do you have plans to add examples for the (AD) community?

I don't want to create a pull request yet, so here is my first quick and dirty test (modified the sumAndMul function to become a speelpenning(ish) function with 2 outputs).

put the attached files into $(ENZYME_ROOT)/ad/test, calling
$make

results in correct input adjoints when calling dF 3 times with (1,0), (0,1) and (1,1) as the output adjoints (d_mul and d_out2).

./output.exe
d(output)/darray[0] = 24.000000
d(output)/darray[1] = 12.000000
d(output)/darray[2] = 8.000000
d(output)/darray[3] = 6.000000

d(output)/darray[0] = 48.000000
d(output)/darray[1] = 24.000000
d(output)/darray[2] = 16.000000
d(output)/darray[3] = 12.000000

d(output)/darray[0] = 72.000000
d(output)/darray[1] = 36.000000
d(output)/darray[2] = 24.000000
d(output)/darray[3] = 18.000000

Very nice!
first_example.zip

Compilation crash : Geometric Algebra : cannot compute with global variable that doesn't have " "marked shadow global"' failed.

I was not expecting it to work, and even if it works I'm still not sure I'd be using it, as I'll probably have to rewrite it anyway to make sure the derivative of the exponential form is properly handled.

I tried to use http://versor.mat.ucsb.edu/ which is a header-only geometric algebra (aka Clifford algebra) library. It's basically an extension of the complex numbers, called multivectors which you can use to represent things like rotations for some mathematical spaces.

I tried to differentiate through it, and the compiler crashed with some issue related to global variable missing some attribute.

@_ZN3vsr11MultivectorINS_7algebraINS_6metricILi3ELi0ELi0ELb0EEEfEENS_5BasisIJLs3ELs5ELs6EEEEE2xyE = linkonce_odr dso_local global %"struct.vsr::Multivector.0" zeroinitializer, comdat, align 4
clang-11: ../Enzyme/GradientUtils.cpp:1909: llvm::Value* GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<>&): Assertion `0 && "cannot compute with global variable that doesn't have " "marked shadow global"' failed.

I don't know how the versor library works internally. There is probably some basis (x, y, z) to represent Euclidean spaces, from which they derive a basis to represent the multivectors: (1, x, y, z, x^y, x^z, y^z, x^y^z). Some of these are probably declared globally because they are taken as the reference, i.e. as constants.

Usually there are formulas that allow changing basis, which is just a matrix multiplication.

I'm not quite sure what makes sense to do here: either consider those global basis elements constant (i.e. their derivative is zero), or create a shadow for them which the user would later ignore. (Both can probably make sense, depending on what you are working on.)

It's probably related to issue #60

testVersor.cpp

#include <stdio.h>
#include <iostream>
#include <stdlib.h>

#include <math.h>

#include "vsr/vsr.h"

using namespace vsr;
using namespace vsr::nga;


// Short aliases for the two versor types used by this reproducer.
using Vec = vsr::euclidean_vector<3,float>;        //<-- A 3 dimensional euclidean vector defined over floats
using Biv = vsr::euclidean_bivector<3,float>;   //<-- A 3 dimensional bivector or "directed area element"


// Tag globals referenced by the (commented-out) __enzyme_autodiff call in testversor.
// NOTE(review): presumably Enzyme activity annotations — confirm against the Enzyme docs.
int enzyme_dup;
int enzyme_out;
int enzyme_const;

// Declaration only; no definition is visible in this snippet.
void __enzyme_autodiff(...);

//-L/home/darkblue/versor/build/lib -lvsr
// Returns the square of v2.norm(), where v2 = v.rotate( Biv::xy * .25 ).
// This is the function named in the (commented-out) __enzyme_autodiff call in
// testversor.
// NOTE(review): Biv::xy is presumably the global named in the reported
// assertion message — confirm against the mangled symbol.
double multivectorNorm( const Vec& v)
{
  Vec v2 = v.rotate( Biv::xy * .25 );
  double n = v2.norm();
  return n*n;
}

// Smoke test: builds a vector, rotates it, prints v.norm(), v2.norm() and
// multivectorNorm(v), then prints dv.
// The __enzyme_autodiff call is commented out here, so dv is printed exactly as
// constructed, Vec(0,0,0).
void testversor()
{
  Vec v = Vec(1,2,3);                    //<-- A 3D vector at coordinates 1,2,3;
  //v.print();

  //v.rotate( Biv::xy * .25 ).print();      //<-- Rotate the vector in the xy plane and print result
  Vec v2 = v.rotate( Biv::xy * .25 );
  // NOTE(review): the labels say "norm2" but the values printed are norm(),
  // not its square — confirm which was intended.
  double n = v.norm();
  cout <<"norm2 v " << n << endl;

  cout <<"norm2 v2 " << v2.norm() << endl;

  cout << "multivectorNorm " << multivectorNorm(v) << endl;

  // Buffer passed next to &v in the commented-out autodiff call below.
  Vec dv = Vec(0,0,0);
  //__enzyme_autodiff( &multivectorNorm,enzyme_dup, &v,&dv);

  cout <<"dv "<< endl;
  dv.print();

  cout<<"testversor done" << endl;
}

// Entry point: runs the versor smoke test and exits with status 0.
int main()
{
  testversor();
  return 0;
}

Compilation with :
clang testVersor.cpp -Ipathto/versor/include/ -DVSR_PRECISION_DOUBLE -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o testVersor -fno-exceptions

The Full error stack is 3000 lines long.
Begins with :
cannot shadow-inline global @_ZN3vsr11MultivectorINS_7algebraINS_6metricILi3ELi0ELi0ELb0EEEfEENS_5BasisIJLs3ELs5ELs6EEEEE2xyE = linkonce_odr dso_local global %"struct.vsr::Multivector.0" zeroinitializer, comdat, align 4 due to %call1 = call { <2 x float>, float } @_ZNK3vsr11MultivectorINS_7algebraINS_6metricILi3ELi0ELi0ELb0EEEfEENS_5BasisIJLs1ELs2ELs4EEEEE6rotateINS5_IJLs3ELs5ELs6EEEEEES7_RKNS0_IS4_T_EE(%"struct.vsr::Multivector"* nonnull %v, %"struct.vsr::Multivector.0"* nonnull align 4 dereferenceable(12) %ref.tmp)``

and ends with :

@_ZN3vsr11MultivectorINS_7algebraINS_6metricILi3ELi0ELi0ELb0EEEfEENS_5BasisIJLs3ELs5ELs6EEEEE2xyE = linkonce_odr dso_local global %"struct.vsr::Multivector.0" zeroinitializer, comdat, align 4
clang-11: ../Enzyme/GradientUtils.cpp:1909: llvm::Value* GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<>&): Assertion `0 && "cannot compute with global variable that doesn't have " "marked shadow global"' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0.	Program arguments: /home/username/usrlocal/bin/clang-11 -cc1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -main-file-name testVersor.cpp -mrelocation-model static -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /home/username/usrlocal/lib/clang/11.1.0 -I /home/username/versor/include/ -D VSR_PRECISION_DOUBLE -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /home/username/usrlocal/lib/clang/11.1.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -fdeprecated-macro -fdebug-compilation-dir /home/username/testenzyme -ferror-limit 19 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -load /usr/local/lib/ClangEnzyme-11.so -faddrsig -o /tmp/testVersor-7a3fdd.o -x c++ testVersor.cpp 
1.	<eof> parser at end of file
2.	Per-module optimization passes
3.	Running pass 'Enzyme Pass' on module 'testVersor.cpp'.
 #0 0x0000563b94cf498a llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/home/username/usrlocal/bin/clang-11+0x1d0f98a)
 #1 0x0000563b94cf2654 llvm::sys::RunSignalHandlers() (/home/username/usrlocal/bin/clang-11+0x1d0d654)
 #2 0x0000563b94cf27a3 SignalHandler(int) (/home/username/usrlocal/bin/clang-11+0x1d0d7a3)
 #3 0x00007f4f46611980 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12980)
 #4 0x00007f4f452c2fb7 raise /build/glibc-S9d2JN/glibc-2.27/signal/../sysdeps/unix/sysv/linux/raise.c:51:0
 #5 0x00007f4f452c4921 abort /build/glibc-S9d2JN/glibc-2.27/stdlib/abort.c:81:0
 #6 0x00007f4f452b448a __assert_fail_base /build/glibc-S9d2JN/glibc-2.27/assert/assert.c:89:0
 #7 0x00007f4f452b4502 (/lib/x86_64-linux-gnu/libc.so.6+0x30502)
 #8 0x00007f4f44e784fc GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/usr/local/lib/ClangEnzyme-11.so+0x5284fc)
 #9 0x00007f4f44e79228 GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&) (/usr/local/lib/ClangEnzyme-11.so+0x529228)
#10 0x00007f4f44db10a0 DiffeGradientUtils::addToInvertedPtrDiffe(llvm::Value*, llvm::Value*, llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>&, llvm::MaybeAlign) (/usr/local/lib/ClangEnzyme-11.so+0x4610a0)
#11 0x00007f4f44df5a78 AdjointGenerator<AugmentedReturn const*>::visitLoadInst(llvm::LoadInst&) (/usr/local/lib/ClangEnzyme-11.so+0x4a5a78)
#12 0x00007f4f44dd6e23 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visitLoad(llvm::LoadInst&) (/usr/local/lib/ClangEnzyme-11.so+0x486e23)
#13 0x00007f4f44dc9309 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction&) (/usr/local/lib/ClangEnzyme-11.so+0x479309)
#14 0x00007f4f44dbde3d llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction*) (/usr/local/lib/ClangEnzyme-11.so+0x46de3d)
#15 0x00007f4f44d9fae3 CreatePrimalAndGradient(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, bool, bool, llvm::Type*, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, AugmentedReturn const*, bool, bool, bool) (/usr/local/lib/ClangEnzyme-11.so+0x44fae3)
#16 0x00007f4f44d79378 bool (anonymous namespace)::Enzyme::HandleAutoDiff<llvm::CallInst>(llvm::CallInst*, llvm::TargetLibraryInfo&, llvm::AAResults&, bool) (/usr/local/lib/ClangEnzyme-11.so+0x429378)
#17 0x00007f4f44d754d2 (anonymous namespace)::Enzyme::lowerEnzymeCalls(llvm::Function&, bool, bool&, std::set<llvm::Function*, std::less<llvm::Function*>, std::allocator<llvm::Function*> >&) (/usr/local/lib/ClangEnzyme-11.so+0x4254d2)
#18 0x00007f4f44d75dfb (anonymous namespace)::Enzyme::runOnModule(llvm::Module&) (/usr/local/lib/ClangEnzyme-11.so+0x425dfb)
#19 0x0000563b946b4a81 llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/username/usrlocal/bin/clang-11+0x16cfa81)
#20 0x0000563b94f74c54 (anonymous namespace)::EmitAssemblyHelper::EmitAssembly(clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/username/usrlocal/bin/clang-11+0x1f8fc54)
#21 0x0000563b94f766f4 clang::EmitBackendOutput(clang::DiagnosticsEngine&, clang::HeaderSearchOptions const&, clang::CodeGenOptions const&, clang::TargetOptions const&, clang::LangOptions const&, llvm::DataLayout const&, llvm::Module*, clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/username/usrlocal/bin/clang-11+0x1f916f4)
#22 0x0000563b95b1edf5 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/home/username/usrlocal/bin/clang-11+0x2b39df5)
#23 0x0000563b966a2fe9 clang::ParseAST(clang::Sema&, bool, bool) (/home/username/usrlocal/bin/clang-11+0x36bdfe9)
#24 0x0000563b95b1efa8 clang::CodeGenAction::ExecuteAction() (/home/username/usrlocal/bin/clang-11+0x2b39fa8)
#25 0x0000563b95506d39 clang::FrontendAction::Execute() (/home/username/usrlocal/bin/clang-11+0x2521d39)
#26 0x0000563b954c167a clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/home/username/usrlocal/bin/clang-11+0x24dc67a)
#27 0x0000563b955d1486 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/home/username/usrlocal/bin/clang-11+0x25ec486)
#28 0x0000563b93bf20fc cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/home/username/usrlocal/bin/clang-11+0xc0d0fc)
#29 0x0000563b93bed479 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) (/home/username/usrlocal/bin/clang-11+0xc08479)
#30 0x0000563b93b70fd4 main (/home/username/usrlocal/bin/clang-11+0xb8bfd4)
#31 0x00007f4f452a5bf7 __libc_start_main /build/glibc-S9d2JN/glibc-2.27/csu/../csu/libc-start.c:344:0
#32 0x0000563b93becc4a _start (/home/username/usrlocal/bin/clang-11+0xc07c4a)
clang-11: error: unable to execute command: Aborted (core dumped)
clang-11: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 11.1.0
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/local/bin
clang-11: note: diagnostic msg:

Can Enzyme be used in TACO?

https://github.com/tensor-compiler/taco
I want to add automatic differentiation to TACO's tensor computations.
Is it possible?

Making a tag

It takes a while to check out this repo, which is quite large. Uncompressed the repo size is 2 GB, which makes it a bit slow to build dockerfiles with. To cut down on the initial download for users, it would be nice to distribute tarballs with either

the full working tree (but no .git) (273 MB).
the enzyme subdirectory (161 MB).
the enzyme subdirectory but no benchmark data (748 kb)

This last option is orders of magnitude smaller, so it would be awesome. With a small configuration change, this GitHub repo can produce such an archive for every git tag. Just add this to the .gitattributes:

/LICENSE export-ignore
/Readme.md export-ignore
/clang export-ignore
/clang-tools-extra export-ignore
/compiler-rt export-ignore
/contrib export-ignore
/debuginfo-tests export-ignore
/libcxx export-ignore
/libcxxabi export-ignore
/libunwind export-ignore
/lld export-ignore
/lldb export-ignore
/llvm export-ignore
/openmp export-ignore
/polly export-ignore
/tests export-ignore
/enzyme/benchmarks export-ignore

and push a tag to github

git tag v0.1.0
git push --tags origin

The benchmarks build would also need to be made an optional part of the CMakeLists for this to work.

Feasibility of combining Numba and Enzyme

Because they're both based on LLVM, I was wondering what would be involved in integrating Numba with Enzyme, so that I could decorate a python function and get (optimized) gradients. I poked around the documentation a little but didn't see anything relevant.

Documentation / Tutorial for integrating Enzyme into a Frontend

We should add some information on this to the website

Compilation hangs when sorting

Hello,

I'm trying to sort a vector of integers inside the function I want to differentiate.
It should be a no operation in the backward pass.
But the compilation hangs.

Can you advise on the proper way to tell Enzyme to just ignore some variables or functions?
Thanks

#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <random>
#include <math.h>
#include <vector>
#include <algorithm>

using namespace std;

// Kernel handed to __enzyme_autodiff in main (reproducer for the compile hang).
//
// Copies the n entries of cellId into a local vector and sorts it; the sorted
// data is not used afterwards. The return value is parts[0] and does not
// depend on the sort.
double foo( double* __restrict__ parts,int n, int* cellId)
 {
    // Integer-only scratch copy of cellId.
    vector< int > sorted(n);
    for( int i = 0 ; i < n ; i++)
    {
        sorted[i] = cellId[i];
    }

    // The report attributes the hang to this call.
    sort( sorted.begin(),sorted.end());

    double out = parts[0];

    return out;
}


// Tag globals: in main, one of these is passed in front of each real argument
// of __enzyme_autodiff.
// NOTE(review): presumably Enzyme's activity annotations — confirm against the
// Enzyme documentation.
int enzyme_dup;
int enzyme_out;
int enzyme_const;

// Pointer type matching the signature of foo.
typedef double (*f_ptr)(double *,int,int*);

// Declaration only; no definition is visible in this snippet.
// Argument layout as used in main: function, then (tag, x, d_x), (tag, n),
// (tag, cellId).
extern double __enzyme_autodiff(f_ptr,
    int, double *, double *,
    int, int,
    int, int*);


// Driver for the reproducer: fills x with pseudo-random values in [0, 10),
// zeroes d_x, sets cellId[i] = i, runs __enzyme_autodiff on foo and prints the
// first 100 entries of d_x.
int main()
{
    srand(42);

    const int count = 100000;
    const int dim = 3;
    const int total = count * dim;

    std::mt19937 rng(42);
    std::uniform_real_distribution<> coordDist(0, 10);

    double* x = new double[total];
    double* d_x = new double[total];
    for( int k = 0 ; k != total ; ++k )
    {
        x[k] = coordDist(rng);
        d_x[k] = 0.0;
    }

    // Identity mapping, so the data given to the sort in foo is already ordered.
    int* cellId = new int[count];
    for( int k = 0 ; k != count ; ++k )
        cellId[k] = k;

    printf("before autodiff\n");
    __enzyme_autodiff(foo,
        enzyme_dup, x, d_x,
        enzyme_const, count,
        enzyme_const, cellId);

    //printf("%f \n", y);
    const int shown = 100;
    int row = 0;
    while( row < shown )
    {
        printf("dx[%d] = %f\n", row, d_x[row]);
        ++row;
    }

    return 0;
}

Compilation command :
clang test2.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-7.so -O2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o test2

Investigate Potential Memory Leaks

There have been some updates to the memory behavior (specifically user-provided mallocs inside code being differentiated). We should investigate and ensure that these and relevant shadows do not memory leak.

The cache does not have a potential leak, as it maintains the property that every cache allocation is freed at the corresponding location in the reverse pass.

Compiler crash with cannot deal with ptr that isn't arg

Error: opt: /efs/home/tfk/Enzyme-plugin/enzyme/Enzyme/Enzyme.cpp:2700: llvm::Value* GradientUtils::invertPointerM(llvm::Value*, llvm::IRBuilder<>&): Assertion `0 && "cannot find deal with ptr that isnt arg"' failed.

; ModuleID = 'segfault.c'
source_filename = "segfault.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@.str = private unnamed_addr constant [36 x i8] c"hello! %f, res2 %f, da: %f, db: %f\0A\00", align 1

; man_max(a, b): loads *a and *b and returns *a when *a > *b (fcmp ogt),
; otherwise *b. Unoptimised (optnone) output: arguments are spilled to allocas
; and the result travels through the %retval slot.
; Function Attrs: noinline nounwind optnone uwtable
define dso_local float @man_max(float* %a, float* %b) #0 {
entry:
  %retval = alloca float, align 4
  %a.addr = alloca float*, align 8
  %b.addr = alloca float*, align 8
  store float* %a, float** %a.addr, align 8
  store float* %b, float** %b.addr, align 8
  ; %1 = *a, %3 = *b
  %0 = load float*, float** %a.addr, align 8
  %1 = load float, float* %0, align 4
  %2 = load float*, float** %b.addr, align 8
  %3 = load float, float* %2, align 4
  %cmp = fcmp ogt float %1, %3
  br i1 %cmp, label %if.then, label %if.else

; *a > *b: result is *a (reloaded through %a.addr)
if.then:                                          ; preds = %entry
  %4 = load float*, float** %a.addr, align 8
  %5 = load float, float* %4, align 4
  store float %5, float* %retval, align 4
  br label %return

; otherwise: result is *b (reloaded through %b.addr)
if.else:                                          ; preds = %entry
  %6 = load float*, float** %b.addr, align 8
  %7 = load float, float* %6, align 4
  store float %7, float* %retval, align 4
  br label %return

return:                                           ; preds = %if.else, %if.then
  %8 = load float, float* %retval, align 4
  ret float %8
}

; compute_max(a, b, ret): stores man_max(a, b) into *ret.
; This is the function passed to __enzyme_autodiff.f64 in @main.
; Function Attrs: noinline nounwind optnone uwtable
define dso_local void @compute_max(float* %a, float* %b, float* %ret) #0 {
entry:
  %a.addr = alloca float*, align 8
  %b.addr = alloca float*, align 8
  %ret.addr = alloca float*, align 8
  store float* %a, float** %a.addr, align 8
  store float* %b, float** %b.addr, align 8
  store float* %ret, float** %ret.addr, align 8
  ; The pointers handed to man_max are reloaded from the spill slots, not the
  ; raw arguments.
  %0 = load float*, float** %a.addr, align 8
  %1 = load float*, float** %b.addr, align 8
  %call = call float @man_max(float* %0, float* %1)
  %2 = load float*, float** %ret.addr, align 8
  store float %call, float* %2, align 4
  ret void
}

; main: sets a = 2.0, b = 3.0, da = db = 0.0, ret = 0.0, dret = 1.0, calls
; compute_max once directly, then calls __enzyme_autodiff.f64 with
; (compute_max, a, da, b, db, ret, dret) and prints ret (twice), da and db.
; Function Attrs: noinline norecurse optnone uwtable
define dso_local i32 @main(i32 %argc, i8** %argv) #1 {
entry:
  %retval = alloca i32, align 4
  %argc.addr = alloca i32, align 4
  %argv.addr = alloca i8**, align 8
  %a = alloca float, align 4
  %b = alloca float, align 4
  %da = alloca float, align 4
  %db = alloca float, align 4
  %ret = alloca float, align 4
  %dret = alloca float, align 4
  store i32 0, i32* %retval, align 4
  store i32 %argc, i32* %argc.addr, align 4
  store i8** %argv, i8*** %argv.addr, align 8
  store float 2.000000e+00, float* %a, align 4
  store float 3.000000e+00, float* %b, align 4
  store float 0.000000e+00, float* %da, align 4
  store float 0.000000e+00, float* %db, align 4
  store float 0.000000e+00, float* %ret, align 4
  store float 1.000000e+00, float* %dret, align 4
  call void @compute_max(float* %a, float* %b, float* %ret)
  ; Every pointer argument is an alloca of this function paired with a second
  ; alloca (da, db, dret); the double return value %0 is unused.
  %0 = call double (...) @__enzyme_autodiff.f64(void (float*, float*, float*)* @compute_max, float* %a, float* %da, float* %b, float* %db, float* %ret, float* %dret)
  ; Both the first and second printf values are loaded from %ret.
  %1 = load float, float* %ret, align 4
  %conv = fpext float %1 to double
  %2 = load float, float* %ret, align 4
  %conv1 = fpext float %2 to double
  %3 = load float, float* %da, align 4
  %conv2 = fpext float %3 to double
  %4 = load float, float* %db, align 4
  %conv3 = fpext float %4 to double
  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str, i32 0, i32 0), double %conv, double %conv1, double %conv2, double %conv3)
  ret i32 0
}

declare double @__enzyme_autodiff.f64(...)

declare dso_local i32 @printf(i8*, ...) #2

attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 7.1.0 "}

Excessive Caching in Neural Network

Hello,

I'm trying to build a proto-neural-network with Enzyme, i.e. two successive matrix-vector products.
I tried to keep the code as simple and minimalist as possible.

The code runs fine but when I pass -Rpass=enzyme it indicates that it's caching and recomputing whereas it shouldn't need any memory allocation, as I'm preallocating the intermediate buffers, nor recomputation as I'm preserving the intermediate layers.

I have put restrict everywhere I can, but what am I doing wrong ?

Thanks

bugDense.cpp

#include <iostream>
using namespace std;

// Tag globals passed in front of each argument group of __enzyme_autodiff in
// testFun2. Declared extern here; no definition is visible in this snippet.
// NOTE(review): presumably Enzyme's activity annotations — confirm against the
// Enzyme documentation.
extern int enzyme_dup;
extern int enzyme_dupnoneed;
extern int enzyme_out;
extern int enzyme_const;

// Declaration only; no definition is visible in this snippet.
void __enzyme_autodiff(...);

// Sets the first n entries of v to 0.0; does nothing when n <= 0.
inline void zero( double*__restrict__ v, int n)
{
  int slot = 0;
  while( slot < n )
  {
    v[slot] = 0.0;
    ++slot;
  }
}

// Matrix-vector product: out = A * x.
//
// A   : n-by-m matrix, row-major (element (i, j) at A[i*m + j])
// x   : m entries
// out : n entries; zeroed first, then accumulated into in place
//
// The in-place accumulation on out[i] is the statement the reported
// "Load may need caching" remarks point at.
void dense( double*__restrict__ A, double* __restrict__  x, double* __restrict__ out, int n, int m)
{
    zero( out, n );
    for( int i = 0 ; i < n ; i++ )
      for( int j = 0 ; j < m ; j++)
          out[i] += A[i*m+j] *x[j];
}


// Fills v with 1, 2, ..., n (v[i] = i + 1); does nothing when n <= 0.
inline void rangep1( double*__restrict__ v, int n)
{
  int slot = 0;
  while( slot < n )
  {
    v[slot] = slot + 1;
    ++slot;
  }
}

// Returns x multiplied by itself.
template<typename T>
T sq( T x)
{
  T squared = x * x;
  return squared;
}

// Prints the n entries of x to std::cout, one per line, followed by an empty
// line. Every line break is written with std::endl.
inline void printVector( double*__restrict__ x, int n )
{
    int idx = 0;
    while( idx < n )
    {
      std::cout << x[idx] << std::endl;
      ++idx;
    }
    std::cout << std::endl;
}

// Prints the n1-by-n2 row-major matrix A to std::cout: one row per line, with
// a single space after every entry (including the last one of each row).
inline void printMatrix( double*__restrict__ A, int n1, int n2 )
{
  int row = 0;
  while( row < n1 )
  {
    int col = 0;
    while( col < n2 )
    {
      std::cout << A[row*n2+col] << " ";
      ++col;
    }
    std::cout << std::endl;
    ++row;
  }
}

// The two weight matrices of the network.
// A holds featDim*d doubles and B holds featDim*featDim doubles; both are
// allocated with new[] and filled with 1, 2, 3, ... by rangep1. The class has
// no destructor, so the buffers are never released.
class Fun2Params
{
public:
    double* __restrict__ A;
    double* __restrict__ B;

    Fun2Params(int featDim, int d)
      : A(new double[featDim*d]),
        B(new double[featDim*featDim])
    {
       rangep1(A,featDim*d);
       rangep1(B,featDim*featDim);
    }
};

// Preallocated buffers for the two layer outputs.
// y0 and y1 each hold featDim doubles, allocated with new[] and zeroed. The
// class has no destructor, so the buffers are never released.
class Fun2Memory
{
public:
    double* __restrict__ y0;
    double* __restrict__ y1;

    Fun2Memory(int featDim)
      : y0(new double[featDim]),
        y1(new double[featDim])
    {
      zero(y0,featDim);
      zero(y1,featDim);
    }
};

// Non-differentiated inputs of the network: the input vector p (d doubles,
// filled with 1, 2, ..., d by rangep1) and the two dimensions. The class has no
// destructor, so p is never released.
class Fun2
{
public:
 double*  __restrict__ p;
 int featDim;
 int d;

  // Inside the constructor the names featDim and d refer to the parameters,
  // which shadow the members of the same name.
  Fun2(int featDim, int d)
    : p(new double[d]),
      featDim(featDim),
      d(d)
  {
      rangep1(p,d);
  }
};

// Function handed to __enzyme_autodiff in testFun2: two successive
// matrix-vector products followed by a sum of squares.
//
// x          : weights; x->A is used as featDim-by-d, x->B as featDim-by-featDim
// y          : preallocated layer outputs; y->y0 and y->y1 are overwritten
// out        : receives the sum over i of y0[i]^2 + y1[i]^2
// parameters : supplies d, featDim and the input vector p
//
// NOTE(review): printf is used here but this snippet only includes <iostream>.
void structuredFun2 (Fun2Params* __restrict__  x, Fun2Memory*  __restrict__ y,  double*  __restrict__  out ,Fun2* __restrict__ parameters )
{
    int d = parameters->d;
    int featDim = parameters->featDim;
    printf("featDim %d\n", featDim);

    // Layer 1: y0 = A * p.  Layer 2: y1 = B * y0.
    dense( x->A, parameters->p, y->y0,featDim,d);
    dense( x->B, y->y0, y->y1,featDim,featDim);

    // Sum of squares of both layer outputs.
    double temp = 0.0;
    for( int i= 0; i < featDim ; i++)
    {
      temp += sq(y->y0[i]) ;
      temp += sq(y->y1[i]);
    }
    *out = temp;
}

// Builds the inputs for structuredFun2 (d = 2, featDim = 6), runs
// __enzyme_autodiff on it and prints `out` followed by dfp.A and dfp.B.
//
// Two fixes relative to the original snippet:
//  * Fun2Params is declared as Fun2Params(int featDim, int d). The original
//    passed (d, featDim), which allocated B with d*d = 4 doubles while
//    structuredFun2 and printMatrix read it as featDim*featDim = 36 doubles
//    (out-of-bounds reads).
//  * structuredFun2 takes a Fun2* as its last parameter; the original passed
//    the Fun2 object by value instead of its address.
//
// NOTE(review): dfp.A / dfp.B start out as 1, 2, 3, ... (filled by rangep1 in
// the constructor) rather than zero, unlike dfm which is zeroed. If Enzyme
// accumulates into these buffers, the printed values include that initial
// content — confirm whether they should be zeroed first.
void testFun2()
{
  int d = 2;
  int featDim = 6;

  Fun2Params fp(featDim,d);
  Fun2Params dfp(featDim,d);

  Fun2Memory fm(featDim);
  Fun2Memory dfm(featDim);

  Fun2 fun2(featDim,d);

  double dout = 1.0;
  double out=0.0;
  __enzyme_autodiff(structuredFun2,
                                      enzyme_dup, &fp,&dfp,
                                      enzyme_dup, &fm ,&dfm,
                                      enzyme_dup,&out,&dout,
                                      enzyme_const, &fun2);
  cout << "out " << endl;
  cout << out << endl;
  cout << "dfp.A " << endl;
  printMatrix( dfp.A,featDim,d);
  cout << "dfp.B" << endl;
  printMatrix( dfp.B,featDim,featDim);
  cout << endl;
}

// Entry point of the bugDense reproducer: prints a banner and runs testFun2().
int main(int argc, char** argv )
{
  // Command-line arguments are not used.
  (void)argc;
  (void)argv;

  cout << "bugDense " << endl;
  testFun2();
  return 0;
}

Compilation with :
clang bugDense.cpp -lstdc++ -lm -fno-exceptions -Rpass=enzyme -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o bugDense

Output :

remark: Load may need caching   %arrayidx9.promoted.i = load double, double* %arrayidx9.i, align 8, !tbaa !44, !alias.scope !46, !noalias !38 due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load may need caching   %9 = load double, double* %arrayidx.i, align 8, !dbg !53, !tbaa !44, !alias.scope !54, !noalias !55 due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                    ^
bugDense.cpp:21:31: remark: Load may need caching   %10 = load double, double* %arrayidx6.i, align 8, !dbg !56, !tbaa !44, !alias.scope !57, !noalias !58 due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                              ^
remark: Load may need caching   %arrayidx9.promoted.i45 = load double, double* %arrayidx9.i44, align 8, !tbaa !44, !alias.scope !80, !noalias !75 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load may need caching   %15 = load double, double* %arrayidx.i53, align 8, !dbg !89, !tbaa !44, !alias.scope !90, !noalias !91 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                    ^
bugDense.cpp:21:31: remark: Load may need caching   %16 = load double, double* %arrayidx6.i54, align 8, !dbg !92, !tbaa !44, !alias.scope !93, !noalias !94 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                              ^
bugDense.cpp:21:31: remark: Load must be recomputed   %10 = load double, double* %arrayidx6.i, align 8, !dbg !56, !tbaa !44, !alias.scope !57, !noalias !58 in reverse_invertfor.body4.i due to   store double %add10.i, double* %arrayidx9.i, align 8, !dbg !47, !tbaa !44, !alias.scope !46, !noalias !38 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Caching instruction   %13 = load double, double* %arrayidx6.i, align 8, !dbg !55, !tbaa !44, !alias.scope !56, !noalias !57 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Load must be recomputed   %16 = load double, double* %arrayidx6.i54, align 8, !dbg !92, !tbaa !44, !alias.scope !93, !noalias !94 in reverse_invertfor.body4.i59 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
bugDense.cpp:21:31: remark: Caching instruction   %31 = load double, double* %arrayidx6.i54, align 8, !dbg !93, !tbaa !45, !alias.scope !94, !noalias !95 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugDense.cpp:21:21: remark: Load must be recomputed   %15 = load double, double* %arrayidx.i53, align 8, !dbg !89, !tbaa !44, !alias.scope !90, !noalias !91 in reverse_invertfor.body4.i59 due to   store double %add10.i56, double* %arrayidx9.i44, align 8, !dbg !82, !tbaa !44, !alias.scope !80, !noalias !75 [-Rpass=enzyme]
          out[i] += A[i*m+j] *x[j];
                    ^
bugDense.cpp:21:21: remark: Caching instruction   %35 = load double, double* %arrayidx.i53, align 8, !dbg !91, !tbaa !45, !alias.scope !92, !noalias !93 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugDense 
featDim 6
out 
5.72725e+08
dfp.A 
7.83671e+06 1.56734e+07 
9.69974e+06 1.93995e+07 
1.76338e+07 3.52676e+07 
2.8622e+07 5.7244e+07 
4.65207e+07 9.30413e+07 
7.4193e+07 1.48386e+08 
dfp.B
1.19345e+06 2.62559e+06 4.05773e+06 5.48987e+06 6.92201e+06 8.35415e+06 
2.6271e+06 5.77963e+06 8.93216e+06 1.20847e+07 1.52372e+07 1.83897e+07 
4.0644e+06 8.94168e+06 1.38191e+07 1.86968e+07 2.35748e+07 2.84511e+07 
5.48659e+06 1.20907e+07 1.86488e+07 2.52308e+07 7.83671e+06 9.69973e+06 
1.76338e+07 2.8622e+07 4.65206e+07 7.4193e+07 4.01854e+07 4.84996e+07 
238690 525420 812880 1.09699e+06 1.3857e+06 1.71612e+06

What is the purpose of @preprocess_tester

I tried this simple input sin2.ll:

; ModuleID = '<stdin>'
source_filename = "<stdin>"

; Function under differentiation: returns %x * %x (a single fmul).
; Function Attrs: norecurse nounwind readnone
define double @tester(double %x) #0 {
entry:
  %0 = fmul double %x, %x
  ret double %0
}

; Wrapper that hands @tester and %x to the variadic @__enzyme_autodiff and
; returns the call's result unchanged.
define double @test_derivative(double %x) local_unnamed_addr {
entry:
  %0 = tail call double (double (double)*, ...) @__enzyme_autodiff(double (double)* nonnull @tester, double %x)
  ret double %0
}

; Declaration only: no body for @__enzyme_autodiff is provided in this module.
declare double @__enzyme_autodiff(double (double)*, ...) local_unnamed_addr

attributes #0 = { norecurse nounwind readnone }

and run it through like this:

opt-6.0 < sin2.ll -load ../../build/Enzyme/LLVMEnzyme-6.so  -enzyme -enzyme_preopt=false -O3 -S

with the result:

; ModuleID = '<stdin>'
source_filename = "<stdin>"

; Function Attrs: norecurse nounwind readnone
define double @tester(double %x) local_unnamed_addr #0 {
entry:
  %0 = fmul double %x, %x
  ret double %0
}

; Function Attrs: norecurse nounwind readnone
define double @test_derivative(double %x) local_unnamed_addr #0 {
entry:
  %factor.i = fmul fast double %x, 2.000000e+00
  ret double %factor.i
}

; Function Attrs: norecurse nounwind readnone
define double @preprocess_tester(double %x) local_unnamed_addr #0 {
entry:
  %0 = fmul double %x, %x
  ret double %0
}

attributes #0 = { norecurse nounwind readnone }

I can see that it left @tester, which returns x^2, intact, and that it optimized the @__enzyme_autodiff call away so that @test_derivative just returns 2*x.

But why did it create the @preprocess_tester function?

Fail to build Enzyme : cannot find -lLLVM

Hello,

I tried to follow : https://enzyme.mit.edu/Installation/

I am on ubuntu 18.04.

cmake --version
cmake version 3.19.4

gcc --version
gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0

g++ --version
g++ (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0

I had to do the following to make sure cmake uses the right compiler (otherwise it uses gcc-4.9 and fails when C++17 is needed):

export CC=/usr/bin/gcc
export CXX=/usr/bin/g++

LLVM was compiled successfully with :

cd ~/Enzyme
mkdir build && cd build
cmake -G Ninja ../llvm -DLLVM_TARGETS_TO_BUILD="host" -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_PLUGINS=ON -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON
ninja

I even added a sudo ninja install for good measure

~/Enzyme/enzyme/build$ cmake -G Ninja .. -DLLVM_DIR=../../build/lib/cmake/llvm

LLVM_SHLIBEXT=.so
found llvm dir /home/username/Enzyme/build/lib/cmake/llvm
found llvm lit /home/username/Enzyme/enzyme/build
CMAKE_PREFIX_PATH /home/username/Enzyme/build/lib/cmake/llvm
-- Linker detection: GNU ld
LLVM_INSTALL_PREFIX: 
LLVM_INCLUDE_DIRS: /home/username/Enzyme/llvm/include;/home/username/Enzyme/build/include
found llvm definitions -D_GNU_SOURCE -D_DEBUG -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS
found llvm version 7
first llvm include directory/home/username/Enzyme/llvm/include
found bench flags: -I/home/username/Enzyme/enzyme/build/benchmarks/adept2/src/adept2/include -I/home/username/Enzyme/enzyme/build/benchmarks/tapenade/src/tapenade
-- Configuring done
-- Generating done
-- Build files have been written to: /home/username/Enzyme/enzyme/build

~/Enzyme/enzyme/build$ ninja

[19/43] Linking CXX shared library Enzyme/libEnzyme-7.so
FAILED: Enzyme/libEnzyme-7.so 
: && /usr/bin/g++ -fPIC -Wall -fPIC -fno-rtti   -shared -Wl,-soname,libEnzyme-7.so -o Enzyme/libEnzyme-7.so Enzyme/CMakeFiles/Enzyme-7.dir/ActivityAnalysis.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/CApi.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/CacheUtility.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/Enzyme.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/EnzymeLogic.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/FunctionUtils.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/GradientUtils.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/MustExitScalarEvolution.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/Utils.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/SCEV/ScalarEvolutionExpander.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/TypeAnalysis/TypeTree.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/TypeAnalysis/TypeAnalysis.cpp.o Enzyme/CMakeFiles/Enzyme-7.dir/TypeAnalysis/TypeAnalysisPrinter.cpp.o  -lLLVM && :
/usr/bin/ld: cannot find -lLLVM
collect2: error: ld returned 1 exit status
[28/43] Building CXX object Enzyme/CMa.../ClangEnzyme-7.dir/FunctionUtils.cpp.o
ninja: build stopped: subcommand failed.

It seems it can't find the LLVM libraries: there is no libLLVM.a, but there are plenty of LLVM libraries in both
~/Enzyme/build/lib and /usr/local/lib/

~/Enzyme/build/lib$ ls libLLVM*
ls /usr/local/lib/libLLVM*

libLLVMAggressiveInstCombine.a  libLLVMMC.a
libLLVMAnalysis.a               libLLVMMCDisassembler.a
libLLVMAsmParser.a              libLLVMMCJIT.a
libLLVMAsmPrinter.a             libLLVMMCParser.a
libLLVMBinaryFormat.a           libLLVMMIRParser.a
libLLVMBitReader.a              libLLVMObjCARCOpts.a
libLLVMBitWriter.a              libLLVMObject.a
libLLVMCFIVerify.a              libLLVMObjectYAML.a
libLLVMCodeGen.a                libLLVMOption.a
libLLVMCore.a                   libLLVMOrcJIT.a
libLLVMCoroutines.a             libLLVMPasses.a
libLLVMCoverage.a               libLLVMProfileData.a
libLLVMDebugInfoCodeView.a      libLLVMRuntimeDyld.a
libLLVMDebugInfoDWARF.a         libLLVMScalarOpts.a
libLLVMDebugInfoMSF.a           libLLVMSelectionDAG.a
libLLVMDebugInfoPDB.a           libLLVMSupport.a
libLLVMDemangle.a               libLLVMSymbolize.a
libLLVMDlltoolDriver.a          libLLVMTableGen.a
libLLVMExecutionEngine.a        libLLVMTarget.a
libLLVMExegesis.a               libLLVMTestingSupport.a
libLLVMExegesisX86.a            libLLVMTransformUtils.a
libLLVMFuzzMutate.a             libLLVMVectorize.a
libLLVMGlobalISel.a             libLLVMWindowsManifest.a
libLLVMInstCombine.a            libLLVMX86AsmParser.a
libLLVMInstrumentation.a        libLLVMX86AsmPrinter.a
libLLVMInterpreter.a            libLLVMX86CodeGen.a
libLLVMipo.a                    libLLVMX86Desc.a
libLLVMIRReader.a               libLLVMX86Disassembler.a
libLLVMLibDriver.a              libLLVMX86Info.a
libLLVMLineEditor.a             libLLVMX86Utils.a
libLLVMLinker.a                 libLLVMXRay.a
libLLVMLTO.a

Can you please advise?
Thanks

Compilation Fails for Posit numbers

Hello,

I tried to use the Posit representation for floating-point numbers.
I grabbed a header-only library that provides it as a drop-in replacement for double:

git clone https://github.com/stillwater-sc/universal

testPosit.cpp

#include <iostream>
using namespace std;

// https://github.com/stillwater-sc/universal
#pragma clang diagnostic ignored "-Wc++17-extensions"
#include <universal/number/posit/posit.hpp>

extern int enzyme_dup;
extern int enzyme_dupnoneed;
extern int enzyme_out;
extern int enzyme_const;

void __enzyme_autodiff(...);

// Returns x multiplied by itself.
template<typename T>
T sq( T x)
{
  T squared = x * x;
  return squared;
}

// Writes sq(x[0] - 1.0) to *out; only the first element of x is read.
template<typename T>
void fun1( T* x, T* out )
{
    T& result = *out;
    result = sq(*x - 1.0);
}


// Placeholder kernel: returns the product of its two arguments.
template<typename Real>
Real MyKernel(const Real& a, const Real& b) {
    Real product = a * b;  // replace this with your kernel computation
    return product;
}

constexpr double pi = 3.14159265358979323846;

using Real = sw::universal::posit<32,2>;


// Entry point of the testPosit reproducer. Three steps, all with Real being
// the posit<32,2> alias: a plain MyKernel call, a plain fun1 call, and an
// __enzyme_autodiff call on fun1<Real>.
int main(int argc, char** argv )
{
  cout << "testMemoryAllocator "<< endl;

  // Sanity check of posit arithmetic without Enzyme.
  Real a = sqrt(2);
  Real b = pi;
  std::cout << "Result: " << MyKernel(a, b) << std::endl;

  Real x = 3.0;
  Real out = 0.0;
  // Plain evaluation of fun1 at x.
  {
  out = 0.0;
  fun1( &x, &out );
  cout << "out " << endl;
  cout << out << endl;
  }

  // Differentiated evaluation. This inner `out` shadows the outer one.
  // NOTE(review): each enzyme_dup is followed by a pointer pair; presumably
  // the second pointer (dx, gout) is the shadow/gradient buffer - confirm
  // against Enzyme's calling convention.
  {
  Real dx = 0.0;
  Real gout = 1.0;
  Real out = 0.0;
  __enzyme_autodiff( fun1<Real>, enzyme_dup, &x, &dx,
                                         enzyme_dup,&out,&gout );

  cout << "out " << endl;
  cout << out << endl;
  cout << "dx " << endl;
  cout << dx << endl;
}

}

clang testPosit.cpp -I/home/username/universal/include -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o testPosit

Here is the compilation log :
errorPosit.log

Thanks

Integer type support

Are integer types supported? I only get the expected answer for floats or doubles.

Correct answer with type 'float':

#include <stdio.h>
#include <stdint.h>

#define DTYPE float

extern DTYPE __enzyme_autodiff(void*, DTYPE);
/* Returns x * x, evaluated in DTYPE arithmetic. */
DTYPE square(DTYPE x) {
    const DTYPE squared = x * x;
    return squared;
}
/* Hands square and x to __enzyme_autodiff and returns whatever it yields. */
DTYPE dsquare(DTYPE x) {
    const DTYPE derivative = __enzyme_autodiff(square, x);
    return derivative;
}
/* Prints square(i) and dsquare(i) for i = 1, 2, 3, 4, each cast to float. */
int main() {
    DTYPE i = 1;
    while (i < 5) {
        printf("square(%f)=%f, dsquare(%f)=%f\n", (float) i, (float) square(i), (float) i, (float) dsquare(i));
        i++;
    }
}

square(1.000000)=1.000000, dsquare(1.000000)=2.000000
square(2.000000)=4.000000, dsquare(2.000000)=4.000000
square(3.000000)=9.000000, dsquare(3.000000)=6.000000
square(4.000000)=16.000000, dsquare(4.000000)=8.000000

Incorrect answer with type 'int':

#include <stdio.h>
#include <stdint.h>

#define DTYPE int

extern DTYPE __enzyme_autodiff(void*, DTYPE);
/* Returns x * x, evaluated in DTYPE arithmetic (int in this variant). */
DTYPE square(DTYPE x) {
    const DTYPE squared = x * x;
    return squared;
}
/* Hands square and x to __enzyme_autodiff and returns whatever it yields. */
DTYPE dsquare(DTYPE x) {
    const DTYPE derivative = __enzyme_autodiff(square, x);
    return derivative;
}
/* Prints square(i) and dsquare(i) for i = 1, 2, 3, 4, each cast to float. */
int main() {
    DTYPE i = 1;
    while (i < 5) {
        printf("square(%f)=%f, dsquare(%f)=%f\n", (float) i, (float) square(i), (float) i, (float) dsquare(i));
        i++;
    }
}

square(1.000000)=1.000000, dsquare(1.000000)=0.000000
square(2.000000)=4.000000, dsquare(2.000000)=0.000000
square(3.000000)=9.000000, dsquare(3.000000)=0.000000
square(4.000000)=16.000000, dsquare(4.000000)=0.000000

Excessive memory allocation

Hello,

I was expecting the following to run without any allocations or tape usage, but it crashes.

bugStackAlloc.cpp

#include <iostream>
using namespace std;

extern int enzyme_dup;
extern int enzyme_dupnoneed;
extern int enzyme_out;
extern int enzyme_const;

void __enzyme_autodiff(...);

template<typename T>
void comp( double* __restrict__  x,  double* __restrict__ out);

const int nbiter = 100000000;
const int d = 30;

// Tag type selecting the single-pass kernel.
class Fun1{};

// Fills a local buffer of d doubles with x[0]*i, then adds to *out the squared
// difference of neighbouring entries for i in [1, d-2].
template<> void comp<Fun1>( double* __restrict__ x, double* __restrict__ out)
{
    const double scale = x[0];
    double buf[d];
    for( int k = 0 ; k < d ; ++k)
    {
        buf[k] = scale * k;
    }
    for( int k = 1 ; k < d-1 ; ++k)
    {
       const double diff = buf[k]-buf[k+1];
       *out += diff * diff;
    }
}

// Tag type selecting the repeated-call kernel.
class Fun2{};

// Calls comp<Fun1>(x, out) nbiter times in a row.
template<> void comp<Fun2>(  double*  __restrict__ x,  double*__restrict__  out)
{
  int iter = 0;
  while( iter < nbiter )
  {
     comp<Fun1>(x,out);
     ++iter;
  }
}

// Tag type selecting the kernel with the buffer declared inside the loop.
class Fun3{};

// Repeats nbiter times: fill a loop-local buffer of d doubles with x[0]*i,
// sum the squared differences of neighbouring entries for i in [1, d-2] into
// a local accumulator, and add that accumulator to *out.
template<> void comp<Fun3>( double*__restrict__  x, double* __restrict__ out)
{
    const double scale = x[0];
    for( int iter = 0; iter < nbiter ; ++iter)
    {
      double buf[d];
      for( int k = 0 ; k < d ; ++k)
      {
          buf[k] = scale * k;
      }

      double partial = 0.0;
      for( int k = 1 ; k < d-1 ; ++k)
      {
         const double diff = buf[k]-buf[k+1];
         partial += diff * diff;
      }
      *out += partial;
    }
}

// Runs comp<T> twice at x = 3.0: once directly, once through
// __enzyme_autodiff, printing the output value each time and finally dx.
template< typename T>
void demo( )
{
  double x = 3.0;
  double out = 0.0;

  // Direct evaluation.
  out = 0.0;
  comp<T>( &x, &out );
  cout << "out without enzyme" << endl;
  cout << out << endl;

  // Evaluation through Enzyme, using a separate output variable.
  double dx = 0.0;
  double gout = 1.0;
  double outEnzyme = 0.0;
  __enzyme_autodiff( comp<T>, enzyme_dup, &x, &dx,
                              enzyme_dup, &outEnzyme, &gout);
  cout << "out with enzyme" << endl;
  cout << outEnzyme << endl;
  cout << "dx with enzyme" << endl;
  cout << dx << endl;
}

// Entry point of the bugStackAlloc reproducer: runs demo<> for Fun1, Fun2 and
// Fun3 in that order, printing a label before each.
int main(int argc, char** argv )
{
  // Command-line arguments are not used.
  (void)argc;
  (void)argv;

  cout << "testMemoryAllocator "<< endl;

  cout << "demofun1() " << endl;
  demo<Fun1>();

  cout << "demofun2() " << endl;
  demo<Fun2>(); // Fails

  cout << "demofun3() " << endl;
  demo<Fun3>(); // Fails
}

clang bugStackAlloc.cpp -lstdc++ -lm -fno-exceptions -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o bugStackAlloc

testMemoryAllocator 
demofun1() 
out without enzyme
252
out with enzyme
252
dx with enzyme
168
demofun2() 
out without enzyme
2.52e+10
Killed

It compiles fine and runs fine until it gets killed due to excessive memory usage.

Compilation crash : Complex numbers :"attempting to differentiate function without definition"

I tried to use complex numbers with functions like std::abs and std::arg (it works with .real() and .imag()).
This makes the compiler crash with the error:
clang-11: ../Enzyme/EnzymeLogic.cpp:1181: const AugmentedReturn& CreateAugmentedPrimal(llvm::Function*, DIFFE_TYPE, const std::vector<DIFFE_TYPE>&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, const FnTypeInfo&, std::map<llvm::Argument*, bool>, bool, bool, bool, bool): Assertion `0 && "attempting to differentiate function without definition"' failed.

I don't know exactly where the standard complex library is defined, or whether or not it is header-only, but it seems Enzyme can't get its source code.

complex.cpp

#include <stdio.h>
#include <iostream>
#include <stdlib.h>

#include <math.h>
#include <complex>      // std::complex, std::abs, std::arg

int enzyme_dup;
int enzyme_out;
int enzyme_const;

void __enzyme_autodiff(...);

using namespace std;

// Returns std::abs(c) squared for a complex argument.
double h( const complex<double>& c )
{
  // Alternatives kept from the report:
  //double theta = c.real()+3*c.imag();
  //double theta = arg(c);
  const double magnitude = std::abs(c);
  return magnitude * magnitude;
}

// Real-valued counterpart of h: returns std::abs(c) squared.
double h2( const double& c )
{
  const double magnitude = std::abs(c);
  return magnitude * magnitude;
}

// Entry point of the complex-number reproducer: prints the polar form and
// h() of 3+4i, then calls __enzyme_autodiff on h2 (real argument) and on h
// (complex argument) and prints dc.
int main ()
{
  std::complex<double> mycomplex (3.0,4.0);

  std::cout << "The polar form of " << mycomplex;
  std::cout << " is " << abs(mycomplex) << "*e^i*" << arg(mycomplex) << "rad\n";

  std::cout << "energy of particle is : " << h(mycomplex) << endl;

  // Second pointer handed to Enzyme alongside mycomplex below.
  std::complex<double> dc(0.0,0.0);

  double x = -3.0;
  double dx = 0.0;
  // Works with a real number: this call compiles.
  // NOTE(review): dx is never printed, so this call's result is not shown.
  __enzyme_autodiff(&h2, enzyme_dup, &x,&dx);

  // Compilation fails for the complex number with the assertion
  //"attempting to differentiate function without definition"' failed.
  __enzyme_autodiff(&h, enzyme_dup, &mycomplex,&dc);

  std::cout << "grad energy of particle is : " << dc << endl;

  return 0;
}

Compilation with :
clang complex.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o complex -fno-exceptions

Full Error message :

mod: ; ModuleID = 'complex.cpp'
source_filename = "complex.cpp"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"class.std::ios_base::Init" = type { i8 }
%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_ios" }
%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* }
%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, %"struct.std::ios_base::_Words"*, %"class.std::locale" }
%"struct.std::ios_base::_Callback_list" = type { %"struct.std::ios_base::_Callback_list"*, void (i32, %"class.std::ios_base"*, i32)*, i32, i32 }
%"struct.std::ios_base::_Words" = type { i8*, i64 }
%"class.std::locale" = type { %"class.std::locale::_Impl"* }
%"class.std::locale::_Impl" = type { i32, %"class.std::locale::facet"**, i64, %"class.std::locale::facet"**, i8** }
%"class.std::locale::facet" = type <{ i32 (...)**, i32, [4 x i8] }>
%"class.std::basic_streambuf" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"class.std::locale" }
%"class.std::ctype" = type <{ %"class.std::locale::facet.base", [4 x i8], %struct.__locale_struct*, i8, [7 x i8], i32*, i32*, i16*, i8, [256 x i8], [256 x i8], i8, [6 x i8] }>
%"class.std::locale::facet.base" = type <{ i32 (...)**, i32 }>
%struct.__locale_struct = type { [13 x %struct.__locale_data*], i16*, i32*, i32*, [13 x i8*] }
%struct.__locale_data = type opaque
%"class.std::num_put" = type { %"class.std::locale::facet.base", [4 x i8] }
%"class.std::num_get" = type { %"class.std::locale::facet.base", [4 x i8] }
%"struct.std::complex" = type { { double, double } }

@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1
@__dso_handle = external hidden global i8
@enzyme_dup = dso_local local_unnamed_addr global i32 0, align 4
@enzyme_out = dso_local local_unnamed_addr global i32 0, align 4
@enzyme_const = dso_local local_unnamed_addr global i32 0, align 4
@_ZSt4cout = external dso_local global %"class.std::basic_ostream", align 8
@.str = private unnamed_addr constant [19 x i8] c"The polar form of \00", align 1
@.str.1 = private unnamed_addr constant [5 x i8] c" is \00", align 1
@.str.2 = private unnamed_addr constant [6 x i8] c"*e^i*\00", align 1
@.str.3 = private unnamed_addr constant [5 x i8] c"rad\0A\00", align 1
@.str.4 = private unnamed_addr constant [25 x i8] c"energy of particle is : \00", align 1
@.str.5 = private unnamed_addr constant [30 x i8] c"grad energy of particle is : \00", align 1
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_complex.cpp, i8* null }]

declare dso_local void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) unnamed_addr #0

; Function Attrs: nounwind
declare dso_local void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) unnamed_addr #1

; Function Attrs: nofree nounwind
declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) local_unnamed_addr #2

; Function Attrs: nofree nounwind uwtable
define dso_local double @_Z1hRKSt7complexIdE(%"struct.std::complex"* nocapture nonnull readonly align 8 dereferenceable(16) %c) #3 {
entry:
  %_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %c, i64 0, i32 0, i32 0
  %_M_value.real.i.i = load double, double* %_M_value.realp.i.i, align 8
  %_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %c, i64 0, i32 0, i32 1
  %_M_value.imag.i.i = load double, double* %_M_value.imagp.i.i, align 8
  %call.i.i = tail call double @cabs(double %_M_value.real.i.i, double %_M_value.imag.i.i) #11
  %mul = fmul double %call.i.i, %call.i.i
  ret double %mul
}

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #4

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #4

; Function Attrs: nounwind readonly uwtable
define dso_local double @_Z2h2RKd(double* nocapture nonnull readonly align 8 dereferenceable(8) %c) #5 {
entry:
  %0 = load double, double* %c, align 8, !tbaa !2
  %mul = fmul double %0, %0
  ret double %mul
}

; Function Attrs: norecurse nounwind uwtable
define dso_local i32 @main() local_unnamed_addr #6 {
entry:
  %mycomplex = alloca %"struct.std::complex", align 8
  %dc = alloca %"struct.std::complex", align 8
  %x = alloca double, align 8
  %dx = alloca double, align 8
  %0 = bitcast %"struct.std::complex"* %mycomplex to i8*
  call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %0) #11
  %_M_value.realp.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %mycomplex, i64 0, i32 0, i32 0
  %_M_value.imagp.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %mycomplex, i64 0, i32 0, i32 1
  store double 3.000000e+00, double* %_M_value.realp.i, align 8
  store double 4.000000e+00, double* %_M_value.imagp.i, align 8
  %call1.i17 = tail call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) @_ZSt4cout, i8* nonnull getelementptr inbounds ([19 x i8], [19 x i8]* @.str, i64 0, i64 0), i64 18) #11
  %call1 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZStlsIdcSt11char_traitsIcEERSt13basic_ostreamIT0_T1_ES6_RKSt7complexIT_E(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) @_ZSt4cout, %"struct.std::complex"* nonnull align 8 dereferenceable(16) %mycomplex) #11
  %call1.i19 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) @_ZSt4cout, i8* nonnull getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i64 4) #11
  %_M_value.real.i.i = load double, double* %_M_value.realp.i, align 8
  %_M_value.imag.i.i = load double, double* %_M_value.imagp.i, align 8
  %call.i.i20 = call double @cabs(double %_M_value.real.i.i, double %_M_value.imag.i.i) #11
  %call.i21 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull @_ZSt4cout, double %call.i.i20) #11
  %call1.i23 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) %call.i21, i8* nonnull getelementptr inbounds ([6 x i8], [6 x i8]* @.str.2, i64 0, i64 0), i64 5) #11
  %_M_value.real.i.i25 = load double, double* %_M_value.realp.i, align 8
  %_M_value.imag.i.i27 = load double, double* %_M_value.imagp.i, align 8
  %call.i.i28 = call double @carg(double %_M_value.real.i.i25, double %_M_value.imag.i.i27) #11
  %call.i29 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %call.i21, double %call.i.i28) #11
  %call1.i31 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) %call.i29, i8* nonnull getelementptr inbounds ([5 x i8], [5 x i8]* @.str.3, i64 0, i64 0), i64 4) #11
  %call1.i33 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) @_ZSt4cout, i8* nonnull getelementptr inbounds ([25 x i8], [25 x i8]* @.str.4, i64 0, i64 0), i64 24) #11
  %_M_value.real.i.i.i = load double, double* %_M_value.realp.i, align 8
  %_M_value.imag.i.i.i = load double, double* %_M_value.imagp.i, align 8
  %call.i.i.i = call double @cabs(double %_M_value.real.i.i.i, double %_M_value.imag.i.i.i) #11
  %mul.i = fmul double %call.i.i.i, %call.i.i.i
  %call.i34 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull @_ZSt4cout, double %mul.i) #11
  %1 = bitcast %"class.std::basic_ostream"* %call.i34 to i8**
  %vtable.i36 = load i8*, i8** %1, align 8, !tbaa !6
  %vbase.offset.ptr.i37 = getelementptr i8, i8* %vtable.i36, i64 -24
  %2 = bitcast i8* %vbase.offset.ptr.i37 to i64*
  %vbase.offset.i38 = load i64, i64* %2, align 8
  %3 = bitcast %"class.std::basic_ostream"* %call.i34 to i8*
  %add.ptr.i39 = getelementptr inbounds i8, i8* %3, i64 %vbase.offset.i38
  %_M_ctype.i50 = getelementptr inbounds i8, i8* %add.ptr.i39, i64 240
  %4 = bitcast i8* %_M_ctype.i50 to %"class.std::ctype"**
  %5 = load %"class.std::ctype"*, %"class.std::ctype"** %4, align 8, !tbaa !8
  %tobool.not.i65 = icmp eq %"class.std::ctype"* %5, null
  br i1 %tobool.not.i65, label %if.then.i66, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit68

if.then.i66:                                      ; preds = %entry
  call void @_ZSt16__throw_bad_castv() #12
  unreachable

_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit68:  ; preds = %entry
  %_M_widen_ok.i52 = getelementptr inbounds %"class.std::ctype", %"class.std::ctype"* %5, i64 0, i32 8
  %6 = load i8, i8* %_M_widen_ok.i52, align 8, !tbaa !12
  %tobool.not.i53 = icmp eq i8 %6, 0
  br i1 %tobool.not.i53, label %if.end.i59, label %if.then.i55

if.then.i55:                                      ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit68
  %arrayidx.i54 = getelementptr inbounds %"class.std::ctype", %"class.std::ctype"* %5, i64 0, i32 9, i64 10
  %7 = load i8, i8* %arrayidx.i54, align 1, !tbaa !14
  br label %_ZNKSt5ctypeIcE5widenEc.exit61

if.end.i59:                                       ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit68
  call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* nonnull %5) #11
  %8 = bitcast %"class.std::ctype"* %5 to i8 (%"class.std::ctype"*, i8)***
  %vtable.i56 = load i8 (%"class.std::ctype"*, i8)**, i8 (%"class.std::ctype"*, i8)*** %8, align 8, !tbaa !6
  %vfn.i57 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)*, i8 (%"class.std::ctype"*, i8)** %vtable.i56, i64 6
  %9 = load i8 (%"class.std::ctype"*, i8)*, i8 (%"class.std::ctype"*, i8)** %vfn.i57, align 8
  %call.i58 = call signext i8 %9(%"class.std::ctype"* nonnull %5, i8 signext 10) #11
  br label %_ZNKSt5ctypeIcE5widenEc.exit61

_ZNKSt5ctypeIcE5widenEc.exit61:                   ; preds = %if.then.i55, %if.end.i59
  %retval.0.i60 = phi i8 [ %7, %if.then.i55 ], [ %call.i58, %if.end.i59 ]
  %call1.i41 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* nonnull %call.i34, i8 signext %retval.0.i60) #11
  %call.i42 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* nonnull %call1.i41) #11
  %10 = bitcast %"struct.std::complex"* %dc to i8*
  call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %10) #11
  %11 = bitcast double* %x to i8*
  call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(16) %10, i8 0, i64 16, i1 false)
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %11) #11
  store double -3.000000e+00, double* %x, align 8, !tbaa !2
  %12 = bitcast double* %dx to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %12) #11
  store double 0.000000e+00, double* %dx, align 8, !tbaa !2
  %13 = load i32, i32* @enzyme_dup, align 4, !tbaa !15
  %14 = load double, double* %x, align 8, !tbaa !2
  %15 = fadd fast double %14, %14
  %16 = load double, double* %dx, align 8
  %17 = fadd fast double %16, %15
  store double %17, double* %dx, align 8
  %18 = load i32, i32* @enzyme_dup, align 4, !tbaa !15
  call void (...) @_Z17__enzyme_autodiffz(double (%"struct.std::complex"*)* nonnull @_Z1hRKSt7complexIdE, i32 %18, %"struct.std::complex"* nonnull %mycomplex, %"struct.std::complex"* nonnull %dc) #11
  %call1.i46 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) @_ZSt4cout, i8* nonnull getelementptr inbounds ([30 x i8], [30 x i8]* @.str.5, i64 0, i64 0), i64 29) #11
  %call14 = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZStlsIdcSt11char_traitsIcEERSt13basic_ostreamIT0_T1_ES6_RKSt7complexIT_E(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8) @_ZSt4cout, %"struct.std::complex"* nonnull align 8 dereferenceable(16) %dc) #11
  %19 = bitcast %"class.std::basic_ostream"* %call14 to i8**
  %vtable.i = load i8*, i8** %19, align 8, !tbaa !6
  %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24
  %20 = bitcast i8* %vbase.offset.ptr.i to i64*
  %vbase.offset.i = load i64, i64* %20, align 8
  %21 = bitcast %"class.std::basic_ostream"* %call14 to i8*
  %add.ptr.i = getelementptr inbounds i8, i8* %21, i64 %vbase.offset.i
  %_M_ctype.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 240
  %22 = bitcast i8* %_M_ctype.i to %"class.std::ctype"**
  %23 = load %"class.std::ctype"*, %"class.std::ctype"** %22, align 8, !tbaa !8
  %tobool.not.i62 = icmp eq %"class.std::ctype"* %23, null
  br i1 %tobool.not.i62, label %if.then.i63, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit

if.then.i63:                                      ; preds = %_ZNKSt5ctypeIcE5widenEc.exit61
  call void @_ZSt16__throw_bad_castv() #12
  unreachable

_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit:    ; preds = %_ZNKSt5ctypeIcE5widenEc.exit61
  %_M_widen_ok.i = getelementptr inbounds %"class.std::ctype", %"class.std::ctype"* %23, i64 0, i32 8
  %24 = load i8, i8* %_M_widen_ok.i, align 8, !tbaa !12
  %tobool.not.i = icmp eq i8 %24, 0
  br i1 %tobool.not.i, label %if.end.i, label %if.then.i

if.then.i:                                        ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
  %arrayidx.i = getelementptr inbounds %"class.std::ctype", %"class.std::ctype"* %23, i64 0, i32 9, i64 10
  %25 = load i8, i8* %arrayidx.i, align 1, !tbaa !14
  br label %_ZNKSt5ctypeIcE5widenEc.exit

if.end.i:                                         ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
  call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* nonnull %23) #11
  %26 = bitcast %"class.std::ctype"* %23 to i8 (%"class.std::ctype"*, i8)***
  %vtable.i48 = load i8 (%"class.std::ctype"*, i8)**, i8 (%"class.std::ctype"*, i8)*** %26, align 8, !tbaa !6
  %vfn.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)*, i8 (%"class.std::ctype"*, i8)** %vtable.i48, i64 6
  %27 = load i8 (%"class.std::ctype"*, i8)*, i8 (%"class.std::ctype"*, i8)** %vfn.i, align 8
  %call.i49 = call signext i8 %27(%"class.std::ctype"* nonnull %23, i8 signext 10) #11
  br label %_ZNKSt5ctypeIcE5widenEc.exit

_ZNKSt5ctypeIcE5widenEc.exit:                     ; preds = %if.then.i, %if.end.i
  %retval.0.i = phi i8 [ %25, %if.then.i ], [ %call.i49, %if.end.i ]
  %call1.i = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* nonnull %call14, i8 signext %retval.0.i) #11
  %call.i = call nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* nonnull %call1.i) #11
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %12) #11
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %11) #11
  call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %10) #11
  call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %0) #11
  ret i32 0
}

declare dso_local nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZStlsIdcSt11char_traitsIcEERSt13basic_ostreamIT0_T1_ES6_RKSt7complexIT_E(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8), %"struct.std::complex"* nonnull align 8 dereferenceable(16)) local_unnamed_addr #0

declare dso_local void @_Z17__enzyme_autodiffz(...) local_unnamed_addr #0

; Function Attrs: nofree nounwind
declare dso_local double @cabs(double, double) local_unnamed_addr #7

declare dso_local nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* nonnull align 8 dereferenceable(8), i8*, i64) local_unnamed_addr #0

declare dso_local nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) local_unnamed_addr #0

; Function Attrs: nounwind
declare dso_local double @carg(double, double) local_unnamed_addr #1

declare dso_local nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"*, i8 signext) local_unnamed_addr #0

declare dso_local nonnull align 8 dereferenceable(8) %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"*) local_unnamed_addr #0

; Function Attrs: noreturn
declare dso_local void @_ZSt16__throw_bad_castv() local_unnamed_addr #8

declare dso_local void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"*) local_unnamed_addr #0

; Function Attrs: nounwind uwtable
define internal void @_GLOBAL__sub_I_complex.cpp() #9 section ".text.startup" {
entry:
  tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* nonnull @_ZStL8__ioinit) #11
  %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init", %"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* nonnull @__dso_handle) #11
  ret void
}

; Function Attrs: argmemonly nounwind willreturn writeonly
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #10

; Function Attrs: nounwind readonly uwtable
define dso_local double @preprocess__Z2h2RKd(double* nocapture nonnull readonly align 8 dereferenceable(8) %c) #5 {
entry:
  %0 = load double, double* %c, align 8, !tbaa !2
  %mul = fmul double %0, %0
  ret double %mul
}

; Function Attrs: nounwind uwtable
define internal void @diffe_Z2h2RKd(double* nocapture nonnull readonly align 8 dereferenceable(8) %c, double* nocapture %"c'", double %differeturn) #9 {
entry:
  %0 = load double, double* %c, align 8, !tbaa !2
  %m0diffe = fmul fast double %differeturn, %0
  %1 = fadd fast double %m0diffe, %m0diffe
  %2 = load double, double* %"c'", align 8
  %3 = fadd fast double %2, %1
  store double %3, double* %"c'", align 8
  ret void
}

; Function Attrs: nofree nounwind uwtable
define dso_local double @preprocess__Z1hRKSt7complexIdE(%"struct.std::complex"* nocapture nonnull readonly align 8 dereferenceable(16) %c) #3 {
entry:
  %_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %c, i64 0, i32 0, i32 0
  %_M_value.real.i.i = load double, double* %_M_value.realp.i.i, align 8
  %_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %c, i64 0, i32 0, i32 1
  %_M_value.imag.i.i = load double, double* %_M_value.imagp.i.i, align 8
  %call.i.i = tail call double @cabs(double %_M_value.real.i.i, double %_M_value.imag.i.i) #11
  %mul = fmul double %call.i.i, %call.i.i
  ret double %mul
}

; Function Attrs: nofree nounwind uwtable
define internal void @diffe_Z1hRKSt7complexIdE(%"struct.std::complex"* nocapture nonnull readonly align 8 dereferenceable(16) %c, %"struct.std::complex"* nocapture %"c'", double %differeturn) #3 {
entry:
  %_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %c, i64 0, i32 0, i32 0
  %_M_value.real.i.i = load double, double* %_M_value.realp.i.i, align 8
  %_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %c, i64 0, i32 0, i32 1
  %_M_value.imag.i.i = load double, double* %_M_value.imagp.i.i, align 8
  %call.i.i = tail call double @cabs(double %_M_value.real.i.i, double %_M_value.imag.i.i) #11
  %mul_replacementA = phi double 
  br label %invertentry

allocsForInversion:                               ; No predecessors!
  %"mul'de" = alloca double, align 8
  store double 0.000000e+00, double* %"mul'de", align 8
  %"call.i.i'de" = alloca double, align 8
  store double 0.000000e+00, double* %"call.i.i'de", align 8

invertentry:                                      ; preds = %entry
  store double %differeturn, double* %"mul'de", align 8
  %0 = load double, double* %"mul'de", align 8
  %m0diffecall.i.i = fmul fast double %0, %call.i.i
  %m1diffecall.i.i = fmul fast double %0, %call.i.i
  store double 0.000000e+00, double* %"mul'de", align 8
  %1 = load double, double* %"call.i.i'de", align 8
  %2 = fadd fast double %1, %m0diffecall.i.i
  store double %2, double* %"call.i.i'de", align 8
  %3 = load double, double* %"call.i.i'de", align 8
  %4 = fadd fast double %3, %m1diffecall.i.i
  store double %4, double* %"call.i.i'de", align 8
}

attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nofree nounwind }
attributes #3 = { nofree nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { argmemonly nounwind willreturn }
attributes #5 = { nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #6 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #7 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #8 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #9 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #10 = { argmemonly nounwind willreturn writeonly }
attributes #11 = { nounwind }
attributes #12 = { noreturn nounwind }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 11.1.0"}
!2 = !{!3, !3, i64 0}
!3 = !{!"double", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C++ TBAA"}
!6 = !{!7, !7, i64 0}
!7 = !{!"vtable pointer", !5, i64 0}
!8 = !{!9, !10, i64 240}
!9 = !{!"_ZTSSt9basic_iosIcSt11char_traitsIcEE", !10, i64 216, !4, i64 224, !11, i64 225, !10, i64 232, !10, i64 240, !10, i64 248, !10, i64 256}
!10 = !{!"any pointer", !4, i64 0}
!11 = !{!"bool", !4, i64 0}
!12 = !{!13, !4, i64 56}
!13 = !{!"_ZTSSt5ctypeIcE", !10, i64 16, !11, i64 24, !10, i64 32, !10, i64 40, !10, i64 48, !4, i64 56, !4, i64 57, !4, i64 313, !4, i64 569}
!14 = !{!4, !4, i64 0}
!15 = !{!16, !16, i64 0}
!16 = !{!"int", !4, i64 0}

; Function Attrs: nofree nounwind
declare dso_local double @cabs(double, double) local_unnamed_addr #7

clang-11: ../Enzyme/EnzymeLogic.cpp:1181: const AugmentedReturn& CreateAugmentedPrimal(llvm::Function*, DIFFE_TYPE, const std::vector<DIFFE_TYPE>&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, const FnTypeInfo&, std::map<llvm::Argument*, bool>, bool, bool, bool, bool): Assertion `0 && "attempting to differentiate function without definition"' failed.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0.	Program arguments: /home/username/usrlocal/bin/clang-11 -cc1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -main-file-name complex.cpp -mrelocation-model static -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -fno-split-dwarf-inlining -debugger-tuning=gdb -resource-dir /home/username/usrlocal/lib/clang/11.1.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /home/username/usrlocal/lib/clang/11.1.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -fdeprecated-macro -fdebug-compilation-dir /home/username/testenzyme -ferror-limit 19 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -load /usr/local/lib/ClangEnzyme-11.so -faddrsig -o /tmp/complex-7d9a21.o -x c++ complex.cpp 
1.	<eof> parser at end of file
2.	Per-module optimization passes
3.	Running pass 'Enzyme Pass' on module 'complex.cpp'.
 #0 0x0000557a0430998a llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/home/username/usrlocal/bin/clang-11+0x1d0f98a)
 #1 0x0000557a04307654 llvm::sys::RunSignalHandlers() (/home/username/usrlocal/bin/clang-11+0x1d0d654)
 #2 0x0000557a043077a3 SignalHandler(int) (/home/username/usrlocal/bin/clang-11+0x1d0d7a3)
 #3 0x00007f1bd04f3980 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12980)
 #4 0x00007f1bcf1a4fb7 raise /build/glibc-S9d2JN/glibc-2.27/signal/../sysdeps/unix/sysv/linux/raise.c:51:0
 #5 0x00007f1bcf1a6921 abort /build/glibc-S9d2JN/glibc-2.27/stdlib/abort.c:81:0
 #6 0x00007f1bcf19648a __assert_fail_base /build/glibc-S9d2JN/glibc-2.27/assert/assert.c:89:0
 #7 0x00007f1bcf196502 (/lib/x86_64-linux-gnu/libc.so.6+0x30502)
 #8 0x00007f1bcec77ef4 CreateAugmentedPrimal(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, bool, bool, bool, bool) (/usr/local/lib/ClangEnzyme-11.so+0x445ef4)
 #9 0x00007f1bcece1aeb AdjointGenerator<AugmentedReturn const*>::visitCallInst(llvm::CallInst&) (/usr/local/lib/ClangEnzyme-11.so+0x4afaeb)
#10 0x00007f1bcecd8a2b llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::delegateCallInst(llvm::CallInst&) (/usr/local/lib/ClangEnzyme-11.so+0x4a6a2b)
#11 0x00007f1bcecb91b3 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visitCall(llvm::CallInst&) (/usr/local/lib/ClangEnzyme-11.so+0x4871b3)
#12 0x00007f1bcecab549 llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction&) (/usr/local/lib/ClangEnzyme-11.so+0x479549)
#13 0x00007f1bcec9fe3d llvm::InstVisitor<AdjointGenerator<AugmentedReturn const*>, void>::visit(llvm::Instruction*) (/usr/local/lib/ClangEnzyme-11.so+0x46de3d)
#14 0x00007f1bcec81ae3 CreatePrimalAndGradient(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, bool, bool, llvm::Type*, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, AugmentedReturn const*, bool, bool, bool) (/usr/local/lib/ClangEnzyme-11.so+0x44fae3)
#15 0x00007f1bcec5b378 bool (anonymous namespace)::Enzyme::HandleAutoDiff<llvm::CallInst>(llvm::CallInst*, llvm::TargetLibraryInfo&, llvm::AAResults&, bool) (/usr/local/lib/ClangEnzyme-11.so+0x429378)
#16 0x00007f1bcec574d2 (anonymous namespace)::Enzyme::lowerEnzymeCalls(llvm::Function&, bool, bool&, std::set<llvm::Function*, std::less<llvm::Function*>, std::allocator<llvm::Function*> >&) (/usr/local/lib/ClangEnzyme-11.so+0x4254d2)
#17 0x00007f1bcec57dfb (anonymous namespace)::Enzyme::runOnModule(llvm::Module&) (/usr/local/lib/ClangEnzyme-11.so+0x425dfb)
#18 0x0000557a03cc9a81 llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/username/usrlocal/bin/clang-11+0x16cfa81)
#19 0x0000557a04589c54 (anonymous namespace)::EmitAssemblyHelper::EmitAssembly(clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/username/usrlocal/bin/clang-11+0x1f8fc54)
#20 0x0000557a0458b6f4 clang::EmitBackendOutput(clang::DiagnosticsEngine&, clang::HeaderSearchOptions const&, clang::CodeGenOptions const&, clang::TargetOptions const&, clang::LangOptions const&, llvm::DataLayout const&, llvm::Module*, clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/username/usrlocal/bin/clang-11+0x1f916f4)
#21 0x0000557a05133df5 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/home/username/usrlocal/bin/clang-11+0x2b39df5)
#22 0x0000557a05cb7fe9 clang::ParseAST(clang::Sema&, bool, bool) (/home/username/usrlocal/bin/clang-11+0x36bdfe9)
#23 0x0000557a05133fa8 clang::CodeGenAction::ExecuteAction() (/home/username/usrlocal/bin/clang-11+0x2b39fa8)
#24 0x0000557a04b1bd39 clang::FrontendAction::Execute() (/home/username/usrlocal/bin/clang-11+0x2521d39)
#25 0x0000557a04ad667a clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/home/username/usrlocal/bin/clang-11+0x24dc67a)
#26 0x0000557a04be6486 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/home/username/usrlocal/bin/clang-11+0x25ec486)
#27 0x0000557a032070fc cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/home/username/usrlocal/bin/clang-11+0xc0d0fc)
#28 0x0000557a03202479 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) (/home/username/usrlocal/bin/clang-11+0xc08479)
#29 0x0000557a03185fd4 main (/home/username/usrlocal/bin/clang-11+0xb8bfd4)
#30 0x00007f1bcf187bf7 __libc_start_main /build/glibc-S9d2JN/glibc-2.27/csu/../csu/libc-start.c:344:0
#31 0x0000557a03201c4a _start (/home/username/usrlocal/bin/clang-11+0xc07c4a)
clang-11: error: unable to execute command: Aborted (core dumped)
clang-11: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 11.1.0
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/local/bin
clang-11: note: diagnostic msg: 
********************

PLEASE ATTACH THE FOLLOWING FILES TO THE BUG REPORT:
Preprocessed source(s) and associated run script(s) are located at:
clang-11: note: diagnostic msg: /tmp/complex-8fd898.cpp
clang-11: note: diagnostic msg: /tmp/complex-8fd898.sh
clang-11: note: diagnostic msg: 

********************

Build of LLVMEnzyme should set soname.

Yggdrasil audit points out that we currently don't and this causes issues when developing Enzyme.jl on a dev build of Enzyme.

patchelf --set-soname /path/to/LLVMEnzyme-9.so LLVMEnzyme-9.so

Compilation fails when printf is given a string of length 1

Hello,

Compilation fails when printf is called with a string of length 1 inside the function being differentiated, when that function has been wrapped.

Clang version 11.1.0
Enzyme: a recent build from a few days ago (commit ae09f3d).

bugprintf.cpp

#include <iostream>

using namespace std;

// Marker passed to __enzyme_autodiff ahead of each (value, shadow) pointer
// pair in compute_d below. NOTE(review): presumably Enzyme's "duplicated
// argument" activity tag — confirm against the Enzyme documentation.
int enzyme_dup;
// Variadic entry point; only declared here, no definition is given in this
// file. The crash log for this issue shows the Enzyme pass handling the call.
void __enzyme_autodiff(...);

// Forward declaration of the function template that gets differentiated.
template <typename T>
void compute( double* x, double* out );

// Generic definition: forwards both pointers unchanged to the static
// member T::comp. An explicit specialization for Fun2 appears later in
// this file.
template< typename T>
void compute( double* x, double* out)
{
    T::comp(x,out);
}

// Calls __enzyme_autodiff on compute<T>.
//   x   - input pointer handed to compute<T>
//   res - passed right after x under the same enzyme_dup marker
//         (presumably the shadow of x that receives the gradient — TODO confirm)
template<typename T>
void compute_d(double* x, double* res)
{
  // Local storage for compute<T>'s output and the value paired with it.
  double out = 0.0;
  double dout = 1.0; // presumably the seed for the derivative of out — TODO confirm
    __enzyme_autodiff(compute<T >,
                                        enzyme_dup, x,res,
                                        enzyme_dup,&out,&dout);

}

// Wrapper class template: D<T>::comp(v, out) calls compute_d<T>(v, out),
// so the __enzyme_autodiff call is reached through a static member of a
// class template.
template<typename T>
class D
{
public:
  // Empty default constructor; the class has no data members.
  D()
  {}

  // Forwards to compute_d<T> with the same two pointers.
  static void comp(double* v, double* out )
  {
      compute_d< T >(v,out);
  }
};

// Empty tag type used only to select the specialization below.
class Fun2{};
// Explicit specialization of compute for Fun2: prints two strings and
// writes 3 * x[0] * x[0] to *out.
template<>
void compute<Fun2>( double* x, double* out )
{
  printf("a");// a single-character string makes the compiler crash
  printf("ab");// but a multi-character printf works fine
  *out = 3*x[0] * x[0] ;
}

// Driver for the reproducer: allocates a one-element input (x[0] = 5.0),
// calls D<Fun2>::comp(x, &g) and prints g. argc/argv are unused, and x is
// allocated with new[] but not released before main returns.
int main(int argc, char** argv )
{
  cout<<"bug printf "<<endl;
  const int d = 1;
  double* x = new double[d] ;
  x[0] = 5.0;

  //Make the compiler crash when a single character printf is present above
  double g=0.0;
  D<Fun2>::comp(x, &g);
  cout << "g " << g << endl ;
}

Compilation with :
clang bugprintf.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o bugprintf -fno-exceptions

Here is the output of the compilation : (as a file to keep the thread readable)
bugprintf.log

How can I get the torch_enzyme and tf_enzyme

Hi, how can I get the torch_enzyme and tf_enzyme described in the paper? It seems that they have not been open-sourced?

LLVM build does not prefix libraries on Linux

I had to do some gymnastics with the linker to find enzyme in the system search path. I don't know much about the LLVM build system, but

if (UNIX)
    set(CMAKE_SHARED_LIBRARY_PREFIX "lib")
endif()

had no effect.

Compilation failed : UNREACHABLE executed /Enzyme/llvm/lib/IR/Value.cpp:887!

The instruction
W[indj] += wjk;

makes the compiler crash.

#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <random>
#include <math.h>
#include <vector>
#include <algorithm>


using namespace std;


// Cell index filled in by buildIndex. "Cell c" below means the c-th distinct
// cell id in ascending order.
struct Index
{
    int* cellId;    // cellId[i]: cell id of particle i
    int* start;     // start[c]: position in argsorted where cell c begins
    int* cellSize;  // cellSize[c]: number of particles in cell c
    int size;       // number of distinct cells (valid entries of start/cellSize)
    int* argsorted; // particle ids ordered by (cell id, particle id)
    int n;          // number of particles (allocated length of every array)
} ;


// Builds a per-cell lookup over n particles stored as 3 consecutive doubles
// each in `parts`.
//
// The cell id of particle i is its first coordinate parts[3*i] converted to
// int (truncation). Particles are sorted by (cell id, particle id) and the
// result is written into `index`:
//   cellId[i]    cell id of particle i
//   argsorted[k] particle id found at sorted position k
//   start[c]     sorted position at which the c-th distinct cell begins
//   cellSize[c]  number of particles in the c-th distinct cell
//   size         number of distinct cells
//   n            number of particles
// The four arrays are allocated here with new[]; this function never frees
// them, so the caller owns them afterwards.
void buildIndex( Index& index , double * parts, int n )
{
    const int d = 3;
    index.n = n;
    index.cellId = new int[n];
    index.start = new int[n];
    index.cellSize = new int[n]; // at most n cells; the tail may stay unused
    index.argsorted = new int[n];

    // (cell id, particle id) pairs; sorting them groups particles by cell and
    // keeps particle ids ascending inside a cell.
    vector<pair<int,int> > v(n);
    for( int i = 0 ; i < n ; i++)
    {
        const int id = static_cast<int>(parts[d*i]);
        index.cellId[i] = id;
        v[i].first = id;
        v[i].second = i;
    }

    sort( v.begin(), v.end() );

    int cur = -1;       // index of the cell currently being filled
    int curCellId = -1; // id of that cell (only meaningful once cur >= 0)
    for( int i = 0 ; i < n ; i++)
    {
        index.argsorted[i] = v[i].second;
        // The first sorted entry always opens a new cell. Comparing against
        // the -1 sentinel alone would write to cellSize[-1] whenever the
        // smallest cell id happens to be -1.
        if( cur >= 0 && v[i].first == curCellId )
        {
            index.cellSize[cur]++;
        }
        else
        {
            curCellId = v[i].first;
            cur ++;
            index.cellSize[cur] = 1;
            index.start[cur] = i;
        }
    }
    index.size = cur+1;

}


// Function handed to __enzyme_autodiff in main.
//
//   parts : n particles, d = 3 consecutive doubles each
//   n     : number of particles
//   index : cell index (see buildIndex); only size, cellSize, start and
//           argsorted are read here
//
// For every cell it visits all ordered pairs (j, k) of particles in that
// cell, computes the squared distance djk over the 3 coordinates and adds
// wjk = 1 + djk to W[indj]. In the form shown here every statement that
// would feed F or `out` is commented out, so `out` is never modified and the
// function returns 0.
double foo( double* __restrict__ parts,int n, Index* __restrict__ index)
{
     double out = 0;
     const int d = 3;

     // F and W are runtime-sized local arrays (a compiler extension in C++).
     double F[n*d];

     double W[n];

     // Zero both accumulators.
     for( int i = 0 ; i < n ; i++)
     {
         for( int j = 0 ; j < d ; j++)
         {
             F[i*d+j] = 0.0;
         }
         W[i] = 0.0;
     }

     // i: cell, j/k: positions of the two particles inside the cell.
     for( int i = 0 ; i < index->size ; i++)
     {
         for( int j = 0 ; j < index->cellSize[i] ; j++ )
         {
             for( int k = 0 ; k < index->cellSize[i] ; k++ )
             {
                 // Map in-cell positions to particle ids.
                 int indj = index->argsorted[index->start[i]+j];
                 int indk = index->argsorted[index->start[i]+k];

                 // Squared Euclidean distance between particles indj and indk.
                 double djk = 0;
                 for( int l = 0 ; l < d ; l++)
                 {
                     double temp;
                     temp = parts[indj * d +l ]- parts[indk * d +l ];
                     djk += temp*temp;
                 }
                 //out += djk;

                 double wjk = 1.0+djk; // strictly positive
                 /*
                 for( int l = 0 ; l < d ; l++)
                 {
                     F[indj*d+l] += wjk * parts[indk*d+l];
                 }*/

                 // This accumulation is the statement reported to trigger the crash.
                 W[indj] += wjk;

            }
         }
     }

     // NOTE(review): the disabled normalisation below indexes W with i*d+j
     // although W has only n elements — check before re-enabling.
     /*
    //Normalize the field value
    for( int i = 0 ; i < n ; i++)
    {
        for( int j = 0 ; j < d ; j++)
        {
            F[i*d+j] /= W[i*d+j];
        }
    }
*/
/*
    //Compute the energy
    for( int i = 0 ; i < n ; i++)
    {
        double e = 0.0;
        for( int j = 0 ; j < d ; j++)
        {
            out += F[i*d+j]*F[i*d+j];
        }
    }
*/

     //delete[] F;
     //delete[] W;

     return out;
}


int enzyme_dup;
int enzyme_out;
int enzyme_const;

typedef double (*f_ptr)(double *,int,Index*);

extern double __enzyme_autodiff(f_ptr,
    int, double *, double *,
    int, int,
    int, Index*);


// Driver for the reproducer: fills n*d coordinates with values drawn from a
// uniform distribution on [0, 10) using a fixed seed (42), builds the cell
// index, prints the first 100 cell ids, calls __enzyme_autodiff on foo and
// prints the first 100 triples of d_x. Nothing allocated here (x, d_x, the
// arrays inside index) is released before returning.
int main() {
    std::mt19937 e2(42);
    std::uniform_real_distribution<> dist(0, 10);
    int n = 100000;
    int d = 3;
    double* x = new double[n*d];
    double* d_x = new double[n*d];
    for( int i = 0 ; i < n*d ; i++)
    {
        x[i] = dist(e2);
        d_x[i] = 0.0; // start the companion buffer at zero
    }

    Index index;
    buildIndex(index, x, n);

    for( int i = 0 ; i < 100 ; i++)
    {
    printf("cellId[%d] = %d\n ",i, index.cellId[i]);
    }

    printf("before autodiff\n");
    // x/d_x are passed as an enzyme_dup pair; n and &index are tagged
    // enzyme_const. The double returned by __enzyme_autodiff is discarded.
    __enzyme_autodiff(foo,
        enzyme_dup, x, d_x,
        enzyme_const, n,
        enzyme_const, &index);


    //printf("%f \n", y);
    for( int i = 0 ; i < 100 ; i++)
    {
    printf("dx[%d] = [%f, %f, %f]\n ",i, d_x[d*i],d_x[d*i+1],d_x[d*i+2]);
    }

}

compiled with :
clang test2.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-7.so -O2 -o test2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -fno-exceptions

While deleting: i32 %_unwrap12
An asserting value handle still pointed to this value!
UNREACHABLE executed at /home/username/Enzyme/llvm/lib/IR/Value.cpp:887!
Stack dump:
0.	Program arguments: /home/username/usrlocal/bin/clang-7 -cc1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -main-file-name test2.cpp -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -resource-dir /home/username/usrlocal/lib/clang/7.1.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/x86_64-linux-gnu/c++/7.5.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.5.0/../../../../include/c++/7.5.0/backward -internal-isystem /usr/local/include -internal-isystem /home/username/usrlocal/lib/clang/7.1.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -fdeprecated-macro -fdebug-compilation-dir /home/username/testenzyme -ferror-limit 19 -fmessage-length 80 -fno-unroll-loops -fobjc-runtime=gcc -fdiagnostics-show-option -fcolor-diagnostics -load /usr/local/lib/ClangEnzyme-7.so -o /tmp/test2-6e834d.o -x c++ test2.cpp -faddrsig 
1.	<eof> parser at end of file
2.	Per-module optimization passes
3.	Running pass 'Enzyme Pass' on module 'test2.cpp'.
#0 0x000055d999b1537a llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/home/username/usrlocal/bin/clang-7+0x170637a)
#1 0x000055d999b137d4 llvm::sys::RunSignalHandlers() (/home/username/usrlocal/bin/clang-7+0x17047d4)
#2 0x000055d999b13912 SignalHandler(int) (/home/username/usrlocal/bin/clang-7+0x1704912)
#3 0x00007f2cc6eab980 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x12980)
#4 0x00007f2cc5b5cfb7 gsignal /build/glibc-S9d2JN/glibc-2.27/signal/../sysdeps/unix/sysv/linux/raise.c:51:0
#5 0x00007f2cc5b5e921 abort /build/glibc-S9d2JN/glibc-2.27/stdlib/abort.c:81:0
#6 0x000055d999ab8e1a (/home/username/usrlocal/bin/clang-7+0x16a9e1a)
#7 0x000055d999678d22 llvm::ValueHandleBase::ValueIsDeleted(llvm::Value*) (/home/username/usrlocal/bin/clang-7+0x1269d22)
#8 0x000055d99967993d llvm::Value::~Value() (/home/username/usrlocal/bin/clang-7+0x126a93d)
#9 0x000055d999679a20 llvm::Value::deleteValue() (/home/username/usrlocal/bin/clang-7+0x126aa20)
#10 0x000055d999604d24 llvm::Instruction::eraseFromParent() (/home/username/usrlocal/bin/clang-7+0x11f5d24)
#11 0x000055d99991d3ec llvm::GVN::processBlock(llvm::BasicBlock*) (/home/username/usrlocal/bin/clang-7+0x150e3ec)
#12 0x000055d99991d87f llvm::GVN::iterateOnFunction(llvm::Function&) (/home/username/usrlocal/bin/clang-7+0x150e87f)
#13 0x000055d99991da2f llvm::GVN::runImpl(llvm::Function&, llvm::AssumptionCache&, llvm::DominatorTree&, llvm::TargetLibraryInfo const&, llvm::AAResults&, llvm::MemoryDependenceResults*, llvm::LoopInfo*, llvm::OptimizationRemarkEmitter*) (/home/username/usrlocal/bin/clang-7+0x150ea2f)
#14 0x000055d99991e5ea llvm::GVN::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (/home/username/usrlocal/bin/clang-7+0x150f5ea)
#15 0x00007f2cc56de0d5 optimizeIntermediate(GradientUtils*, bool, llvm::Function*) (/usr/local/lib/ClangEnzyme-7.so+0x4db0d5)
#16 0x00007f2cc563c857 CreatePrimalAndGradient(llvm::Function*, DIFFE_TYPE, std::vector<DIFFE_TYPE, std::allocator<DIFFE_TYPE> > const&, llvm::TargetLibraryInfo&, TypeAnalysis&, llvm::AAResults&, bool, bool, bool, llvm::Type*, FnTypeInfo const&, std::map<llvm::Argument*, bool, std::less<llvm::Argument*>, std::allocator<std::pair<llvm::Argument* const, bool> > >, AugmentedReturn const*, bool, bool, bool) (/usr/local/lib/ClangEnzyme-7.so+0x439857)
#17 0x00007f2cc5618065 bool HandleAutoDiff<llvm::CallInst>(llvm::CallInst*, llvm::TargetLibraryInfo&, llvm::AAResults&, bool) (/usr/local/lib/ClangEnzyme-7.so+0x415065)
#18 0x00007f2cc56103ad (anonymous namespace)::Enzyme::lowerEnzymeCalls(llvm::Function&, bool, bool&, std::set<llvm::Function*, std::less<llvm::Function*>, std::allocator<llvm::Function*> >&) (/usr/local/lib/ClangEnzyme-7.so+0x40d3ad)
#19 0x00007f2cc5610e12 (anonymous namespace)::Enzyme::runOnModule(llvm::Module&) (/usr/local/lib/ClangEnzyme-7.so+0x40de12)
#20 0x000055d999636a68 llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/username/usrlocal/bin/clang-7+0x1227a68)
#21 0x000055d999d1e34c clang::EmitBackendOutput(clang::DiagnosticsEngine&, clang::HeaderSearchOptions const&, clang::CodeGenOptions const&, clang::TargetOptions const&, clang::LangOptions const&, llvm::DataLayout const&, llvm::Module*, clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >) (/home/username/usrlocal/bin/clang-7+0x190f34c)
#22 0x000055d99a50e3b8 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/home/username/usrlocal/bin/clang-7+0x20ff3b8)
#23 0x000055d99add1e49 clang::ParseAST(clang::Sema&, bool, bool) (/home/username/usrlocal/bin/clang-7+0x29c2e49)
#24 0x000055d99a50cff8 clang::CodeGenAction::ExecuteAction() (/home/username/usrlocal/bin/clang-7+0x20fdff8)
#25 0x000055d99a18a01e clang::FrontendAction::Execute() (/home/username/usrlocal/bin/clang-7+0x1d7b01e)
#26 0x000055d99a14f32e clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/home/username/usrlocal/bin/clang-7+0x1d4032e)
#27 0x000055d99a22d00b clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/home/username/usrlocal/bin/clang-7+0x1e1e00b)
#28 0x000055d998d4beb8 cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/home/username/usrlocal/bin/clang-7+0x93ceb8)
#29 0x000055d998cfde2d main (/home/username/usrlocal/bin/clang-7+0x8eee2d)
#30 0x00007f2cc5b3fbf7 __libc_start_main /build/glibc-S9d2JN/glibc-2.27/csu/../csu/libc-start.c:344:0
#31 0x000055d998d47c1a _start (/home/username/usrlocal/bin/clang-7+0x938c1a)
clang-7: error: unable to execute command: Aborted (core dumped)
clang-7: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 7.1.0 
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/local/bin
clang-7: note: diagnostic msg: PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace, preprocessed source, and associated run script.
clang-7: note: diagnostic msg: 
********************

PLEASE ATTACH THE FOLLOWING FILES TO THE BUG REPORT:
Preprocessed source(s) and associated run script(s) are located at:
clang-7: note: diagnostic msg: /tmp/test2-3778d1.cpp
clang-7: note: diagnostic msg: /tmp/test2-3778d1.sh
clang-7: note: diagnostic msg:

The preprocessed source /tmp/test2-3778d1.cpp is 3.7 MB; I can attach it if necessary.

Incorrect forward pass when running AD

//#include "XSbench_header.cuh"
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <stdint.h>
#include <string.h>

// Structures
// One point of a nuclide's energy grid. In this reduced reproducer only
// `energy` is ever assigned (see grid_init_do_not_profile); the remaining
// cross-section channels are commented out.
typedef struct{
	double energy;
	double total_xs;
	//double elastic_xs;
	//double absorbtion_xs;
	//double fission_xs;
} NuclideGridPoint;

// Run parameters set in main and passed by value to the init and lookup code.
typedef struct{
	long n_isotopes;   // number of isotopes in the simulation
	long n_gridpoints; // grid points per isotope
	int grid_type; // 0: Unionized Grid (default)    1: Nuclide Grid
} Inputs;

// All arrays produced by grid_init_do_not_profile together with their lengths.
// d_nuclide_grid is a zero-initialised array of the same length as
// nuclide_grid; the lookup kernel passes the two as an enzyme_dup pair.
typedef struct{
	int * num_nucs;                     // Length = length_num_nucs;
	int * mats;                         // Length = length_mats
	double * unionized_energy_array;    // Length = length_unionized_energy_array
	int * index_grid;                   // Length = length_index_grid
	NuclideGridPoint * nuclide_grid;    // Length = length_nuclide_grid
	NuclideGridPoint * d_nuclide_grid;  // Length = length_nuclide_grid
	int length_num_nucs;
	int length_mats;
	int length_unionized_energy_array;
	long length_index_grid;
	int length_nuclide_grid;
	int max_num_nucs;                   // largest entry of num_nucs (row width of mats)
} SimulationData;



// Grid types
#define UNIONIZED 0
#define NUCLIDE 1
#define HASH 2

// Simulation types
#define HISTORY_BASED 1
#define EVENT_BASED 2

// Binary Mode Type
#define NONE 0
#define READ 1
#define WRITE 2

// Starting Seed
#define STARTING_SEED 1070


// qsort-style comparator for doubles: returns 1 if *a > *b, -1 if *a < *b and
// 0 otherwise (which includes the case where the two compare unordered).
int double_compare(const void * a, const void * b)
{
	const double lhs = *static_cast<const double *>(a);
	const double rhs = *static_cast<const double *>(b);

	if( lhs > rhs )
		return 1;
	return ( lhs < rhs ) ? -1 : 0;
}

int NGP_compare(const void * a, const void * b)
{
	NuclideGridPoint A = *((NuclideGridPoint *) a);
	NuclideGridPoint B = *((NuclideGridPoint *) b);

	if( A.energy > B.energy )
		return 1;
	else if( A.energy < B.energy )
		return -1;
	else
		return 0;
}

// num_nucs represents the number of nuclides that each material contains
// Returns a freshly malloc'ed array of 12 ints holding the number of nuclides
// in each material. The caller owns the array.
//
// Entry 0 (fuel) is 34 when n_isotopes == 68 and 321 otherwise; entries 1..11
// are fixed.
int * load_num_nucs(long n_isotopes)
{
	// Nuclide counts of the non-fuel materials 1..11.
	static const int non_fuel[11] = { 5, 4, 4, 27, 21, 21, 21, 21, 21, 9, 9 };

	int * counts = (int*)malloc(12*sizeof(int));

	// Material 0 is the special case: small problem -> 34, otherwise 321.
	counts[0] = ( n_isotopes == 68 ) ? 34 : 321;

	for( int m = 0; m < 11; m++ )
		counts[m + 1] = non_fuel[m];

	return counts;
}

// Assigns an array of nuclide ID's to each material
// Builds the flattened 12 x (*max_num_nucs) table of nuclide ids per material.
//
//   num_nucs     : 12 entries, number of nuclides in each material
//   n_isotopes   : 68 selects the small fuel list, anything else the large one
//   max_num_nucs : out-parameter, set to the largest entry of num_nucs; it is
//                  also the row stride of the returned table
//
// Row m of the result starts at offset m * (*max_num_nucs) and its first
// num_nucs[m] entries are copied from the built-in list for material m; the
// rest of each row is left as malloc returned it. The caller owns the array.
int * load_mats( int * num_nucs, long n_isotopes, int * max_num_nucs )
{
	const int num_mats = 12;

	// Row stride = widest material.
	*max_num_nucs = 0;
	for( int m = 0; m < num_mats; m++ )
		if( num_nucs[m] > *max_num_nucs )
			*max_num_nucs = num_nucs[m];

	int * mats = (int *) malloc( num_mats * (*max_num_nucs) * sizeof(int) );

	// Fuel, small variant: 34 nuclides.
	const int fuel_small[] = { 58, 59, 60, 61, 40, 42, 43, 44, 45, 46, 1, 2, 3, 7,
	                           8, 9, 10, 29, 57, 47, 48, 0, 62, 15, 33, 34, 52, 53,
	                           54, 55, 56, 18, 23, 41 };
	// Fuel, large variant: the same 34 ids followed by 68, 69, ... up to 321
	// entries in total.
	int fuel_large[321] = { 58, 59, 60, 61, 40, 42, 43, 44, 45, 46, 1, 2, 3, 7,
	                        8, 9, 10, 29, 57, 47, 48, 0, 62, 15, 33, 34, 52, 53,
	                        54, 55, 56, 18, 23, 41 };
	for( int i = 34; i < 321; i++ )
		fuel_large[i] = 68 + (i - 34);

	// Non-fuel materials.
	const int cladding[]      = { 63, 64, 65, 66, 67 };
	const int cold_water[]    = { 24, 41, 4, 5 };
	const int hot_water[]     = { 24, 41, 4, 5 };
	const int rpv[]           = { 19, 20, 21, 22, 35, 36, 37, 38, 39, 25, 27, 28, 29,
	                              30, 31, 32, 26, 49, 50, 51, 11, 12, 13, 14, 6, 16,
	                              17 };
	const int lower_refl[]    = { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
	                              49, 50, 51, 11, 12, 13, 14 };
	const int top_refl[]      = { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
	                              49, 50, 51, 11, 12, 13, 14 };
	const int bottom_plate[]  = { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
	                              49, 50, 51, 11, 12, 13, 14 };
	const int bottom_nozzle[] = { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
	                              49, 50, 51, 11, 12, 13, 14 };
	const int top_nozzle[]    = { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
	                              49, 50, 51, 11, 12, 13, 14 };
	const int top_fa[]        = { 24, 41, 4, 5, 63, 64, 65, 66, 67 };
	const int bottom_fa[]     = { 24, 41, 4, 5, 63, 64, 65, 66, 67 };

	// Source list for every material, in material order.
	const int * const rows[12] = {
		( n_isotopes == 68 ) ? fuel_small : fuel_large,
		cladding, cold_water, hot_water, rpv, lower_refl, top_refl,
		bottom_plate, bottom_nozzle, top_nozzle, top_fa, bottom_fa
	};

	for( int m = 0; m < num_mats; m++ )
		memcpy( mats + *max_num_nucs * m, rows[m], num_nucs[m] * sizeof(int) );

	return mats;
}


// Advances the linear congruential generator stored in *seed by one step
// (seed <- (a * seed + c) mod 2^63) and returns the new state divided by 2^63.
double LCG_random_double(uint64_t * seed)
{
	const uint64_t modulus    = 9223372036854775808ULL; // 2^63
	const uint64_t multiplier = 2806196910506780709ULL;
	const uint64_t increment  = 1ULL;

	const uint64_t next = (multiplier * (*seed) + increment) % modulus;
	*seed = next;
	return (double) next / (double) modulus;
}

template<typename... Args>
//__device__ 
void __enzyme_autodiff(void*, Args...);

int enzyme_dup, enzyme_const, enzyme_active;

// picks a material based on a probabilistic distribution
// Draws one value from LCG_random_double(seed) and maps it to a material id
// in [0, 11] using the volume fractions below.
//
// For candidate i the threshold is dist[i] + dist[i-1] + ... + dist[1]
// (summed in that order); the first i whose threshold exceeds the draw is
// returned. dist[0] never enters a threshold, and 0 is returned when no
// candidate matches.
int pick_mat( uint64_t * seed )
{
	// Fractions (by volume) of each material in the core.
	const double dist[12] = {
		0.140,	// fuel
		0.052,	// cladding
		0.275,	// cold, borated water
		0.134,	// hot, borated water
		0.154,	// RPV
		0.064,	// Lower, radial reflector
		0.066,	// Upper reflector / top plate
		0.055,	// bottom plate
		0.008,	// bottom nozzle
		0.015,	// top nozzle
		0.025,	// top of fuel assemblies
		0.013	// bottom of fuel assemblies
	};

	const double roll = LCG_random_double(seed);

	int i = 0;
	while( i < 12 )
	{
		// Accumulate from dist[i] downwards to dist[1].
		double running = 0;
		int j = i;
		while( j > 0 )
		{
			running += dist[j];
			j--;
		}
		if( roll < running )
			return i;
		i++;
	}

	return 0;
}

// Returns the state reached by applying the LCG step
// (s <- (a * s + c) mod 2^63) n times to `seed`, using square-and-multiply on
// the affine map so that the work is proportional to the number of bits in n.
uint64_t fast_forward_LCG(uint64_t seed, uint64_t n)
{
	const uint64_t modulus = 9223372036854775808ULL; // 2^63
	uint64_t mul = 2806196910506780709ULL; // multiplier of the current 2^k-step map
	uint64_t inc = 1ULL;                   // increment of the current 2^k-step map

	// Accumulated map, starting as the identity.
	uint64_t mul_acc = 1;
	uint64_t inc_acc = 0;

	for( n %= modulus; n > 0; n >>= 1 )
	{
		if( n & 1 )
		{
			mul_acc *= mul;
			inc_acc = inc_acc * mul + inc;
		}
		// Square the step map: compose it with itself.
		inc *= (mul + 1);
		mul *= mul;
	}

	return (mul_acc * seed + inc_acc) % modulus;
}

// Calculates the microscopic cross section for a given nuclide & energy
// Calculates the microscopic cross section for a given nuclide & energy
//
//   nuc           : nuclide id (row selector into nuclide_grids / column into index_data)
//   n_isotopes    : row stride of index_data
//   n_gridpoints  : row stride of nuclide_grids
//   index_data    : only read when grid_type != NUCLIDE
//   nuclide_grids : flattened [nuclide][gridpoint] array
//   xs_vector     : output; only xs_vector[0] is written
//   grid_type     : NUCLIDE selects the search branch, anything else the
//                   index_data branch
//
// In this reduced reproducer the lookup position `idx` is a hard-coded
// constant rather than being derived from a particle energy.
__attribute__((always_inline))
void calculate_micro_xs(   int nuc, long n_isotopes,
                           long n_gridpoints,
						   int * __restrict__ index_data,
                           NuclideGridPoint * __restrict__ nuclide_grids,
                           double * __restrict__ xs_vector, int grid_type){
	// Variables
	long idx = 420020; // fixed row of index_data used by the non-NUCLIDE branch
	NuclideGridPoint * low;

	// If using only the nuclide grid, we must perform a binary search
	// to find the energy location in this particular nuclide's grid.
	if( grid_type == NUCLIDE )
	{

		{
			long lowerLimit = 0;
			long upperLimit = n_gridpoints-1;
			long examinationPoint;
			long length = upperLimit - lowerLimit;

			// Exactly 10 bisection steps, comparing the grid energy against 0.
			for (int j=0; j<10; j++)
			{
				examinationPoint = lowerLimit + (length / 2);
				
				if( nuclide_grids[nuc*n_gridpoints + examinationPoint].energy > 0 )
					upperLimit = examinationPoint;
				else
					lowerLimit = examinationPoint;
				
				length = upperLimit - lowerLimit;
			}
	
			idx = lowerLimit;
		}

		// pull ptr from nuclide grid and check to ensure that
		// we're not reading off the end of the nuclide's grid
		// (the last-point case falls back to grid point 1 of this nuclide)
		if( idx == n_gridpoints - 1 )
			low = &nuclide_grids[nuc*n_gridpoints + 1];
		else
			low = &nuclide_grids[nuc*n_gridpoints + idx];
	}
	else {
		// pull ptr from energy grid and check to ensure that
		// we're not reading off the end of the nuclide's grid
		// (the last-point case falls back to grid point 2 of this nuclide)
		if( index_data[idx * n_isotopes + nuc] == n_gridpoints - 1 )
			low = &nuclide_grids[nuc*n_gridpoints + 2];
		else
			low = &nuclide_grids[nuc*n_gridpoints + index_data[idx * n_isotopes + nuc]];
	}
	
	// Total XS
	xs_vector[0] = 1 / (1.0 - low->energy);
}

// Function passed to __enzyme_autodiff by xs_lookup_kernel_baselineLocal.
//
// Resets macro_xs_vector[0], then for the first two nuclide ids in `mats`
// computes one micro cross section via calculate_micro_xs and adds it to
// macro_xs_vector[0], [1] and [2]. Elements 1 and 2 are accumulated into
// without being reset here, so they keep whatever the caller stored in them.
// The remaining parameters are forwarded unchanged to calculate_micro_xs.
void calculate_macro_xs( long n_isotopes,
                         long n_gridpoints,
						 int * __restrict__ index_data,
                         NuclideGridPoint * __restrict__ nuclide_grids,
                         int * __restrict__ mats,
                         double * __restrict__ macro_xs_vector, int grid_type){

	// cleans out macro_xs_vector
	macro_xs_vector[0] = 0;

	for( int j = 0; j < 2; j++ )
	{
		double xs_vector; // single-element output buffer for the micro lookup
		int nuc = mats[j];
		calculate_micro_xs( nuc, n_isotopes,
		                    n_gridpoints, index_data,
		                    nuclide_grids, &xs_vector, grid_type);
		for( int k = 0; k < 3; k++ ) {
			macro_xs_vector[k] += xs_vector;
			//printf("xs_vector[k=%d] j=%d %f\n", k, j, xs_vector[k]);
			//printf("xs_vector[k=%d] j=%d %f\n", k, j, 1.0);
		}
		//printf("mid\n");
	}
}

// Runs a single lookup (i == 0): seeds the LCG, calls __enzyme_autodiff on
// calculate_macro_xs with the arrays from GSD, and prints the five entries of
// macro_xs_vector afterwards.
void xs_lookup_kernel_baselineLocal(Inputs in, SimulationData GSD )
{
	// The lookup ID. Used to set the seed, and to store the verification value
	const int i = 0;

	// Set the initial seed value
	uint64_t seed = STARTING_SEED;	

	// Forward seed to lookup index (we need 2 samples per lookup)
	seed = fast_forward_LCG(seed, 2*i);

	// Randomly pick an energy and material for the particle
	// (p_energy is computed but not used further in this function)
	double p_energy = LCG_random_double(&seed);
		
	double macro_xs_vector[5] = {0};
	// Only element 0 is 1.0; the other four elements are zero-initialised.
	double d_macro_xs_vector[5] = {1.0};
		
	// Perform macroscopic Cross Section Lookup
	// NOTE(review): the disabled direct call below passes eight arguments
	// (including GSD.max_num_nucs) while calculate_macro_xs takes seven —
	// adjust before enabling it.
	#if 0
	calculate_macro_xs(
			in.n_isotopes,   // Total number of isotopes in simulation
			in.n_gridpoints, // Number of gridpoints per isotope in simulation
			GSD.index_grid,   // Flattened 2-D grid holding indices into nuclide grid for each unionized energy level
			GSD.nuclide_grid, // Flattened 2-D grid holding energy levels and XS_data for all nuclides in simulation
			GSD.mats,         // Flattened 2-D array with nuclide indices defining composition of each type of material
			macro_xs_vector, // 1-D array with result of the macroscopic cross section (5 different reaction channels)
			in.grid_type,    // Lookup type (nuclide, hash, or unionized)
			GSD.max_num_nucs  // Maximum number of nuclides present in any material
			);
    #else
	__enzyme_autodiff((void*)calculate_macro_xs,
			enzyme_const, in.n_isotopes,   // Total number of isotopes in simulation
			enzyme_const, in.n_gridpoints, // Number of gridpoints per isotope in simulation
			enzyme_const, GSD.index_grid,   // Flattened 2-D grid holding indices into nuclide grid for each unionized energy level

			//enzyme_const, GSD.nuclide_grid, // Flattened 2-D grid holding energy levels and XS_data for all nuclides in simulation
			enzyme_dup, GSD.nuclide_grid, GSD.d_nuclide_grid, // Flattened 2-D grid holding energy levels and XS_data for all nuclides in simulation
			
			enzyme_const, GSD.mats,         // Flattened 2-D array with nuclide indices defining composition of each type of material
			
			//enzyme_const, macro_xs_vector, // 1-D array with result of the macroscopic cross section (5 different reaction channels)
			enzyme_dup, macro_xs_vector, d_macro_xs_vector,// 1-D array with result of the macroscopic cross section (5 different reaction channels)
			
			
			enzyme_const, in.grid_type    // Lookup type (nuclide, hash, or unionized)
			);
    #endif
	// Print the primal output so the forward pass can be checked.
	if (i == 0) {
		for(int j=0; j<5; j++)
			printf("macro_xs_vector[%d]=%f\n", j, macro_xs_vector[j]);
	}
	
}


// Allocates and fills every array of a SimulationData for the given inputs
// and returns it by value. Nothing allocated here is freed by this function
// except the two scratch arrays of the unionized-grid branch.
//
// Notes grounded in the body below:
//   - `mype` is not used.
//   - `nbytes` is accumulated but never read or returned.
//   - Only the `energy` field of each nuclide grid point is assigned; the
//     nuclide grid comes from malloc, so `total_xs` stays uninitialised.
//   - unionized_energy_array, index_grid and their length fields are only
//     set when in.grid_type == UNIONIZED.
SimulationData grid_init_do_not_profile( Inputs in, int mype )
{
	// Structure to hold all allocated simulation data arrays
	SimulationData SD;

	// Keep track of how much data we're allocating
	size_t nbytes = 0;
	
	// Set the initial seed value
	uint64_t seed = 42;	


	// First, we need to initialize our nuclide grid. This comes in the form
	// of a flattened 2D array that hold all the information we need to define
	// the cross sections for all isotopes in the simulation. 
	// The grid is composed of "NuclideGridPoint" structures, which hold the
	// energy level of the grid point and all associated XS data at that level.
	// An array of structures (AOS) is used instead of
	// a structure of arrays, as the grid points themselves are accessed in 
	// a random order, but all cross section interaction channels and the
	// energy level are read whenever the gridpoint is accessed, meaning the
	// AOS is more cache efficient.
	
	// Initialize Nuclide Grid
	SD.length_nuclide_grid = in.n_isotopes * in.n_gridpoints;
	SD.nuclide_grid     = (NuclideGridPoint *) malloc( SD.length_nuclide_grid * sizeof(NuclideGridPoint));
	// Zero-filled companion array of the same length (result is not checked for NULL).
	SD.d_nuclide_grid     = (NuclideGridPoint *) calloc( SD.length_nuclide_grid , sizeof(NuclideGridPoint));
	assert(SD.nuclide_grid != NULL);
	nbytes += SD.length_nuclide_grid * sizeof(NuclideGridPoint);
	for( int i = 0; i < SD.length_nuclide_grid; i++ )
	{
		SD.nuclide_grid[i].energy        = LCG_random_double(&seed);
		//SD.nuclide_grid[i].total_xs      = LCG_random_double(&seed);
		//SD.nuclide_grid[i].elastic_xs    = LCG_random_double(&seed);
		//SD.nuclide_grid[i].absorbtion_xs = LCG_random_double(&seed);
		//SD.nuclide_grid[i].fission_xs    = LCG_random_double(&seed);
	}

	// Sort so that each nuclide has data stored in ascending energy order.
	for( int i = 0; i < in.n_isotopes; i++ )
		qsort( &SD.nuclide_grid[i*in.n_gridpoints], in.n_gridpoints, sizeof(NuclideGridPoint), NGP_compare);
	
	if( in.grid_type == UNIONIZED )
	{

		// Allocate space to hold the union of all nuclide energy data
		SD.length_unionized_energy_array = in.n_isotopes * in.n_gridpoints;
		SD.unionized_energy_array = (double *) malloc( SD.length_unionized_energy_array * sizeof(double));
		assert(SD.unionized_energy_array != NULL );
		nbytes += SD.length_unionized_energy_array * sizeof(double);

		// Copy energy data over from the nuclide energy grid
		for( int i = 0; i < SD.length_unionized_energy_array; i++ )
			SD.unionized_energy_array[i] = SD.nuclide_grid[i].energy;

		// Sort unionized energy array
		qsort( SD.unionized_energy_array, SD.length_unionized_energy_array, sizeof(double), double_compare);

		// Allocate space to hold the acceleration grid indices
		SD.length_index_grid = SD.length_unionized_energy_array * in.n_isotopes;
		SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int));
		assert(SD.index_grid != NULL);
		nbytes += SD.length_index_grid * sizeof(int);

		// Generates the double indexing grid
		// idx_low[i]: current lower grid index for isotope i (starts at 0)
		int * idx_low = (int *) calloc( in.n_isotopes, sizeof(int));
		assert(idx_low != NULL );
		// energy_high[i]: energy of the grid point just above idx_low[i]
		double * energy_high = (double *) malloc( in.n_isotopes * sizeof(double));
		assert(energy_high != NULL );

		for( int i = 0; i < in.n_isotopes; i++ )
			energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + 1].energy;

		// Walk the unionized energies in ascending order; per isotope, either
		// keep the current lower index or advance it by one (never past
		// n_gridpoints - 2).
		for( long e = 0; e < SD.length_unionized_energy_array; e++ )
		{
			double unionized_energy = SD.unionized_energy_array[e];
			for( long i = 0; i < in.n_isotopes; i++ )
			{
				if( unionized_energy < energy_high[i]  )
					SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
				else if( idx_low[i] == in.n_gridpoints - 2 )
					SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
				else
				{
					idx_low[i]++;
					SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
					energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + idx_low[i] + 1].energy;	
				}
			}
		}

		free(idx_low);
		free(energy_high);
	}

	////////////////////////////////////////////////////////////////////
	// Initialize Materials and Concentrations
	///////////////////////////////////////////////////////
	
	// Set the number of nuclides in each material
	SD.num_nucs  = load_num_nucs(in.n_isotopes);
	SD.length_num_nucs = 12; // There are always 12 materials in XSBench

	// Initialize the flattened 2D grid of material data. The grid holds
	// a list of nuclide indices for each of the 12 material types. The
	// grid is allocated as a full square grid, even though not all
	// materials have the same number of nuclides.
	SD.mats = load_mats(SD.num_nucs, in.n_isotopes, &SD.max_num_nucs);
	SD.length_mats = SD.length_num_nucs * SD.max_num_nucs;

	return SD;
}

// Driver for the reproducer: fixes the inputs (11303 grid points, unionized
// grid, 68 isotopes), builds the simulation data and runs the single lookup.
// argc/argv are unused and the arrays inside SD are not freed before exit.
int main( int argc, char* argv[] )
{
	int mype = 0;

	Inputs in;
	
	// defaults to 11303 (corresponding to H-M Large benchmark)
	in.n_gridpoints = 11303;
	
	// default to unionized grid
	in.grid_type = UNIONIZED;
	
	// 68 selects the 34-nuclide fuel list in load_num_nucs / load_mats
	in.n_isotopes = 68;

	SimulationData SD;

	SD = grid_init_do_not_profile( in, mype );

	xs_lookup_kernel_baselineLocal( in, SD );
	return 0;
}

Interestingly, the forward pass is correct when the printf calls are added back.

/mnt/sabrent/wmoses/llvm13/buildallfast/bin/clang++  -fno-experimental-new-pass-manager -std=c++11 -Xclang -load -Xclang /home/wmoses/git/Enzyme/enzyme/build13Fast/Enzyme/ClangEnzyme-13.so  -O3 Main.cpp -o XSBench -lm

CUDA kernels not being autodiff'd

Edit: Working Solution at bottom

LLVM info: 11.0.1 (43ff75f2c3feef64f9d73328230d34dac8832a91), built from source with:

cmake ../llvm -DLLVM_TARGETS_TO_BUILD="host;NVPTX" -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_PLUGINS=ON -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DCMAKE_INSTALL_PREFIX=/home/yutong/local -DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_INSTALL_UTILS=ON

Enzyme info: fb2af2a

I'm trying to use enzyme to generate autodiff'd CUDA kernels. I'm invoking:

clang -c test3.cu -Xclang -load -Xclang /home/yutong/Code/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-11.so -O2 -fno-vectorize -fno-unroll-loops --cuda-gpu-arch=sm_70 -fPIC

Running nm on the resulting test3.o shows an undefined symbol for the autodiff version:

                 U atexit
0000000000000000 d __cuda_fatbin_wrapper
0000000000000010 b __cuda_gpubin_handle
                 U cudaLaunchKernel
                 U cudaMalloc
                 U cudaMemcpy
0000000000000240 t __cuda_module_ctor
00000000000002b0 t __cuda_module_dtor
                 U __cudaPopCallConfiguration
                 U __cudaPushCallConfiguration
                 U __cudaRegisterFatBinary
                 U __cudaRegisterFatBinaryEnd
                 U __cudaRegisterFunction
                 U __cudaUnregisterFatBinary
0000000000000008 B enzyme_const
0000000000000000 B enzyme_dup
0000000000000004 B enzyme_out
0000000000000000 r .L.str
0000000000000007 r .L__unnamed_1
0000000000000070 T main
                 U printf
0000000000000000 T _Z18__device_stub__fooPdS_
                 U _Z32__device_stub____enzyme_autodiffPFvPdS_EiS_S_iS_S_

// test3.cu
#include <stdio.h>

// Kernel to be differentiated: writes the square of x_in[0] to x_out[0].
// It does not use any thread or block index.
void __global__ foo(double* x_in, double *x_out) {
    x_out[0] = x_in[0] * x_in[0];
}

int enzyme_dup;
int enzyme_out;
int enzyme_const;

typedef void (*f_ptr)(double*, double*);

extern void __global__ __enzyme_autodiff(f_ptr,
    int, double*, double*,
    int, double*, double*);

// Driver for the testcase: allocates four single-double device buffers, seeds them from
// host values, launches __enzyme_autodiff on foo with a <<<1,1>>> configuration, copies
// everything back and prints the values.
int main() {

    double *x, *d_x, *y, *d_y; // all on the device

    // One double per buffer; sizeof(*x) is evaluated on the type only.
    cudaMalloc(&x, sizeof(*x));
    cudaMalloc(&d_x, sizeof(*d_x));
    cudaMalloc(&y, sizeof(*y));
    cudaMalloc(&d_y, sizeof(*d_y));

    double host_x = 1.4;
    double host_d_x = 0.0;
    double host_y; // NOTE: never initialized on the host before being copied to the device below
    double host_d_y = 1.0;

    cudaMemcpy(x, &host_x, sizeof(*x), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, &host_d_x, sizeof(*d_x), cudaMemcpyHostToDevice);
    cudaMemcpy(y, &host_y, sizeof(*y), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, &host_d_y, sizeof(*d_y), cudaMemcpyHostToDevice);

    // __enzyme_autodiff is itself declared __global__ above and launched as a kernel;
    // both (x, d_x) and (y, d_y) are passed as enzyme_dup pairs.
    __enzyme_autodiff<<<1,1>>>(foo,
        enzyme_dup, x, d_x,
        enzyme_dup, y, d_y);

    // Copy all four buffers back to the host for printing.
    cudaMemcpy(&host_x, x, sizeof(*x), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_d_x, d_x, sizeof(*d_x), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_y, y, sizeof(*y), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_d_y, d_y, sizeof(*d_y), cudaMemcpyDeviceToHost);

    printf("%f %f\n", host_x, host_y);
    printf("%f %f\n", host_d_x, host_d_y);

}

Incorrect derivative on -O0

; ModuleID = 'silent_failure.c'
source_filename = "silent_failure.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

$_ZSt4fabsf = comdat any

@.str = private unnamed_addr constant [36 x i8] c"hello! %f, res2 %f, da: %f, db: %f\0A\00", align 1

; Function Attrs: noinline optnone uwtable
; Stores fabs(*%a) + fabs(*%b) into *%ret; each absolute value goes through @_ZSt4fabsf.
define dso_local void @compute_sumabs(float* %a, float* %b, float* %ret) #0 {
entry:
  ; The three pointer arguments are spilled to stack slots and reloaded before each use.
  %a.addr = alloca float*, align 8
  %b.addr = alloca float*, align 8
  %ret.addr = alloca float*, align 8
  store float* %a, float** %a.addr, align 8
  store float* %b, float** %b.addr, align 8
  store float* %ret, float** %ret.addr, align 8
  ; %call = fabs(*a)
  %0 = load float*, float** %a.addr, align 8
  %1 = load float, float* %0, align 4
  %call = call float @_ZSt4fabsf(float %1)
  ; %call1 = fabs(*b)
  %2 = load float*, float** %b.addr, align 8
  %3 = load float, float* %2, align 4
  %call1 = call float @_ZSt4fabsf(float %3)
  ; *ret = %call + %call1
  %add = fadd float %call, %call1
  %4 = load float*, float** %ret.addr, align 8
  store float %add, float* %4, align 4
  ret void
}

; Function Attrs: noinline nounwind optnone uwtable
; Thin wrapper: spills %__x to the stack, reloads it and returns @llvm.fabs.f32 of it.
define linkonce_odr dso_local float @_ZSt4fabsf(float %__x) #1 comdat {
entry:
  %__x.addr = alloca float, align 4
  store float %__x, float* %__x.addr, align 4
  %0 = load float, float* %__x.addr, align 4
  %1 = call float @llvm.fabs.f32(float %0)
  ret float %1
}

; Function Attrs: noinline norecurse optnone uwtable
; Test driver: sets a=2, b=3, da=db=0, ret=0, dret=1, calls @compute_sumabs once directly,
; then calls @__enzyme_autodiff.f64 on it with the pairs (a, da), (b, db), (ret, dret),
; and finally prints ret (twice), da and db.
define dso_local i32 @main(i32 %argc, i8** %argv) #2 {
entry:
  %retval = alloca i32, align 4
  %argc.addr = alloca i32, align 4
  %argv.addr = alloca i8**, align 8
  %a = alloca float, align 4
  %b = alloca float, align 4
  %da = alloca float, align 4
  %db = alloca float, align 4
  %ret = alloca float, align 4
  %dret = alloca float, align 4
  store i32 0, i32* %retval, align 4
  store i32 %argc, i32* %argc.addr, align 4
  store i8** %argv, i8*** %argv.addr, align 8
  ; Input values and initial contents of the companion (d*) slots.
  store float 2.000000e+00, float* %a, align 4
  store float 3.000000e+00, float* %b, align 4
  store float 0.000000e+00, float* %da, align 4
  store float 0.000000e+00, float* %db, align 4
  store float 0.000000e+00, float* %ret, align 4
  store float 1.000000e+00, float* %dret, align 4
  call void @compute_sumabs(float* %a, float* %b, float* %ret)
  ; The double result of the autodiff call (%0) is not used afterwards.
  %0 = call double (...) @__enzyme_autodiff.f64(void (float*, float*, float*)* @compute_sumabs, float* %a, float* %da, float* %b, float* %db, float* %ret, float* %dret)
  ; Both of the first two printf arguments are loaded from %ret.
  %1 = load float, float* %ret, align 4
  %conv = fpext float %1 to double
  %2 = load float, float* %ret, align 4
  %conv1 = fpext float %2 to double
  %3 = load float, float* %da, align 4
  %conv2 = fpext float %3 to double
  %4 = load float, float* %db, align 4
  %conv3 = fpext float %4 to double
  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str, i32 0, i32 0), double %conv, double %conv1, double %conv2, double %conv3)
  ret i32 0
}

declare double @__enzyme_autodiff.f64(...)

declare dso_local i32 @printf(i8*, ...) #3

; Function Attrs: nounwind readnone speculatable
declare float @llvm.fabs.f32(float) #4

attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { noinline norecurse optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind readnone speculatable }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 7.1.0 "}

Global variables (no testcase yet)

We need a plan for automatically handling global variables without explicitly giving annotations. Can we summarize the challenges involved in handling globals here?

Here are a few preliminary thoughts that come to mind:

There are a few categories of global use (unsure if list is complete):
(a) globals that are constant for entire duration of program.
(b) globals that are precomputed during the program and constant for subsequent uses.
(c) globals acting as a scratch-space --- e.g. a static array of floats.
(d) globals for caching: e.g. a static lookup table.

(a),(b) can probably be identified via analysis; (c) can either be disallowed or handled by duplicating static storage; (d) is tricky because the data structure may be manipulated/initialized prior to the enzyme_autodiff call.

Minimal working example for differentiating BLAS

I am an incoming student at the Julia Lab and I am trying to do something with Enzyme. I learned from the Enzyme paper that one can define custom gradients for precompiled libraries, but I am not sure how to write the "augmented" and "gradient" functions for a custom gradient. For example, I can compile the following program

#include <iostream>
#include "cblas.h"
using namespace std;

// Computes the dot product of two 3-element vectors with cblas_ddot and prints it.
int main() {
    int N = 3, incX = 1, incY = 1;
    // _X and _Y are zero-initialized and not used below — presumably placeholders for the
    // gradients w.r.t. X and Y that the question asks about (TODO confirm).
    double X[3] = {1.0, 2.0, 3.0}, Y[3] = {4.0, 5.0, 6.0}, _X[3] = {0.0}, _Y[3] = {0.0};
    double s = cblas_ddot(N, X, incX, Y, incY);
    cout << s << endl;
    return 0;
}

with clang++ blas.cpp -I/usr/local/opt/openblas/include -L/usr/local/opt/openblas/lib -lopenblas -o blas. What should I do then to get the gradient w.r.t X?

Differential returns that are pointer types

We currently do not seem to handle differential returns that are pointer types, e.g. a call %x = float* foo() where %x is not a constant value. A test case for this is attached --- if you look at the test case, please spend a moment to verify that I set it up correctly, as it was extracted from a larger file generated from Eigen code. If the general fix isn't easy, we should discuss exactly what we want to do here (e.g. create a wrapper function for such calls that puts all of our differentiable calls into a common form where certain return values are additional arguments).

eigen-small-test.ll.txt

PHINode (and generally return value), doesn't presently assert failure if no arguments are active if phi is active

This should be asserting (and presently is a warning). Resolving this involves better interprocedural Activity Analysis, which will come in with the move to the Attributor.

Add Spack recipe

max function derivative

I've been playing around with Enzyme. Enzyme seems to be able to find the derivative of the following function just fine

double fun(double x) {
    // Inputs that are not >= 0 (negative values and NaN) take the `x * 0` branch;
    // every other input is returned unchanged.
    if (!(x >= 0)) {
        return x * 0;
    }
    return x;
}

However, when I write "return 0" instead of "return x * 0", the computed derivative is always zero. I assumed that the constant somehow confuses the library, but for the following function it finds the correct derivative:

double fun(double x) {
    // Passes x through while x <= 1; anything else (including NaN) yields the constant 1.
    return (x <= 1) ? x : 1;
}

so, I guess it might be something to do with zero? I am not sure.

Activity Analysis handling of globals needs improvement

Quadratic memory usage

Hello, I'm still learning Enzyme.
I was expecting the following simple code to not require quadratic memory for the backward pass with enzyme

test2.cpp

#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <random>

void foo(double* parts, double *x_out,int n) {
    // Resets *x_out to zero and then adds, directly into *x_out, the squared
    // component-wise difference between every ordered pair (i, j) of the n
    // 3-component entries stored contiguously in `parts`.
    const int dim = 3;
    *x_out = 0.0;

    int i = 0;
    while (i < n) {
        int j = 0;
        while (j < n) {
            for (int c = 0; c < dim; ++c) {
                const double diff = parts[dim*i + c] - parts[dim*j + c];
                // Accumulate in place so the running sum always lives in *x_out.
                *x_out += diff * diff;
            }
            ++j;
        }
        ++i;
    }
}


int enzyme_dup;
int enzyme_out;
int enzyme_const;

typedef void (*f_ptr)(double*, double*,int);

extern void __enzyme_autodiff(f_ptr,
    int, double*, double*,
    int, double*, double*,
    int, int);


// Driver for the testcase: fills n*d doubles with pseudo-random values drawn from a
// uniform distribution over [0, 10), zeroes the companion buffer d_x, runs
// __enzyme_autodiff on foo and prints y plus the first 100 entries of d_x.
int main() {

    srand(42);

    std::mt19937 e2(42);
    std::uniform_real_distribution<> dist(0, 10);
    int n = 100000;
    int d = 3;
    double* x = new double[n*d];
    double* d_x = new double[n*d];
    for( int i = 0 ; i < n*d ; i++)
    {
        x[i] = dist(e2);
        d_x[i] = 0.0;
    }

    // y is written by foo (which starts with *x_out = 0.0); d_y is set to 1.0 before the call.
    double y;
    double d_y = 1.0;

    printf("before autodiff\n");
    __enzyme_autodiff(foo,
        enzyme_dup, x, d_x,
        enzyme_dup, &y, &d_y,
        enzyme_const, n);


    printf("%f \n", y);
    for( int i = 0 ; i < 100 ; i++)
    {
        printf("dx[%d] = %f\n",i, d_x[i]);
    }

    // Release the heap buffers; the original snippet leaked both of them.
    delete[] x;
    delete[] d_x;

}

I used the one-liner below to compile (I also tried emitting -O2 LLVM IR and running opt on it, with the same result):
clang test2.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-7.so -o test2

It crashes with a segmentation fault, probably because it tries to allocate (or stack-allocate) too much memory:

before autodiff
Segmentation fault (core dumped)

Can you please advise ?
Thanks

Restore LLVM 6 support

FNeg in ForwardMode

Hi,
the implementation of unary minus for ForwardMode seems to be broken.
I'm using the current brew version (LLVM 12) on Linux and opt produces a segfault.
I traced the error to
https://github.com/wsmoses/Enzyme/blob/9be50337dbf98a6a0489e94660d15586466a4b34/enzyme/Enzyme/AdjointGenerator.h#L252
where it tries to get a ReverseBuilder which I suppose doesn't exist in ForwardMode.
But I'm not sure how to proceed from there.

Testcase to reproduce:

// clang++ fneg.cpp -S -emit-llvm -o input.ll -O2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops
// opt input.ll -load=/home/linuxbrew/.linuxbrew/lib/LLVMEnzyme-12.so -enzyme -o output.ll -S
// clang++ -c output.ll

extern double __enzyme_fwddiff(void*, double, double);

// Returns the negation of x; this unary minus is the operation the testcase exercises.
double foo(double x){
    return -x;
}

// Calls __enzyme_fwddiff on foo (cast to void*) with x and 1.0 as the remaining arguments
// and returns its result. Presumably 1.0 is the tangent seed for x — TODO confirm.
double dfoo(double x){
    return __enzyme_fwddiff((void*)foo, x, 1.0);
}

Best
Markus

Undefined symbols for __enzyme_autodiff(...)

Hi,

I've been trying to assess how to use Enzyme together with more complex codes (that use Eigen) and created a pretty simple C++ test program:

#include <iostream>

#include <Eigen/Dense>
using Eigen::VectorXd;

void __enzyme_autodiff(...);


// Returns the sum of v(i) * v(i) over all rows of v.
// Note that v is taken by value, so the function works on its own copy of the vector.
double foo( VectorXd v )
{
    double out = 0.0;

    for( int i = 0 ; i < v.rows() ; i++ )
        out += v(i) * v(i);

    return out;
}



// Driver for the testcase: builds v = (0, 1, 2) and a zeroed dv, calls __enzyme_autodiff
// on foo when compiled with clang, then prints foo(v) and dv.
int main()
{
    size_t n = 3;

    VectorXd v(n);
    VectorXd dv(n);

    for( int i = 0; i < n ; i++)
    {
        v(i) = i;
        dv(i) = 0.0;
    }

    // NOTE(review): no declaration of enzyme_dup is visible in this snippet, and foo takes its
    // VectorXd by value while &v / &dv are passed here — confirm both against the full file.
#ifdef __clang__
    __enzyme_autodiff(foo, enzyme_dup, &v, &dv);
#endif

    std::cout << foo(v) << std::endl << std::endl;

    std::cout << dv << std::endl;


    return 0;
}

I've been compiling it with the following:

clang++ test.cpp -I/path/to/eigen -S -emit-llvm -o input.ll -O3 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops
opt input.ll -load=ClangEnzyme-11.dylib -enzyme -o output.ll -S
clang++ output.ll -O3 -o matrix

I've tried to base all of the above on other examples (some posted as issues here). Several of the other examples compile and link properly. Unfortunately, it seems that the autodiffed function for my example never makes it into the executable, because I get a missing symbol error on linking:

Undefined symbols for architecture x86_64:
  "__enzyme_autodiff(...)", referenced from:
      _main in output-c75457.o
ld: symbol(s) not found for architecture x86_64

That's kind of perplexing to me, because other examples I build don't seem to have this issue. I've also tried more explicit declarations of the autodiff functions, but no luck. Am I missing something fundamental here, or are there any pointers you could give me to resolve this? Thanks!

Quadratic Memory Usage (mk2)

The following code runs fine for small n but crashes with a segfault caused by extra memory allocations.

The code works if the integer-typed cellId is stackAllocated.
It also works if I pass an allocated cell_id as an enzyme_const parameter

test2.cpp

#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <random>
#include <math.h>

using namespace std;

double foo( double* __restrict__ parts,int n)
{
    // Assigns every 3-component point in `parts` an integer cell id and returns the sum of
    // squared distances over all ordered pairs (i, j) whose cell ids are equal.
    const int dim = 3;
    const double lo = 0.0;
    const double hi = 10.0;
    const double cellWidth = 3.1;

    // Cells per axis, and the stride of each axis in the flattened cell index.
    const int cellsPerAxis = ceil((hi - lo) / cellWidth);
    int stride[dim];
    stride[0] = 1;
    for (int axis = 1; axis < dim; ++axis)
        stride[axis] = stride[axis - 1] * cellsPerAxis;

    // Heap-allocated per-point cell ids; released before returning.
    int* cellId = new int[n];
    for (int p = 0; p < n; ++p)
    {
        int id = 0;
        for (int axis = 0; axis < dim; ++axis)
            id += (int) ( (parts[p*dim + axis] - lo) / cellWidth * stride[axis] );
        cellId[p] = id;
    }

    double out = 0.0;
    for (int i = 0; i < n; ++i)
    {
        for (int j = 0; j < n; ++j)
        {
            // The squared distance is computed for every pair, even when it is not used.
            double dij = 0.0;
            for (int k = 0; k < dim; ++k)
            {
                const double diff = parts[dim*i + k] - parts[dim*j + k];
                dij += diff * diff;
            }
            if (cellId[i] != cellId[j])
                continue;
            out += dij;
        }
    }

    delete[] cellId;
    return out;
}


int enzyme_dup;
int enzyme_out;
int enzyme_const;

typedef double (*f_ptr)(double *,int);

extern double __enzyme_autodiff(f_ptr,
    int, double *, double *,
    int, int);


// Driver for the testcase: fills n*d doubles with pseudo-random values drawn from a
// uniform distribution over [0, 10), zeroes the companion buffer d_x, runs
// __enzyme_autodiff on foo and prints the first 100 entries of d_x.
int main() {

    srand(42);

    std::mt19937 e2(42);
    std::uniform_real_distribution<> dist(0, 10);
    int n = 100000;
    int d = 3;
    double* x = new double[n*d];
    double* d_x = new double[n*d];
    for( int i = 0 ; i < n*d ; i++)
    {
        x[i] = dist(e2);
        d_x[i] = 0.0;
    }

    // Allocated here but never passed to foo in this version of the testcase.
    int * cellid = new int[n];

    printf("before autodiff\n");
    __enzyme_autodiff(foo,
        enzyme_dup, x, d_x,
        enzyme_const, n);


    //printf("%f \n", y);
    for( int i = 0 ; i < 100 ; i++)
    {
        printf("dx[%d] = %f\n",i, d_x[i]);
    }

    // Release the heap buffers; the original snippet leaked all three of them.
    delete[] cellid;
    delete[] x;
    delete[] d_x;

}

Compiled with :
clang test2.cpp -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-7.so -O2 -o test2

Add compile time regression tests

Let's add compile time regression tests.
I've collected some useful resources:

https://www.npopov.com/2020/05/10/Make-LLVM-fast-again.html
http://llvm-compile-time-tracker.com
https://green.lab.llvm.org/green/view/Compile%20Time/

https://lnt.readthedocs.io/_/downloads/en/latest/pdf/
https://github.com/llvm/llvm-lnt
https://github.com/llvm/llvm-test-suite/tree/main/CTMark

Handle Parsing of Type Tree's passed as metadata

Properly handle masked load and store.

Improve Enzyme's performance for benchmarks (CFD application)

Hello everyone,

I just wanted to begin by thanking the developer community for your incredible work on Enzyme; it really does seem like a dream come true regarding ease of use and its integration directly with the LLVM build tools.

I would like to know if anyone has been using Enzyme for CFD applications, where, as in ML, access to gradients and/or directional derivatives is crucial. I was recently testing Tapenade and comparing its performance with Enzyme in reverse mode (on a simple FD scheme and on a more complex flux computation scheme), and I noticed that as long as the number of operations and the data I/O stay small to medium sized, Enzyme fares much better than the reverse-mode code generated by Tapenade (Fortran). However, when the functions become more complex and the number of DOF (degrees of freedom) increases, Enzyme starts lagging behind in performance, and the problem gets worse the more DOF are used.

Here are some code excerpts and results of the benchmarking tests (where I need to run enzyme on reverse mode on a flux computation) :

//original function
  // Fills flux1..flux5 from the rho/velx/vely/velz/temp arrays and the surfx/surfy/surfz
  // vector components. For every index i in [gh, n_cell-gh) the wfl* values are read at
  // i-1 and the wfr* values at i (presumably left/right states — TODO confirm), and the
  // five results are written to flux1[i]..flux5[i]. The input arrays are only read.
  // T is declared outside this excerpt.
  static void f(const T* rho, const T* velx, const T* vely, const T* velz, const T* temp,
  T* flux1, T* flux2, T* flux3, T* flux4, T* flux5,
  const int n_cell, const int gh, const T* surfx, const T* surfy, const T* surfz )
  {

    // Compile-time constants used in the pm/pp and hm/hp expressions below.
    constexpr double gam    = 1.4;
    constexpr double gam1   = gam-1.;
    constexpr double gam1_1 = 1./gam1;
    constexpr double rgaz   = 237.;

    for(int i=gh; i<n_cell-gh ; ++i)
    {
      // sn is the Euclidean norm of the vector (sc1, sc2, sc3).
      const double sc1 = surfx[i]; const double sc2 = surfy[i]; const double sc3 = surfz[i];
      const double sn  = std::sqrt(sc1*sc1 + sc2*sc2 + sc3*sc3);

      // The norm is clamped from below at 1e-32 so the division cannot be by zero;
      // (nx, ny, nz) is the vector scaled by that inverse norm.
      const double invsn = 1./std::max(sn,1.e-32);
      const double nx    = sc1*invsn; const double ny = sc2*invsn; const double nz    = sc3*invsn;

      // Values at index i-1 (wfl*) and at index i (wfr*) for each of the five input arrays.
      const  auto wfl1 = rho[i-1];  const auto wfr1 = rho[i  ];

      const  auto wfl2 = velx[i-1]; const auto wfr2 = velx[i  ];
      const  auto wfl3 = vely[i-1]; const auto wfr3 = vely[i  ];
      const  auto wfl4 = velz[i-1]; const auto wfr4 = velz[i  ];
      const  auto wfl5 = temp[i-1]; const auto wfr5 = temp[i  ];

      // rho * temp * rgaz on each side.
      const auto pm = wfl1*wfl5*rgaz; auto pp = wfr1*wfr5*rgaz;

      // gam/(gam-1) * temp * rgaz plus half the squared velocity magnitude, on each side.
      const auto hm  = gam*gam1_1*wfl5*rgaz + 0.5*(wfl2*wfl2 + wfl3*wfl3 + wfl4*wfl4);
      const auto hp  = gam*gam1_1*wfr5*rgaz + 0.5*(wfr2*wfr2 + wfr3*wfr3 + wfr4*wfr4);

      // Each fcd{x,y,z}N is the sum of the i and i-1 contributions for result N.
      const auto fcdx1 = wfr1*wfr2 + wfl1*wfl2;
      const auto fcdy1 = wfr1*wfr3 + wfl1*wfl3;
      const auto fcdz1 = wfr1*wfr4 + wfl1*wfl4;

      const auto fcdx2 = wfr1*wfr2*wfr2 + pp + wfl1*wfl2*wfl2 + pm;
      const auto fcdy2 = wfr1*wfr2*wfr3      + wfl1*wfl2*wfl3;
      const auto fcdz2 = wfr1*wfr2*wfr4      + wfl1*wfl2*wfl4;

      // Cross terms are reused rather than recomputed (fcdx3 == fcdy2, etc.).
      const auto fcdx3 = fcdy2;
      const auto fcdy3 = wfr1*wfr3*wfr3 + pp + wfl1*wfl3*wfl3 + pm;
      const auto fcdz3 = wfr1*wfr3*wfr4      + wfl1*wfl3*wfl4;

      const auto fcdx4 = fcdz2;
      const auto fcdy4 = fcdz3;
      const auto fcdz4 = wfr1*wfr4*wfr4 + pp + wfl1*wfl4*wfl4 + pm;

      const auto fcdx5 = wfr2*wfr1*hp + wfl2*wfl1*hm;
      const auto fcdy5 = wfr3*wfr1*hp + wfl3*wfl1*hm;
      const auto fcdz5 = wfr4*wfr1*hp + wfl4*wfl1*hm;

      // Each output is 0.5*sn times the dot product of (fcdxN, fcdyN, fcdzN) with (nx, ny, nz).
      flux1[i] = 0.5*sn*(fcdx1*nx + fcdy1*ny + fcdz1*nz);
      flux2[i] = 0.5*sn*(fcdx2*nx + fcdy2*ny + fcdz2*nz);
      flux3[i] = 0.5*sn*(fcdx3*nx + fcdy3*ny + fcdz3*nz);
      flux4[i] = 0.5*sn*(fcdx4*nx + fcdy4*ny + fcdz4*nz);
      flux5[i] = 0.5*sn*(fcdx5*nx + fcdy5*ny + fcdz5*nz);
    }

  }

  // differentiated function
  // Wraps a single __enzyme_autodiff call on f.
  // rho, velx, vely, velz and temp are each passed as an enzyme_dup pair with their *_b array;
  // flux1..flux5 are each passed as an enzyme_dupnoneed pair with their *_b array;
  // n_cell, gh and surfx/surfy/surfz are passed as enzyme_const.
  static void df(const T *rho, T *rho_b, const T* velx, T* velx_b, const T* vely, T* vely_b,
    const T* velz, T* velz_b, const T* temp, T* temp_b,
    const T* flux1, T* flux1_b, const T* flux2, T* flux2_b, const T* flux3, T* flux3_b,
    const T* flux4, T* flux4_b, const T* flux5, T* flux5_b, const int n_cell, const int gh,
    const T* surfx, const T* surfy, const T* surfz )
  {

    // The argument order follows f's parameter order, with each *_b array placed
    // directly after the array it accompanies.
    __enzyme_autodiff(f,
       enzyme_dup, rho,
       rho_b,
       enzyme_dup, velx,
       velx_b,
       enzyme_dup, vely,
       vely_b,
       enzyme_dup, velz,
       velz_b,
       enzyme_dup, temp,
       temp_b,
       enzyme_dupnoneed, flux1,
       flux1_b,
       enzyme_dupnoneed, flux2,
       flux2_b,
       enzyme_dupnoneed, flux3,
       flux3_b,
       enzyme_dupnoneed, flux4,
       flux4_b,
       enzyme_dupnoneed, flux5,
       flux5_b,
       enzyme_const, n_cell, enzyme_const, gh,
       enzyme_const, surfx, enzyme_const, surfy, enzyme_const, surfz);
  }

where the results are stored in the shadow arrays for the primitive variables (rho_b, velx_b, ..., temp_b). The results are the same as for Tapenade, but performance falls behind as the number of DOF grows:

Running ./out
Run on (48 X 2900 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x24)
  L1 Instruction 32 KiB (x24)
  L2 Unified 256 KiB (x24)
  L3 Unified 30720 KiB (x2)
Load Average: 2.13, 7.63, 11.21
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
------------------------------------------------------------------------------------
Benchmark                          Time             CPU   Iterations UserCounters...
------------------------------------------------------------------------------------
tapenade_roe_flux/1024           144 us          143 us         4194 rho_b[n_cell/2]=330.698T
enzyme_roe_flux/1024             113 us          113 us         6064 rho_b[n_cell/2]=330.698T
tapenade_roe_flux/2048           258 us          257 us         2737 rho_b[n_cell/2]=7.82098P
enzyme_roe_flux/2048             279 us          279 us         2637 rho_b[n_cell/2]=7.82098P
tapenade_roe_flux/4096           522 us          522 us         1000 rho_b[n_cell/2]=206.173P
enzyme_roe_flux/4096             612 us          612 us         1176 rho_b[n_cell/2]=206.173P
tapenade_roe_flux/8192          1103 us         1103 us          623 rho_b[n_cell/2]=5.89262E
enzyme_roe_flux/8192            1253 us         1253 us          597 rho_b[n_cell/2]=5.89262E
tapenade_roe_flux/16384         2304 us         2304 us          324 rho_b[n_cell/2]=177.29E
enzyme_roe_flux/16384           2890 us         2890 us          245 rho_b[n_cell/2]=177.29E
tapenade_roe_flux/32768         3984 us         3983 us          159 rho_b[n_cell/2]=5.49296Z
enzyme_roe_flux/32768           5185 us         5185 us          136 rho_b[n_cell/2]=5.49296Z
tapenade_roe_flux/65536         8126 us         8125 us           88 rho_b[n_cell/2]=172.89Z
enzyme_roe_flux/65536          12211 us        12205 us           57 rho_b[n_cell/2]=172.89Z
tapenade_roe_flux/131072       15859 us        15859 us           45 rho_b[n_cell/2]=5.48632Y
enzyme_roe_flux/131072         35868 us        35866 us           20 rho_b[n_cell/2]=5.48632Y
tapenade_roe_flux/262144       33950 us        33947 us           21 rho_b[n_cell/2]=174.824Y
enzyme_roe_flux/262144         76974 us        76970 us            9 rho_b[n_cell/2]=174.824Y
tapenade_roe_flux/524288       65500 us        65496 us            9 rho_b[n_cell/2]=5.58255
enzyme_roe_flux/524288        159281 us       159270 us            5 rho_b[n_cell/2]=5.58255
tapenade_roe_flux/1048576     151165 us       151153 us            4 rho_b[n_cell/2]=178.452
enzyme_roe_flux/1048576       287164 us       287144 us            3 rho_b[n_cell/2]=178.452

Regarding the compilation options/flags, I have been using what is suggested on the website:

export F90LAGS="-O2 -DNDEBUG" # For Tapenade code
export CCFLAGS2="-O2 -DNDEBUG"

export ENZYMEPM_AND_OPTIONS="-enzyme -enzyme-inline=1 -enzyme-smallbool=1 -enzyme-cache-never=1"

As you can see, I'm quite new at using enzyme and I surely do not know all the options and best coding practices that might help increase performance, so I'm really interested in any suggestions that you in the community might have!

Thanks in advance for any information.

Idea: Use enzyme-ci-bot to use Yggdrasil to do CD and cross-compilation check

Maybe bad idea?

check-enzyme requires that LLVM_EXTERNAL_LIT is explicitly set

Hi,

I have been following the "Getting started" instructions and noticed that one needs to explicitly specify the path to LLVM_EXTERNAL_LIT when calling cmake.

I think this line in the instructions:
cmake -G Ninja .. -DLLVM_DIR=/path/to/llvm/lib/cmake/llvm

should probably be:
cmake -G Ninja .. -DLLVM_DIR=/path/to/llvm/lib/cmake/llvm -DLLVM_EXTERNAL_LIT=/path/to/llvm/lib/cmake/llvm/bin/llvm-lit

Otherwise the test check-enzyme later will fail with an error:

[0/1] Running enzyme regression tests
/bin/sh: line 1: <MY_BUILD_DIR>: Is a directory

Best,
Misha

Prepare for upstreaming

Handle use of a better calling convention for combined forward/reverse

There are instances in which we need a shadow pointer from a function for which we can create a combined forward/reverse pass (see #27). Right now, in that case, we are conservative and fall back to individual forward and reverse passes. We should make sure that uses of the inverted pointer are moved to the right location when using the combined variant (and re-enable it in these cases).

How to run tests

Here is how I build Enzyme. How do I execute tests?

$ cd enzyme
$ mkdir build
$ cd build
$ cmake .. 
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
LLVM_SHLIBEXT=.so
found llvm dir /home/ondrej/repos/Enzyme/enzyme/build
found llvm lit /home/ondrej/repos/Enzyme/enzyme/build
CMAKE_PREFIX_PATH /home/ondrej/repos/Enzyme/enzyme/build
-- Linker detection: GNU ld
found llvm include directory here: /usr/lib/llvm-6.0/include
found llvm definitions -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS
found llvm version 6
first llvm include directory/usr/lib/llvm-6.0/include
found enzyme sources ActiveVariable.cppEnzyme.cppEnzymeLogic.cppFunctionUtils.cppGradientUtils.cppTypeAnalysis.cppUtils.cppSCEV/ScalarEvolutionExpander.cpp
-- Configuring done
-- Generating done
-- Build files have been written to: /home/ondrej/repos/Enzyme/enzyme/build
$ make -j4
Scanning dependencies of target intrinsics_gen
[  0%] Built target intrinsics_gen
Scanning dependencies of target LLVMEnzyme-6
[ 11%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/ActiveVariable.cpp.o
[ 22%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/Enzyme.cpp.o
[ 33%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/EnzymeLogic.cpp.o
[ 44%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/FunctionUtils.cpp.o
/home/ondrej/repos/Enzyme/enzyme/Enzyme/ActiveVariable.cpp: In function ‘void addCallRemovingCycle(std::vector<llvm::CallInst*>&, llvm::CallInst*)’:
/home/ondrej/repos/Enzyme/enzyme/Enzyme/ActiveVariable.cpp:186:41: warning: comparison between signed and unsigned integer expressions [-Wsign-compare]
                 if (newtrace.size()-1-j == i) break;
                     ~~~~~~~~~~~~~~~~~~~~^~~~
[ 55%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/GradientUtils.cpp.o
[ 66%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/TypeAnalysis.cpp.o
/home/ondrej/repos/Enzyme/enzyme/Enzyme/TypeAnalysis.cpp: In member function ‘ValueData ValueData::KeepForCast(const llvm::DataLayout&, llvm::Type*, llvm::Type*) const’:
/home/ondrej/repos/Enzyme/enzyme/Enzyme/TypeAnalysis.cpp:75:50: warning: comparison between signed and unsigned integer expressions [-Wsign-compare]
         if (pair.first[0] != -1 && pair.first[0] < tosize) {
[ 77%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/Utils.cpp.o
[ 88%] Building CXX object Enzyme/CMakeFiles/LLVMEnzyme-6.dir/SCEV/ScalarEvolutionExpander.cpp.o
[100%] Linking CXX shared module LLVMEnzyme-6.so
[100%] Built target LLVMEnzyme-6
$ ctest 
*********************************
No test configuration file found!
*********************************
Usage

  ctest [options]

$ make check-enzyme-integration
[  0%] Built target intrinsics_gen
[ 90%] Built target LLVMEnzyme-6
Scanning dependencies of target check-enzyme-integration
[100%] Running enzyme integration tests
/bin/sh: 1: ../../: Permission denied
test/Integration/CMakeFiles/check-enzyme-integration.dir/build.make:57: recipe for target 'test/Integration/CMakeFiles/check-enzyme-integration' failed
make[3]: *** [test/Integration/CMakeFiles/check-enzyme-integration] Error 126
CMakeFiles/Makefile2:296: recipe for target 'test/Integration/CMakeFiles/check-enzyme-integration.dir/all' failed
make[2]: *** [test/Integration/CMakeFiles/check-enzyme-integration.dir/all] Error 2
CMakeFiles/Makefile2:303: recipe for target 'test/Integration/CMakeFiles/check-enzyme-integration.dir/rule' failed
make[1]: *** [test/Integration/CMakeFiles/check-enzyme-integration.dir/rule] Error 2
Makefile:203: recipe for target 'check-enzyme-integration' failed
make: *** [check-enzyme-integration] Error 2

Active/Inactive determination (Tim, with input from Billy)

I am going to start work on the refined logic for Active/Inactive detection, as we discussed previously. As issues/questions come up (while we're working asynchronously), I'll write them here.

SegFault upon differentiating when using function pointers

Hello,

I'm trying to differentiate code that uses function pointers. I've used function pointers with Enzyme before and it worked, but this example segfaults (maybe because of caching the instruction fp[i](euler[i],R)).

When I manually unroll it works fine.

(Additionally I have extracted this code from a bigger routine where the code is differentiated twice and there the compiler fails to compile because it complains about a bad number of parameters passed to enzyme, that I will create a separate issue for if solving this issue doesn't fix it).

bugFunctionPointer.cpp

#include <iostream>
#include <math.h>
using namespace std;

extern int enzyme_dup;
extern int enzyme_dupnoneed;
extern int enzyme_out;
extern int enzyme_const;

void __enzyme_autodiff(...);

inline void assign( double*__restrict__ v, double*__restrict__ out, int n)
{
  // Copies the first n entries of v into out, front to back; does nothing when n <= 0.
  int idx = 0;
  while (idx < n)
  {
    out[idx] = v[idx];
    ++idx;
  }
}

void matvprod( double*__restrict__ A, double * __restrict__ v, double* __restrict__ out, int n, int m )
{
  // Matrix-vector product: A is read as n rows of m entries (A[row*m + col]),
  // v supplies m entries, and out receives n entries.
  for (int row = 0; row < n; ++row)
  {
    // Each output entry is zeroed and then accumulated in place, column by column.
    out[row] = 0.0;
    int col = 0;
    while (col < m)
    {
      out[row] += A[row*m + col] * v[col];
      ++col;
    }
  }
}

void Rx( double ang, double * __restrict__  out)
{
  // Fills out[0..8], row by row, with
  //   [ 1    0         0       ]
  //   [ 0    cos(ang)  sin(ang)]
  //   [ 0   -sin(ang)  cos(ang)]
  const double c = cos(ang);
  const double s = sin(ang);

  out[0] = 1.0; out[1] = 0;  out[2] = 0;
  out[3] = 0;   out[4] = c;  out[5] = s;
  out[6] = 0;   out[7] = -s; out[8] = c;
}

void Ry( double ang, double * __restrict__  out)
{
  // Fills out[0..8], row by row, with
  //   [ cos(ang)  0  -sin(ang)]
  //   [ 0         1   0       ]
  //   [ sin(ang)  0   cos(ang)]
  const double c = cos(ang);
  const double s = sin(ang);

  out[0] = c; out[1] = 0;   out[2] = -s;
  out[3] = 0; out[4] = 1.0; out[5] = 0.0;
  out[6] = s; out[7] = 0;   out[8] = c;
}

void Rz( double ang, double * __restrict__  out) // Writes into out[0..8] the 3x3 matrix (three rows of three) for a rotation by ang about the z axis.
{
  double mat[9] = {cos(ang), sin(ang),0 ,
              -sin(ang), cos(ang),0.0,
              0,0,1};
  for( int i = 0 ; i < 9 ; i++) // copy the local matrix into the caller's buffer
    out[i] = mat[i];
}

typedef void (*rotf)(double,double* __restrict__ ); // common signature of Rx, Ry and Rz
void eulerRotate( double* __restrict__ euler, double* __restrict__  v, double* __restrict__  out) // out = Rx(euler[2]) * Ry(euler[1]) * Rz(euler[0]) * v
{
    const int dim = 3;
    rotf fp[] = {Rz,Ry,Rx}; // step i applies fp[i] with angle euler[i]
    assign(v,out,dim); // start from out = v
    double R[dim*dim]={0.0}; // scratch rotation matrix, overwritten at every step
    double tempv[dim]= {0.0}; // scratch result of R * out

    // The manually unrolled version below works:
    /*
    fp[0](euler[0],R);
    matvprod(R,out,tempv,dim,dim );
    assign(tempv,out,dim);

    fp[1](euler[1],R);
    matvprod(R,out,tempv,dim,dim );
    assign(tempv,out,dim);

    fp[2](euler[2],R);
    matvprod(R,out,tempv,dim,dim );
    assign(tempv,out,dim);
    */

    for( int i = 0 ; i < dim ; i++)
    {
      // Works if fp[i] is replaced by a direct call to Rx, Ry or Rz
      printf("i = %d before fp \n ", i);
      fp[i](euler[i],R); // indirect call through the function-pointer table
      printf("i = %d before matvprod \n", i);
      matvprod(R,out,tempv,dim,dim ); // tempv = R * out
      printf("i = %d before assign \n", i);
      assign(tempv,out,dim); // out = tempv
    }
}

// Runs eulerRotate once directly, then once through __enzyme_autodiff, and
// prints the rotated vector each time plus the shadow of the Euler angles.
void testEulerRotate( )
{
    // Each primal array is paired with a shadow array of the same size.
    double angles[3]  = {1.0, 0.0, 0.0};
    double dAngles[3] = {0.0, 0.0, 0.0};

    double vec[3]  = {1.0, 1.0, 1.0};
    double dVec[3] = {0.0, 0.0, 0.0};

    double result[3]  = {0.0, 0.0, 0.0};
    // Shadow of the output: only the first component is set to one.
    double dResult[3] = {1.0, 0.0, 0.0};

    // Prints a label line followed by three space-separated values.
    const auto printTriple = [](const char* label, const double* values)
    {
        cout << label << endl;
        cout << values[0] << " " << values[1] << " " << values[2] << endl;
    };

    eulerRotate(angles, vec, result);
    cout << "forward pass work without enzyme" << endl;
    printTriple("out ", result);

    cout << "with enzyme : " << endl;
    // Every enzyme_dup marker is followed by a primal pointer and its shadow.
    __enzyme_autodiff(eulerRotate,
                      enzyme_dup, angles, dAngles,
                      enzyme_dup, vec, dVec,
                      enzyme_dup, result, dResult);
    printTriple("out ", result);
    printTriple("deuler ", dAngles);
}


// Entry point of the reproducer: prints a banner and runs the single test.
// argc and argv are not used.
int main(int argc, char** argv )
{
  cout << "testEulerRotate" << endl;
  testEulerRotate(); // segfaults in the Enzyme part of the test (see the output below)
  return 0;
}

Compilation with :
clang bugFunctionPointer.cpp -lstdc++ -lm -fno-exceptions -Rpass=enzyme -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-12.so -O2 -o bugFunctionPointer

Output :

bugFunctionPointer.cpp:24:17: remark: Load may need caching   %4 = load double, double* %arraydecay, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to   call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
      out[i] += A[i*m+j]*v[j];
                ^
bugFunctionPointer.cpp:24:26: remark: Load may need caching   %5 = load double, double* %out, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 due to   store double %add12.i.2, double* %out, align 8, !dbg !45, !alias.scope !47 [-Rpass=enzyme]
      out[i] += A[i*m+j]*v[j];
                         ^
bugFunctionPointer.cpp:24:17: remark: Load may need caching   %6 = load double, double* %arrayidx6.i.1, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to   call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
      out[i] += A[i*m+j]*v[j];
                ^
bugFunctionPointer.cpp:24:26: remark: Load may need caching   %7 = load double, double* %arrayidx8.i.1, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 due to   store double %add12.i.2.1, double* %arrayidx8.i.1, align 8, !dbg !45, !alias.scope !47 [-Rpass=enzyme]
      out[i] += A[i*m+j]*v[j];
                         ^
bugFunctionPointer.cpp:24:17: remark: Load may need caching   %8 = load double, double* %arrayidx6.i.2, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to   call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
      out[i] += A[i*m+j]*v[j];
                ^
bugFunctionPointer.cpp:24:26: remark: Load may need caching   %9 = load double, double* %arrayidx8.i.2, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 due to   store double %add12.i.2.2, double* %arrayidx8.i.2, align 8, !dbg !45, !alias.scope !47 [-Rpass=enzyme]
      out[i] += A[i*m+j]*v[j];
                         ^
bugFunctionPointer.cpp:24:17: remark: Load may need caching   %10 = load double, double* %arrayidx6.i.122, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to   call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
      out[i] += A[i*m+j]*v[j];
                ^
bugFunctionPointer.cpp:24:17: remark: Load may need caching   %11 = load double, double* %arrayidx6.i.1.1, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to   call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load may need caching   %12 = load double, double* %arrayidx6.i.2.1, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to   call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load may need caching   %13 = load double, double* %arrayidx6.i.225, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to   call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load may need caching   %14 = load double, double* %arrayidx6.i.1.2, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to   call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load may need caching   %15 = load double, double* %arrayidx6.i.2.2, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 due to   call void %2(double %3, double* nonnull %arraydecay) #14, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:26: remark: Load must be recomputed   %9 = load double, double* %arrayidx8.i.2, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 in reverse_invertfor.body due to   store double %add12.i.2.2, double* %arrayidx8.i.2, align 8, !dbg !45, !alias.scope !47 [-Rpass=enzyme]
      out[i] += A[i*m+j]*v[j];
                         ^
bugFunctionPointer.cpp:24:26: remark: Caching instruction   %9 = load double, double* %arrayidx8.i.2, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed   %15 = load double, double* %arrayidx6.i.2.2, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to   call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
      out[i] += A[i*m+j]*v[j];
                ^
bugFunctionPointer.cpp:24:17: remark: Caching instruction   %15 = load double, double* %arrayidx6.i.2.2, align 16, !dbg !17, !tbaa !30, !alias.scope !33, !noalias !39 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:26: remark: Load must be recomputed   %7 = load double, double* %arrayidx8.i.1, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 in reverse_invertfor.body due to   store double %add12.i.2.1, double* %arrayidx8.i.1, align 8, !dbg !45, !alias.scope !47 [-Rpass=enzyme]
      out[i] += A[i*m+j]*v[j];
                         ^
bugFunctionPointer.cpp:24:26: remark: Caching instruction   %7 = load double, double* %arrayidx8.i.1, align 8, !dbg !42, !tbaa !31, !alias.scope !38, !noalias !43 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed   %14 = load double, double* %arrayidx6.i.1.2, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to   call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
      out[i] += A[i*m+j]*v[j];
                ^
bugFunctionPointer.cpp:24:17: remark: Caching instruction   %14 = load double, double* %arrayidx6.i.1.2, align 8, !dbg !17, !tbaa !32, !alias.scope !35, !noalias !41 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:26: remark: Load must be recomputed   %5 = load double, double* %out, align 8, !dbg !40, !tbaa !29, !alias.scope !36, !noalias !41 in reverse_invertfor.body due to   store double %add12.i.2, double* %out, align 8, !dbg !45, !alias.scope !47 [-Rpass=enzyme]
      out[i] += A[i*m+j]*v[j];
                         ^
bugFunctionPointer.cpp:24:26: remark: Caching instruction   %5 = load double, double* %out, align 8, !dbg !44, !tbaa !33, !alias.scope !40, !noalias !45 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed   %13 = load double, double* %arrayidx6.i.225, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to   call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
      out[i] += A[i*m+j]*v[j];
                ^
bugFunctionPointer.cpp:24:17: remark: Caching instruction   %13 = load double, double* %arrayidx6.i.225, align 16, !dbg !17, !tbaa !34, !alias.scope !37, !noalias !43 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed   %12 = load double, double* %arrayidx6.i.2.1, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to   call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Caching instruction   %12 = load double, double* %arrayidx6.i.2.1, align 8, !dbg !17, !tbaa !35, !alias.scope !38, !noalias !44 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed   %11 = load double, double* %arrayidx6.i.1.1, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to   call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Caching instruction   %11 = load double, double* %arrayidx6.i.1.1, align 16, !dbg !17, !tbaa !36, !alias.scope !39, !noalias !45 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed   %10 = load double, double* %arrayidx6.i.122, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to   call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Caching instruction   %10 = load double, double* %arrayidx6.i.122, align 8, !dbg !17, !tbaa !37, !alias.scope !40, !noalias !46 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed   %8 = load double, double* %arrayidx6.i.2, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to   call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Caching instruction   %8 = load double, double* %arrayidx6.i.2, align 16, !dbg !17, !tbaa !38, !alias.scope !41, !noalias !47 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed   %6 = load double, double* %arrayidx6.i.1, align 8, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to   call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Caching instruction   %6 = load double, double* %arrayidx6.i.1, align 8, !dbg !17, !tbaa !39, !alias.scope !42, !noalias !48 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Load must be recomputed   %4 = load double, double* %arraydecay, align 16, !dbg !17, !tbaa !29, !alias.scope !32, !noalias !38 in reverse_invertfor.body due to   call void %2(double %3, double* nonnull %arraydecay) #15, !dbg !23 [-Rpass=enzyme]
bugFunctionPointer.cpp:24:17: remark: Caching instruction   %4 = load double, double* %arraydecay, align 16, !dbg !17, !tbaa !40, !alias.scope !43, !noalias !49 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
bugFunctionPointer.cpp:85:7: remark: Caching instruction   %subcache = extractvalue { i8* } %_augmented, 0, !dbg !35 legalRecompute: 1 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
      fp[i](euler[i],R);
      ^
testEulerRotate
i = 0 before fp 
 i = 0 before matvprod 
i = 0 before assign 
i = 1 before fp 
 i = 1 before matvprod 
i = 1 before assign 
i = 2 before fp 
 i = 2 before matvprod 
i = 2 before assign 
forward pass work without enzyme
out 
1.38177 -0.301169 1
with enzyme : 
i = 0 before fp 
Segmentation fault (core dumped)

Chunked forward mode

After we have preliminary support for forward mode, we might need to think about doing chunking.

@ChrisRackauckas had some references around challenges.

Compilation hangs (mk ii) : Matrix edition

Hello,

I tried to use Eigen3 with Enzyme (not sure if it's planned to be supported). Although it works in simple cases, it seems to hang the compilation in some more advanced cases, such as matrix inversion, matrix-vector solve, and matrix exponentiation.

In https://enzyme.mit.edu/getting_started/CallingConvention/ you describe how to add custom gradients, but it doesn't seem straightforward to add them from C++.

For example, for the adjoint of the inverse of a matrix:
d K^-1/dp = - K^-1 * dK/dp * K^-1 (https://math.stackexchange.com/questions/1471825/derivative-of-the-inverse-of-a-matrix)
This formula may be easier and faster to compute, and more numerically stable, than the automatically derived one.

Can you please advise ?
Thanks

Here is my test file, where compilation hangs when some of the __enzyme_autodiff lines are present.

testmatrix.cpp

#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <random>
#include <math.h>
#include <vector>
#include <algorithm>

#include <Eigen/Dense>
#include <unsupported/Eigen/MatrixFunctions>
using Eigen::MatrixXd;
using namespace std;
using namespace Eigen;

int enzyme_dup;
int enzyme_out;
int enzyme_const;

void __enzyme_autodiff(...);

// Returns the sum of the squared entries of the T-by-1 column vector m
// (the squared Euclidean norm; no square root is taken).
template<int T>
double normVector( const Matrix<double,T,1>& m )
{
  double out = 0.0;
  for( int i = 0 ; i < m.rows() ; i++ )
  {
      out += m(i,0)* m(i,0);
  }
  return out;
}

// Returns the sum of the squared entries of the fixed-size T-by-T row-major
// matrix m (the squared Frobenius norm; no square root is taken).
template<int T>
double normMatrix( const Matrix<double,T,T,RowMajor>& m )
{
  double out = 0.0;
  for( int i = 0 ; i < m.rows() ; i++ )
  {
    for( int j = 0 ; j < m.cols(); j++)
    {
      out += m(i,j)* m(i,j);
    }
  }
  return out;
}

// Dynamic-size counterpart of normMatrix: returns the sum of the squared
// entries of m (no square root is taken).
double normMatrixXd( const MatrixXd& m )
{
  double out = 0.0;
  for( int i = 0 ; i < m.rows() ; i++ )
  {
    for( int j = 0 ; j < m.cols(); j++)
    {
      out += m(i,j)* m(i,j);
    }
  }
  return out;
}

// Returns normMatrix<T> applied to the inverse of the fixed-size matrix m.
template<int T>
double normInverseMatrix( const Matrix<double,T,T,RowMajor>& m )
{
  return normMatrix<T>(m.inverse());
}

// Returns normMatrixXd applied to the inverse of the dynamic-size matrix m.
double normInverseMatrixXd(const MatrixXd& m )
{
  MatrixXd inv = m.inverse(); // materialise the inverse before taking the norm
  return normMatrixXd(inv);
}


// Solves m * x = v with a full-pivoting LU decomposition, where v is the
// column vector (0, 1, ..., T-1), and returns normVector<T> of the solution x.
template<int T>
double normSolveMatrix( const  Matrix<double,T,T,RowMajor>& m)
{
  Matrix<double,T,1> v;
  for( int i = 0 ; i < m.cols() ; i++)
  {
    v(i,0) = i; // right-hand side entry i is simply i
  }
  Matrix<double,T,1> sol = m.fullPivLu().solve(v);
  return normVector<T>(sol);
}


// Dynamic-size counterpart of normSolveMatrix: solves m * x = v with a
// full-pivoting LU decomposition, where v is the column (0, 1, ..., cols-1),
// and returns normMatrixXd of the solution.
double normSolveMatrixXd( const MatrixXd& m)
{
  MatrixXd v(m.cols(),1);
  for( int i = 0 ; i < m.cols() ; i++)
  {
    v(i,0) = i; // right-hand side entry i is simply i
  }
  return normMatrixXd(m.fullPivLu().solve(v));
}

// Returns normMatrixXd applied to m.exp(), the matrix exponential provided by
// the unsupported Eigen MatrixFunctions module.
double normExpMatrixXd( const MatrixXd& m)
{
  return normMatrixXd(m.exp());
}

// Fixed-size counterpart of normExpMatrixXd: returns normMatrix<T> applied to
// m.exp(), the matrix exponential from the unsupported MatrixFunctions module.
template<int T>
double normExpMatrix( const  Matrix<double,T,T,RowMajor>& m )
{
  return normMatrix<T>(m.exp());
}

template< int T>
void testMatrix()
{
  Matrix<double,T,T,RowMajor> m;
  for( int i = 0; i < T ; i++)
  {
    for( int j = 0 ; j < T ;j++)
    {
      m(i,j) = (i+j)*(i+j);
    }
  }

  Matrix<double,T,T,RowMajor> dm;
  for( int i = 0; i < T ; i++)
  {
    for( int j = 0 ; j < T ;j++)
    {
      dm(i,j) = 0.0;
    }
  }
  std::cout <<"m : " << std::endl;
  std::cout << m << std::endl;
  std::cout <<"m.inverse() : " << std::endl;
  std::cout << m.inverse() << std::endl;
  std::cout << "normSolveMatrix "<< std::endl;
  std::cout << normSolveMatrix<T>(m) << std::endl;
  std::cout << "normExpMatrix "<< std::endl;
  std::cout << normExpMatrix(m) << std::endl;
  __enzyme_autodiff(normMatrix<T>, enzyme_dup, &m,&dm); // Works
  __enzyme_autodiff(normInverseMatrix<T>, enzyme_dup, &m,&dm);//Hangs compilation
  __enzyme_autodiff(normSolveMatrix<T>, enzyme_dup, &m,&dm);//Hangs compilation
  __enzyme_autodiff(normExpMatrix<T>, enzyme_dup, &m,&dm);//Hangs compilation

  std::cout << dm << std::endl;


}

// Dynamic-size driver: builds a T-by-T MatrixXd, prints a few reference
// values, then asks Enzyme to differentiate each norm helper with respect to
// the matrix and prints the accumulated shadow.
//
// T is the number of rows and columns of the test matrix.
void testMatrixXd( int T )
{

  // Primal input: m(i,j) = (i+j)^2.
  MatrixXd m(T,T);
  for( int i = 0; i < T ; i++)
  {
    for( int j = 0 ; j < T ;j++)
    {
      m(i,j) = (i+j)*(i+j);
    }
  }

  // Shadow of m, zero-filled before differentiation.
  MatrixXd dm(T,T);
  for( int i = 0; i < T ; i++)
  {
    for( int j = 0 ; j < T ;j++)
    {
      dm(i,j) = 0.0;
    }
  }
  std::cout <<"m : " << std::endl;
  std::cout << m << std::endl;
  std::cout <<"m.inverse() : " << std::endl;
  std::cout << m.inverse() << std::endl;
  // The helpers take `const MatrixXd&`, so the matrix itself is passed here:
  // `&m` is a MatrixXd* and has no implicit conversion to that parameter.
  std::cout << "normSolveMatrix "<< std::endl;
  std::cout << normSolveMatrixXd(m) << std::endl;
  std::cout << "normExpMatrix "<< std::endl;
  std::cout << normExpMatrixXd(m) << std::endl;
  // In the variadic __enzyme_autodiff calls the matrix is passed as a
  // primal/shadow pair of addresses, so `&m, &dm` is intended there.
  __enzyme_autodiff(normMatrixXd, enzyme_dup, &m,&dm); // Works
  __enzyme_autodiff(normInverseMatrixXd, enzyme_dup, &m,&dm); //Hangs compilation
  __enzyme_autodiff(normSolveMatrixXd, enzyme_dup, &m,&dm); //Hangs compilation
  __enzyme_autodiff(normExpMatrixXd, enzyme_dup, &m,&dm); //Hangs compilation

  std::cout << dm << std::endl;


}

// Entry point of the reproducer: runs the fixed-size driver and then the
// dynamic-size driver for sizes 3, 4 and 5.
int main()
{
  testMatrix<3>();
  testMatrix<4>();
  testMatrix<5>(); // per the reporter: Eigen has no more formulas for matrix inversion when n = 5

  testMatrixXd(3);
  testMatrixXd(4);
  testMatrixXd(5);

  return 0;
}

Compilation, provided that you have Eigen3 installed (on Ubuntu: apt-get install libeigen3-dev):
clang testmatrix.cpp -I/usr/include/eigen3/ -lstdc++ -lm -Xclang -load -Xclang /usr/local/lib/ClangEnzyme-11.so -O2 -o testMatrix -fno-exceptions

I also tried adding the flag -mllvm -enzyme-max-type-offset=20, which helped in the past when compilation was hanging, but it had no effect this time.

enzymead / enzyme Goto Github PK

enzyme's People

Contributors

Stargazers

Watchers

Forkers

enzyme's Issues

Recommend Projects

Recommend Topics

Recommend Org