Git Product home page Git Product logo

hdc's Introduction

Passionate Software Engineering student eager to work on new projects and with new technologies. With a problem-solving approach and teamwork skills, I like tackling new challenges and constantly pushing myself to learn and grow, both professionally and as a person.

hdc's People

Contributors

ignaciosica avatar

Watchers

 avatar

hdc's Issues

Deep Learning

  • 5.1 Learning Algorithms => 99
  • 5.2 Capacity, Overfitting and Underfitting => 110
  • 5.3 Hyperparameters and Validation Sets => 120
  • 5.4 Estimators, Bias and Variance => 122
  • 5.5 Maximum Likelihood Estimation => 131
  • 5.6 Bayesian Statistics => 135
  • 5.7 Supervised Learning Algorithms => 140
  • 5.8 Unsupervised Learning Algorithms => 146
  • 5.9 Stochastic Gradient Descent => 151
  • 5.10 Building a Machine Learning Algorithm => 153
  • 5.11 Challenges Motivating Deep Learning => 155

AMX as TC

Kernels generated for AMXish core ((N),(N),(N,N)) with N = 4 for visualization
note: rendering was not changed, only for visualization purposes

matmul relu kernel

/*
 * Generated kernel: accumulates a 4x4 tile per (ridx0, ridx1) pair over a
 * 4096-step reduction, clamps the result at zero (ReLU) and stores it.
 *
 *   data0  output; written with four float4 stores per tile, 4096 floats apart.
 *   data1  input; read as four scalar floats per step, 4096 floats apart.
 *   data2  input; read as one float4 per step.
 *
 * The largest index touched in each buffer is 4096*4096 - 1, so each buffer
 * must hold at least 4096*4096 floats.
 * NOTE(review): presumably all three are row-major 4096x4096 matrices and
 * data0 = relu(data1 x data2) -- confirm against the code that generated this.
 * float4, float16, make_float4, make_float16 and __WMMA_4_4_4_float_float are
 * not defined in this snippet.
 */
void r_1024_1024_4096_4_4(float* restrict data0, const float* restrict data1, const float* restrict data2) {
  for (int ridx0 = 0; ridx0 < 1024; ridx0++) {
    // 16384 == 4*4096: assuming a row stride of 4096, each ridx0 covers 4 rows.
    int alu0 = (ridx0*16384);
    for (int ridx1 = 0; ridx1 < 1024; ridx1++) {
      // Each ridx1 covers 4 consecutive floats (one float4).
      int alu1 = (ridx1*4);
      // Offset of the first float4 store into data0 for this tile.
      int alu2 = (alu0+alu1);
      // acc0..acc3 hold the four float4 groups of the 4x4 tile; all start at zero.
      float4 acc0 = make_float4(0.0f,0.0f,0.0f,0.0f);
      float4 acc1 = make_float4(0.0f,0.0f,0.0f,0.0f);
      float4 acc2 = make_float4(0.0f,0.0f,0.0f,0.0f);
      float4 acc3 = make_float4(0.0f,0.0f,0.0f,0.0f);
      // Reduction loop: one WMMA call per step.
      for (int ridx2 = 0; ridx2 < 4096; ridx2++) {
        int alu3 = (alu0+ridx2);
        // One float4 from data2 at offset alu1 + ridx2*4096.
        float4 val0 = *((float4*)(data2+alu1+(ridx2*4096)));
        // Four scalars from data1 at alu3, alu3+4096, alu3+8192, alu3+12288
        // (declared out of order; packed below as val4,val1,val2,val3).
        float val1 = data1[alu3+4096];
        float val2 = data1[alu3+8192];
        float val3 = data1[alu3+12288];
        float val4 = data1[alu3];
        // The third argument is an all-zero float16.
        // NOTE(review): presumably that is the accumulator input, so the call
        // returns only this step's 4x4 product of the two float4 operands --
        // confirm against the intrinsic's definition.
        float16 wmma0 = __WMMA_4_4_4_float_float(make_float4(val4,val1,val2,val3), val0, 
                                                 make_float16(0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f));
        // Element 4*r + c of wmma0 is added to lane c (x,y,z,w) of acc<r>.
        acc0 = make_float4((acc0.x+wmma0[0]),(acc0.y+wmma0[1]),(acc0.z+wmma0[2]),(acc0.w+wmma0[3]));
        acc1 = make_float4((acc1.x+wmma0[4]),(acc1.y+wmma0[5]),(acc1.z+wmma0[6]),(acc1.w+wmma0[7]));
        acc2 = make_float4((acc2.x+wmma0[8]),(acc2.y+wmma0[9]),(acc2.z+wmma0[10]),(acc2.w+wmma0[11]));
        acc3 = make_float4((acc3.x+wmma0[12]),(acc3.y+wmma0[13]),(acc3.z+wmma0[14]),(acc3.w+wmma0[15]));
      }
      // ReLU: (v > 0.0f) ? v : 0.0f. A NaN fails the comparison and becomes 0.0f.
      // alu4..alu7 are the .x lanes of acc0..acc3, alu8..alu11 the .y lanes,
      // alu12..alu15 the .z lanes and alu16..alu19 the .w lanes.
      float alu4 = ((acc0.x>0.0f)?acc0.x:0.0f);
      float alu5 = ((acc1.x>0.0f)?acc1.x:0.0f);
      float alu6 = ((acc2.x>0.0f)?acc2.x:0.0f);
      float alu7 = ((acc3.x>0.0f)?acc3.x:0.0f);
      float alu8 = ((acc0.y>0.0f)?acc0.y:0.0f);
      float alu9 = ((acc1.y>0.0f)?acc1.y:0.0f);
      float alu10 = ((acc2.y>0.0f)?acc2.y:0.0f);
      float alu11 = ((acc3.y>0.0f)?acc3.y:0.0f);
      float alu12 = ((acc0.z>0.0f)?acc0.z:0.0f);
      float alu13 = ((acc1.z>0.0f)?acc1.z:0.0f);
      float alu14 = ((acc2.z>0.0f)?acc2.z:0.0f);
      float alu15 = ((acc3.z>0.0f)?acc3.z:0.0f);
      float alu16 = ((acc0.w>0.0f)?acc0.w:0.0f);
      // Clamped acc0 -> data0[alu2 .. alu2+3].
      *((float4*)(data0+alu2)) = make_float4(alu4,alu8,alu12,alu16);
      float alu17 = ((acc1.w>0.0f)?acc1.w:0.0f);
      // Clamped acc1 -> 4096 floats further on.
      *((float4*)(data0+4096+alu2)) = make_float4(alu5,alu9,alu13,alu17);
      float alu18 = ((acc2.w>0.0f)?acc2.w:0.0f);
      // Clamped acc2 -> 8192 floats further on.
      *((float4*)(data0+8192+alu2)) = make_float4(alu6,alu10,alu14,alu18);
      float alu19 = ((acc3.w>0.0f)?acc3.w:0.0f);
      // Clamped acc3 -> 12288 floats further on.
      *((float4*)(data0+12288+alu2)) = make_float4(alu7,alu11,alu15,alu19);
    }
  }
}

Kernel with the rendering changed

/*
 * Generated kernel: same loop nest, loads, ReLU clamp and stores as a
 * tile-accumulating matmul, but the per-step accumulation is expressed through
 * AMX_SET / __WMMA_4_4_4_float_float / __ST_float32 instead of C arithmetic.
 *
 *   data0  output; written with four float4 stores per tile, 4096 floats apart.
 *   data1  input; read as four scalar floats per step, 4096 floats apart.
 *   data2  input; read as one float4 per step.
 *
 * The largest index touched in each buffer is 4096*4096 - 1, so each buffer
 * must hold at least 4096*4096 floats.
 * NOTE(review): presumably all three are row-major 4096x4096 matrices and
 * data0 = relu(data1 x data2) -- confirm against the code that generated this.
 * float4, make_float4, AMX_SET, __WMMA_4_4_4_float_float and __ST_float32 are
 * not defined in this snippet; every statement about them below is an
 * assumption to verify.
 */
void r_1024_1024_4096_4_4(float* restrict data0, const float* restrict data1, const float* restrict data2) {
  for (int ridx0 = 0; ridx0 < 1024; ridx0++) {
    // 16384 == 4*4096: assuming a row stride of 4096, each ridx0 covers 4 rows.
    int alu0 = (ridx0*16384);
    for (int ridx1 = 0; ridx1 < 1024; ridx1++) {
      // Each ridx1 covers 4 consecutive floats (one float4).
      int alu1 = (ridx1*4);
      // Offset of the first float4 store into data0 for this tile.
      int alu2 = (alu0+alu1);
      // acc0..acc3 start at zero. In this kernel they are only ever written
      // through the __ST_float32 calls below, never by plain C arithmetic.
      float4 acc0 = make_float4(0.0f,0.0f,0.0f,0.0f);
      float4 acc1 = make_float4(0.0f,0.0f,0.0f,0.0f);
      float4 acc2 = make_float4(0.0f,0.0f,0.0f,0.0f);
      float4 acc3 = make_float4(0.0f,0.0f,0.0f,0.0f);
      // NOTE(review): presumably enables the AMX unit (and may reset its
      // accumulator state) before the reduction -- confirm.
      AMX_SET(1);
      for (int ridx2 = 0; ridx2 < 4096; ridx2++) {
        int alu3 = (alu0+ridx2);
        // One float4 from data2 at offset alu1 + ridx2*4096.
        float4 val0 = *((float4*)(data2+alu1+(ridx2*4096)));
        // Four scalars from data1 at alu3, alu3+4096, alu3+8192, alu3+12288
        // (declared out of order; packed below as val4,val1,val2,val3).
        float val1 = data1[alu3+4096];
        float val2 = data1[alu3+8192];
        float val3 = data1[alu3+12288];
        float val4 = data1[alu3];
        // Two operands, no accumulator argument and no return value.
        // NOTE(review): presumably accumulates the 4x4 product into state held
        // by the AMX unit -- confirm.
        __WMMA_4_4_4_float_float(make_float4(val4,val1,val2,val3), val0);
        // NOTE(review): presumably copies accumulator row N into acc<N>.
        // These four calls run on every reduction step; if the accumulator
        // persists across iterations, only the final step's stores matter and
        // they could move after the loop -- confirm before changing.
        __ST_float32(&acc0, 0);
        __ST_float32(&acc1, 1);
        __ST_float32(&acc2, 2);
        __ST_float32(&acc3, 3);
      }
      // NOTE(review): presumably disables the AMX unit again -- confirm.
      AMX_SET(0);
      // ReLU: (v > 0.0f) ? v : 0.0f. A NaN fails the comparison and becomes 0.0f.
      // alu4..alu7 are the .x lanes of acc0..acc3, alu8..alu11 the .y lanes,
      // alu12..alu15 the .z lanes and alu16..alu19 the .w lanes.
      float alu4 = ((acc0.x>0.0f)?acc0.x:0.0f);
      float alu5 = ((acc1.x>0.0f)?acc1.x:0.0f);
      float alu6 = ((acc2.x>0.0f)?acc2.x:0.0f);
      float alu7 = ((acc3.x>0.0f)?acc3.x:0.0f);
      float alu8 = ((acc0.y>0.0f)?acc0.y:0.0f);
      float alu9 = ((acc1.y>0.0f)?acc1.y:0.0f);
      float alu10 = ((acc2.y>0.0f)?acc2.y:0.0f);
      float alu11 = ((acc3.y>0.0f)?acc3.y:0.0f);
      float alu12 = ((acc0.z>0.0f)?acc0.z:0.0f);
      float alu13 = ((acc1.z>0.0f)?acc1.z:0.0f);
      float alu14 = ((acc2.z>0.0f)?acc2.z:0.0f);
      float alu15 = ((acc3.z>0.0f)?acc3.z:0.0f);
      float alu16 = ((acc0.w>0.0f)?acc0.w:0.0f);
      // Clamped acc0 -> data0[alu2 .. alu2+3].
      *((float4*)(data0+alu2)) = make_float4(alu4,alu8,alu12,alu16);
      float alu17 = ((acc1.w>0.0f)?acc1.w:0.0f);
      // Clamped acc1 -> 4096 floats further on.
      *((float4*)(data0+4096+alu2)) = make_float4(alu5,alu9,alu13,alu17);
      float alu18 = ((acc2.w>0.0f)?acc2.w:0.0f);
      // Clamped acc2 -> 8192 floats further on.
      *((float4*)(data0+8192+alu2)) = make_float4(alu6,alu10,alu14,alu18);
      float alu19 = ((acc3.w>0.0f)?acc3.w:0.0f);
      // Clamped acc3 -> 12288 floats further on.
      *((float4*)(data0+12288+alu2)) = make_float4(alu7,alu11,alu15,alu19);
    }
  }
}

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games, to make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.