
Profiling master & inv3 & debug

Let's profile

  • massif profiles the heap in virtual memory
  • RSS is the memory actually resident in physical memory
  • so massif's numbers do not directly correspond to RSS (see the snippet below)
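
To sanity-check massif's virtual-memory numbers against what is actually held in RAM, one quick option is to read VmSize/VmRSS from /proc/self/status at interesting points (Linux-only sketch; not part of massif itself):

// rss_check.cc -- minimal Linux-only sketch: VmSize (virtual) vs VmRSS (physical),
// i.e. the same distinction as massif's numbers vs rss.
#include <fstream>
#include <iostream>
#include <string>

int main() {
  std::ifstream status("/proc/self/status");
  std::string line;
  while (std::getline(status, line)) {
    // VmSize: total virtual memory; VmRSS: pages resident in physical memory.
    if (line.rfind("VmSize:", 0) == 0 || line.rfind("VmRSS:", 0) == 0)
      std::cout << line << "\n";
  }
  return 0;
}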

setup for the internal network

sudo dpkg-reconfigure ca-certificates

sudo update-ca-certificates

git config --global http.proxy $PROXY
git config --global https.proxy $PROXY

git config --system http.sslcainfo /etc/ssl/certs/SRnD_Web_Proxy_new.crt

git config --global https.sslVerify false

FCLayer -> cker -> ruy

fc_hybrid

Operator 0: FULLY_CONNECTED
        Fused Activation: NONE
        Input Tensors[0, 41, 25]                                                                           
                Tensor    0 : buffer   43 |  Empty | FLOAT32 | Memory 15.6K  | Shape [4, 1000] (Placeholder_10)   
                Tensor   41 : buffer   14 | Filled | UINT8   | Memory 93.8K  | Shape [96, 1000] (s_transformed/kernel/u/transpose)
                Tensor   25 : buffer    1 | Filled | FLOAT32 | Memory 384.0B | Shape [96] (decoder_1/enc_ctx/Tensordot/MatMul_bias)
        Output Tensors[36]
                Tensor   36 : buffer   21 |  Empty | FLOAT32 | Memory 1.5K   | Shape [4, 96] (decoder_1/s_transformed/MatMul)

Operator 2: FULLY_CONNECTED                                                                                         
        Fused Activation: NONE
        Input Tensors[1, 26, 25]
                Tensor    1 : buffer   23 |  Empty | FLOAT32 | Memory 48.0K  | Shape [4, 2, 1536] (Placeholder_9)
                Tensor   26 : buffer    6 | Filled | UINT8   | Memory 144.0K | Shape [96, 1536] (decoder_1/enc_ctx/Tensordot/Reshape_1/transpose)
                Tensor   25 : buffer    1 | Filled | FLOAT32 | Memory 384.0B | Shape [96] (decoder_1/enc_ctx/Tensordot/MatMul_bias)
        Output Tensors[24]                                                                                 
                Tensor   24 : buffer   13 |  Empty | FLOAT32 | Memory 3.0K   | Shape [8, 96] (decoder_1/enc_ctx/Tensordot/MatMul)

// CPU=NHWC, NHW, HW
// Y = aX+b
// output: h=8, w=96
// input: batch(n)=4, h=2, w=1536 -> flattened to [8, 1536]
// weight: h=96, w=1536 -> (later) h=1536 w=96
// bias: h=96
// [8,96] = [4,2,1536]*[96,1536]+[96]
// [8,96] = 4[2,1536]*[1536,96]+[96]
// [8,96] = 4[2,96]+[96]
// [8,96] = [8,96]+[96] = 8[96] + [96]
// weight * input = output (ruy treats input/output as column-major)
// [96,1536] * [1536,8] = [96,8]  (== [8,96] row-major)
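
As a plain-float reference for the shape arithmetic above (shapes follow Operator 2: input [4,2,1536] flattened to [8,1536], weight [96,1536], bias [96], output [8,96]), a naive FULLY_CONNECTED would be (sketch, not the cker code):

// output[b][u] = sum_i input[b][i] * weight[u][i] + bias[u]
// batch = 8 (4*2 flattened), input_size = 1536, num_units = 96
void FullyConnectedRef(const float *input, const float *weight, const float *bias,
                       float *output, int batch, int input_size, int num_units) {
  for (int b = 0; b < batch; ++b) {
    for (int u = 0; u < num_units; ++u) {
      float acc = bias ? bias[u] : 0.0f;
      for (int i = 0; i < input_size; ++i)
        acc += input[b * input_size + i] * weight[u * input_size + i];
      output[b * num_units + u] = acc;
    }
  }
}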

// FullyConnectedLayer.cc
void FullyConnectedLayer::fullyConnectedHybrid() {
  // ...
  nnfw::cker::FullyConnectedHybrid(
      op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
      getTensorShape(_weights), reinterpret_cast<const int8_t *>(_weights->buffer()),
      getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
      getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena);
}
// FullyConnected.h
inline void FullyConnectedHybrid(const FullyConnectedParams &params,
	const Shape &input_shape, const float *input_data,
	const Shape &filter_shape, const int8_t *filter_data, // weights
	const Shape &, const float *bias_data,
	const Shape &output_shape, float *output_data,
	FCTempArena &temp_arena) {

  int total_input_size = input_shape.FlatSize();
  const int input_size = filter_shape.Dims(1);
  const int batch_size = total_input_size / input_size;
  const int num_units = filter_shape.Dims(0);

  // ...

  // Quantize input from float to uint8 + quantization params (scaling factor).
  float *scaling_factors_ptr = temp_arena.scaling_factors.data();
  int8_t *quant_data = temp_arena.input_quantized.data();

  // ...

  auto output_size = output_shape.FlatSize();
  temp_arena.accum_scratch.resize(output_size);
  int32_t *scratch = temp_arena.accum_scratch.data();
  MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
                                      scaling_factors_ptr, batch_size, scratch, output_data,
                                      /*result_stride=*/1);

}
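
In rough terms, the hybrid path quantizes each input row to int8 with a per-row scaling factor, runs an int8 GEMM against the int8 weights into an int32 scratch buffer, and scales the result back to float. A minimal sketch of that idea (not the actual cker/ruy code; per-channel weight scales are ignored and a single weight_scale is assumed):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Hybrid FC sketch: float input, int8 weights (with one float scale), float output.
void FullyConnectedHybridSketch(const float *input, int batch, int input_size,
                                const int8_t *weight, float weight_scale, int num_units,
                                const float *bias, float *output) {
  std::vector<int8_t> quant(input_size);
  for (int b = 0; b < batch; ++b) {
    // 1. Per-row quantization: map the row's max |x| to 127.
    float max_abs = 0.f;
    for (int i = 0; i < input_size; ++i)
      max_abs = std::max(max_abs, std::fabs(input[b * input_size + i]));
    const float scaling_factor = (max_abs == 0.f) ? 1.f : max_abs / 127.f;
    for (int i = 0; i < input_size; ++i)
      quant[i] = static_cast<int8_t>(std::lrintf(input[b * input_size + i] / scaling_factor));

    // 2. int8 x int8 -> int32 accumulation (this is what ruy does into the scratch buffer).
    // 3. Dequantize: multiply by scaling_factor * weight_scale and add the float bias.
    for (int u = 0; u < num_units; ++u) {
      int32_t acc = 0;
      for (int i = 0; i < input_size; ++i)
        acc += static_cast<int32_t>(weight[u * input_size + i]) * quant[i];
      output[b * num_units + u] =
          acc * scaling_factor * weight_scale + (bias ? bias[u] : 0.f);
    }
  }
}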
// NeonTensorUtils.h
void NeonMatrixBatchVectorMultiplyAccumulate(
	const int8_t *__restrict__ matrix, // weights (filter_data)
	const int m_rows, // rows of the weight matrix = num_units
	const int m_cols, // cols of the weight matrix = input_size
	const int8_t *__restrict__ vectors, // quant_data from the inputs
	const float *scaling_factors, // per-batch input scaling factors
	int n_batch,
	int32_t *scratch, // temporary/intermediate output
	float *__restrict__ result, int result_stride) // output_data
{
  if (m_rows % 4 == 0 && result_stride == 1)
  {
    const int32_t *bias = static_cast<const int32_t *>(nullptr);
    NeonCpuBackendGemm(vectors, bias, matrix, n_batch, m_cols, m_rows,
                       /*output_zp =*/0, scratch);
    // ...
  }
}
// NeonTensorUtils.h
void NeonCpuBackendGemm(const int8_t *input, // quant_data or vectors from inputs
			const int32_t *bias, 
                        const int8_t *input_to_gate_weights, // weights or filter_data or matrix
			int32_t n_batch,
			int32_t n_input,  // cols of the weights (input_size)
			int32_t n_output, // rows of the weights (num_units)
			int32_t, int32_t *scratch) // scratch = weights * input
{
  // ...
  // [n_output,n_batch] = [n_output,n_input]*[n_input,n_batch]
  ruy::Matrix<int8_t> ruy_lhs; // weights, int8, RowMajor, [n_output,n_input] == [row_of_weight,col_of_weight]
  ruy::Matrix<int8_t> ruy_rhs; // input, int8, ColMajor, [n_input, n_batch] = [row_of_weight,batch]
  ruy::Matrix<int32_t> ruy_dst; // output int32, ColMajor, [n_output, n_batch] = [row_of_weight,batch]
  ruy_support::MakeRuyMatrix(lhs_params, input_to_gate_weights, &ruy_lhs);
  ruy_support::MakeRuyMatrix(rhs_params, input, &ruy_rhs);
  ruy_support::MakeRuyMatrix(dst_params, scratch, &ruy_dst);

  ruy::BasicSpec<int32_t, int32_t> ruy_spec;
  ruy_support::MakeRuySpec(gemm_params, &ruy_spec);

  constexpr ruy::Path kRuyPath = ruy::kAllPaths;
  ruy::Mul<kRuyPath>(ruy_lhs, ruy_rhs, ruy_spec, ruy_context, &ruy_dst);
}
// ruy/dispatch.h
template <...>
void DispatchMul(const Matrix<LhsScalar>& lhs, // [n_output,n_input], RowMajor, int8
                 const Matrix<RhsScalar>& rhs, // [n_input,n_batch], ColMajor, int8
                 const Spec& spec, Context* context,
                 Matrix<DstScalar>* dst) {     // [n_output,n_batch], ColMajor, int32
  // ...
  Matrix<LhsScalar> transposed_lhs(lhs);
  Transpose(&transposed_lhs); // [n_input,n_output], ColMajor, int8
  TrMulParams params;
  CreateTrMulParams<TrMulCompiledPaths>(transposed_lhs, rhs, spec, dst,
                                        the_path, &params);
  SidePair<bool> cacheable(lhs.cacheable, rhs.cacheable);
  HandlePrepackedCaching(&params, cacheable, context);
  TrMul(&params, context);
}
// ruy/dispatch.h
inline void HandlePrepackedCaching(TrMulParams* params,
                                   const SidePair<bool>& cacheable,
                                   Context* context) {
  if (context->cache_policy == CachePolicy::kNoCache) {
    return;
  }

  if (context->cache_policy == CachePolicy::kCacheLHSOnNarrowMul) {
    // dst: [n_output,n_batch] -> n_batch
    if (!cacheable[Side::kLhs] || params->dst.layout.cols > 4) {
      return;
    }
    PrepackedCache* prepacked_cache = context->GetPrepackedCache();
    // cache_key is data address!!!
    auto cache_key = std::make_pair(reinterpret_cast<void*>(params->run_kernel),
                                    params->src[Side::kLhs].data);
    auto it = prepacked_cache->FindAndUpdate(cache_key);
    if (it != prepacked_cache->cend()) {
      params->packed[Side::kLhs].data = it->second.first.data;
      params->packed[Side::kLhs].sums = it->second.first.sums;
      params->is_prepacked[Side::kLhs] = true;
      return;
    }

    // Allocate the prepacked matrix.
    PrepackedMatrix prepacked_lhs;
    prepacked_lhs.data_size = DataSize(params->packed[Side::kLhs]);
    prepacked_lhs.sums_size = SumsSize(params->packed[Side::kLhs]);
    prepacked_cache->AllocatePrepackedMatrix(&prepacked_lhs);
    params->packed[Side::kLhs].data = prepacked_lhs.data;
    params->packed[Side::kLhs].sums = prepacked_lhs.sums;
    params->is_prepacked[Side::kLhs] = true;
    Tuning tuning = context->GetMainThreadTuning();
    params->RunPack(Side::kLhs, tuning, 0,
                    params->packed[Side::kLhs].layout.cols);
    prepacked_cache->Insert(cache_key, prepacked_lhs);
    return;
  }
}
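
The detail worth noticing here is that the cache key is the pair (kernel function, lhs data address), not the weight contents. A toy illustration of that kind of lookup, using a plain std::map rather than ruy's PrepackedCache:

#include <cstdint>
#include <map>
#include <utility>
#include <vector>

// Toy prepacked cache keyed by (kernel function, weight data address).
// If the original weight buffer is freed or its address is reused, the key
// no longer identifies the same weights even though an entry still exists.
using CacheKey = std::pair<const void *, const void *>;
std::map<CacheKey, std::vector<int8_t>> g_prepacked;

const std::vector<int8_t> *FindOrPack(const void *kernel, const int8_t *weights, int size) {
  CacheKey key{kernel, weights};
  auto it = g_prepacked.find(key);
  if (it != g_prepacked.end()) return &it->second;             // cache hit: reuse packed lhs
  std::vector<int8_t> packed(weights, weights + size);         // "packing" is just a copy here
  return &g_prepacked.emplace(key, std::move(packed)).first->second;
}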

bazel

How does Bazel work?

When running a build or a test, Bazel does the following:

  • Loads the BUILD files relevant to the target.
  • Analyzes the inputs and their dependencies, applies the specified build rules, and produces an action graph.
  • Executes the build actions on the inputs until the final build outputs are produced.

Since all previous build work is cached, Bazel can identify and reuse cached artifacts and only rebuild or retest what’s changed. To further enforce correctness, you can set up Bazel to run builds and tests hermetically through sandboxing, minimizing skew and maximizing reproducibility.

https://docs.bazel.build/versions/master/build-ref.html
In a repository - WORKSPACE

  • Packages
    • a package: BUILD
      • targets = files + rules
        • files = source files + generated files
        • rules = input -> {rule} -> output

https://docs.bazel.build/versions/master/platforms-intro.html#android
Android

Bazel’s Android rules do not yet support platforms to select Android toolchains.

They do support setting --platforms to select NDK toolchains: see here.

Most importantly, --fat_apk_cpu, which builds multi-architecture fat APKs, does not work with platform-enabled C++. This is because it sets legacy flags like --cpu and --crosstool_top, which platform-enabled C++ rules don’t read. Until this is migrated, using --fat_apk_cpu with --platforms requires platform mappings.

https://docs.bazel.build/versions/master/android-ndk.html#integration-with-platforms-and-toolchains

.bazelrc
https://docs.bazel.build/versions/master/guide.html#bazelrc-the-bazel-configuration-file

ruy kernel nnapi segment fault

odroid@odroid:/home/dragon/Works/github/ONE$ USE_NNAPI=1 BACKENDS="cpu" ./Product/out/bin/tflite_run ./nnpkg_asr_models_multi_threaded/dec2/dec2.tflite -w0 -r2 -m0

nnapi function 'ANeuralNetworksModel_create' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 
nnapi function 'ANeuralNetworksModel_addOperand' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 
nnapi function 'ANeuralNetworksModel_setOperandValue' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 
nnapi function 'ANeuralNetworksModel_addOperation' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 
nnapi function 'ANeuralNetworksModel_identifyInputsAndOutputs' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 
nnapi function 'ANeuralNetworksModel_finish' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 
nnapi function 'ANeuralNetworksCompilation_create' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 
nnapi function 'ANeuralNetworksCompilation_finish' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 

virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x752e58 
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x752e58 hybrid 
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x752e58 batch_size > 4 
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x74c858 
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x74c858 hybrid 
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x74c858 batch_size > 4 
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x752570 
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x752570 hybrid 
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x752570 cached:  0x754a28 
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x753080 
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x753080 hybrid 
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x753080 cached:  0xb541f008 

input tensor indices = [1,0,]
nnapi function 'ANeuralNetworksExecution_create' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 
nnapi function 'ANeuralNetworksExecution_setInput' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 
nnapi function 'ANeuralNetworksExecution_setOutput' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 
nnapi function 'ANeuralNetworksExecution_startCompute' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 
nnapi function 'ANeuralNetworksEvent_wait' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 hybrid cache:  0x754a28 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 hybrid cache:  0x754a28 is done 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 tensor->decrease_ref() will run 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 tensor->ref is 0 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 tensor->decrease_ref() is done 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 hybrid cache:  0xb541f008 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 hybrid cache:  0xb541f008 is done 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 tensor->decrease_ref() will run 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 tensor->ref is 0 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 tensor->decrease_ref() is done 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752e58 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752e58 hybrid cache:  (nil) 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752e58 hybrid cache:  (nil) is done 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752e58 cached weight is nullptr 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x74c858 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x74c858 hybrid cache:  (nil) 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x74c858 hybrid cache:  (nil) is done 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x74c858 cached weight is nullptr 

nnapi function 'ANeuralNetworksEvent_free' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 
nnapi function 'ANeuralNetworksExecution_free' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so' 

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 hybrid cache:  0x754a28 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 hybrid cache:  0x754a28 is done 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 weight is already freed 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 hybrid cache:  0xb541f008 

Segmentation fault

odroid xu4

env setting

sudo dpkg-reconfigure tzdata 
sudo apt-get install ntp

nfs

[host]

mkdir ~/nfs
chmod 777 ~/nfs
sudo vi /etc/exports
 /home/dragon/nfs 10.113.76.107(rw,all_squash,async)
service nfs-kernel-server restart
service rpcbind restart

[client]

mkdir -p nfs
mkdir -p nnfw
mount -t nfs 10.113.79.41:/home/dragon/NNFW/nfs nfs
mount -t nfs 10.113.79.41:/home/dragon/NNFW/nnfw nnfw

/etc/fstab
10.113.79.41:/home/dragon/NNFW/nfs /root/nfs nfs auto,nofail,noatime,nolock,intr,tcp,actimeo=1800 0 0
10.113.79.41:/home/dragon/NNFW/nnfw /root/nnfw nfs auto,nofail,noatime,nolock,intr,tcp,actimeo=1800 0 0

mount -a

troubleshooting

root@odroid:~# mount -a
mount.nfs: requested NFS version or transport protocol is not supported

check host's nfs server

dragon@loki:~/Works/github/ONE$ sudo systemctl status nfs-kernel-server
● nfs-server.service - NFS server and services
   Loaded: loaded (/lib/systemd/system/nfs-server.service; enabled; vendor preset: enabled)
   Active: failed (Result: exit-code) since Fri 2020-07-24 15:21:29 KST; 42s ago
  Process: 15890 ExecStopPost=/usr/sbin/exportfs -f (code=exited, status=0/SUCCESS)
  Process: 15889 ExecStopPost=/usr/sbin/exportfs -au (code=exited, status=0/SUCCESS)
  Process: 15888 ExecStartPre=/usr/sbin/exportfs -r (code=exited, status=1/FAILURE)

 7월 24 15:21:29 loki exportfs[15888]: exportfs: /etc/exports [2]: Neither 'subtree_check' or 'no_subtree_check' specified for export "10.113.221.160:/home/dragon/Works/armnn-tflite".
 7월 24 15:21:29 loki exportfs[15888]:   Assuming default behaviour ('no_subtree_check').
 7월 24 15:21:29 loki exportfs[15888]:   NOTE: this default has changed since nfs-utils version 1.0.x
 7월 24 15:21:29 loki exportfs[15888]: exportfs: /etc/exports [3]: Neither 'subtree_check' or 'no_subtree_check' specified for export "10.113.221.160:/home/dragon/Works/github/ONE".
 7월 24 15:21:29 loki exportfs[15888]:   Assuming default behaviour ('no_subtree_check').
 7월 24 15:21:29 loki exportfs[15888]:   NOTE: this default has changed since nfs-utils version 1.0.x
 7월 24 15:21:29 loki exportfs[15888]: exportfs: Failed to stat /home/dragon/Works/github.sec/nnfw: No such file or directory
 7월 24 15:21:29 loki systemd[1]: nfs-server.service: Control process exited, code=exited status=1
 7월 24 15:21:29 loki systemd[1]: nfs-server.service: Failed with result 'exit-code'.
 7월 24 15:21:29 loki systemd[1]: Stopped NFS server and services.

dragon@loki:~/Works/github/ONE$ ls ../../github.sec
flatbuffers  microbixby_e2easr  NE10  nnfw_ci  ODIN-ASR  rss.sh  tensorflow  tensorflow_asr  tflite_run  tv_utils

dragon@loki:~/Works/github/ONE$ sudo vi /etc/exports # do something
dragon@loki:~/Works/github/ONE$ sudo service nfs-kernel-server restart

build

BUILD_TYPE=Release TARGET_ARCH=armv7l CROSS_BUILD=1 make -f Makefile.template configure build install
OPTIONS="-DEXPERIMENTAL_RUY_FEATURE=1" \
BUILD_TYPE=Release TARGET_ARCH=armv7l CROSS_BUILD=1 make -f Makefile.template configure build install

rss & vm data

master: https://github.com/YongseopKim/ONE/tree/pr/benchmark_additional_info
draft: https://github.com/YongseopKim/ONE/tree/test/use_external_data_pulled

http://core-analyzer.sourceforge.net/index_files/Page335.html

The main arena grows with sbrk(), which usually starts immediately after the executable's data section. A dynamic arena, on the other hand, uses mmap() with a fixed size (e.g. 64MB) and ensures the starting address is aligned to a multiple of that size. If the first heap is used up, a new fixed-size heap is mmap-ed and linked to the dynamic arena's heap data "struct malloc_state".

If the user's request exceeds a threshold (e.g. 128KB, which may be adjusted dynamically), ptmalloc calls mmap() to allocate the memory and munmap() when it is freed. This class of memory blocks is not linked to any heap data.

A non-main thread's stack is allocated by mmap, so it is measured as heap by valgrind --tool=massif --pages-as-heap=yes.
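
A quick glibc-only experiment to watch the mmap threshold in action (mallinfo() is glibc-specific and its counters are plain ints, so keep the sizes small):

#include <cstdio>
#include <cstdlib>
#include <malloc.h> // glibc-specific: mallinfo()

int main() {
  // glibc's default M_MMAP_THRESHOLD is 128 KiB (and may be adjusted dynamically).
  struct mallinfo before = mallinfo();
  void *small_block = malloc(64 * 1024);    // below threshold: served from an arena
  void *large_block = malloc(1024 * 1024);  // above threshold: served by mmap()
  struct mallinfo after = mallinfo();
  printf("small=%p large=%p\n", small_block, large_block);
  printf("mmapped bytes: before=%d after=%d\n", before.hblkhd, after.hblkhd);
  printf("arena bytes in use: before=%d after=%d\n", before.uordblks, after.uordblks);
  free(small_block);
  free(large_block); // munmap()ed right away, not kept on any heap free list
  return 0;
}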

enable tflite_2.2.0

Makefile
-> ./nnfw {configure|build} # if configure
-> infra/nnfw/command/configure
-> cmake infra/nnfw
-> infra/nnfw/CMakeLists.txt

infra/nnfw/CMakeLists.txt

  • infra/cmake/modules/*.cmake # like cmake features
  • macro(nnfw_find_package PREFIX) -> nnfw/cmake/packages/${PREFIX}Config.cmake
  • infra/nnfw/cmake/CfgOptionFlags.cmake # which build?
  • infra/nnfw/cmake/ApplyCompileFlags.cmake # compile options
  • compute/CMakeLists.txt
  • runtime/CMakeLists.txt
  • tests/CMakeLists.txt
  • tools/CMakeLists.txt

example)
compute/cker/CMakeLists.txt

  • nnfw_find_package(Ruy REQUIRED)
  • nnfw/cmake/packages/RuyConfig.cmake
    • nnfw/cmake/packages/RuySourceConfig.cmake # download ruy source
    • nnfw/cmake/packages/Ruy/CMakeLists.txt # build

runtime/libs/tflite/CMakeLists.txt

  • nnfw_find_package(TensorFlowLite QUIET)
  • nnfw/cmake/packages/TensorFlowLiteConfig.cmake
    • nnfw_find_package(...)
    • nnfw_find_package(TensorFlowSource)
      • download tensorflow v1.13.1
    • nnfw/cmake/packages/TensorFlowLite/CMakeLists.txt

compiler/nnkit-tflite/CMakeLists.txt

  • nnas_find_package(TensorFlowLite QUIET EXACT 1.12)
    • infra/cmake/packages/TensorFlowLite-1.12/TensorFlowLiteConfig.cmake

about package

package vs devel-package

debuginfo vs debugsource

Debuginfo packages

The debuginfo packages provide debugging information needed to provide human-readable names for binary code features. These packages contain .debug files, which contain DWARF debugging information. These files are installed to the /usr/lib/debug directory.

Debugsource packages

The debugsource packages contain the source files used for compiling the binary code. With both respective debuginfo and debugsource package installed, debuggers such as GDB or LLDB can relate the execution of binary code to the source code. The source code files are installed to the /usr/src/debug directory.

about linux kernel

Linux Standard Base Specification

ld: linker

vdso & vvar: https://lwn.net/Articles/615809/

  • vdso: a memory region belonging to the address space of every user-mode process
  • vvar: The values of the vvar variables are set from the values of other kernel variables not accessible to user-space code.

vectors
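
These regions show up directly in /proc/self/maps ([vectors] is the extra one on 32-bit ARM). A quick Linux-only check:

#include <fstream>
#include <iostream>
#include <string>

int main() {
  std::ifstream maps("/proc/self/maps");
  std::string line;
  while (std::getline(maps, line)) {
    // [vdso]/[vvar] exist in every user process; [vectors] appears on 32-bit ARM.
    if (line.find("[vdso]") != std::string::npos ||
        line.find("[vvar]") != std::string::npos ||
        line.find("[vectors]") != std::string::npos ||
        line.find("[stack]") != std::string::npos)
      std::cout << line << "\n";
  }
  return 0;
}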

interpreter -> on demand memory

  • terminology: interpreter -> on-demand memory
  • make the goal clearer
  • training only?
  • which models?
  • how far do the HW constraints go?
  • LLM INFERENCE?
  • how do we optimize for data that is fixed as input?
  • onert-micro, tf
  • on-device training: customization / personalization
  • differentiation from the baseline

Memory Format

Efficient PyTorch: Tensor Memory Format Matters

valgrind + cachegrind

memory format

  • https://pytorch.org/blog/tensor-memory-format-matters/#memory-formats-supported-by-pytorch-operators
  • Contiguous: Tensor memory is in the same order as the tensor’s dimensions.
  • ChannelsLast: Irrespective of the dimension order, the 2d (image) tensor is laid out as an HWC or NHWC (N: batch, H: height, W: width, C: channels) tensor in memory. The dimensions could be permuted in any order.
  • ChannelsLast3d: For 3d tensors (video tensors), the memory is laid out in THWC (Time, Height, Width, Channels) or NTHWC (N: batch, T: time, H: height, W: width, C: channels) format. The dimensions could be permuted in any order.
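
The difference between the two formats is only the stride order used to linearize (n, c, h, w). A small, generic illustration of the index math (not PyTorch code):

#include <cstddef>
#include <cstdio>

// Offset of element (n, c, h, w) in an N x C x H x W tensor for two memory formats.
size_t offset_contiguous(size_t n, size_t c, size_t h, size_t w,
                         size_t C, size_t H, size_t W) {
  // "Contiguous" (NCHW): w varies fastest, then h, then c, then n.
  return ((n * C + c) * H + h) * W + w;
}

size_t offset_channels_last(size_t n, size_t c, size_t h, size_t w,
                            size_t C, size_t H, size_t W) {
  // ChannelsLast (NHWC): c varies fastest, so all channels of one pixel are adjacent.
  return ((n * H + h) * W + w) * C + c;
}

int main() {
  const size_t C = 3, H = 2, W = 2;
  std::printf("NCHW offset of (n=0,c=2,h=1,w=1): %zu\n", offset_contiguous(0, 2, 1, 1, C, H, W));
  std::printf("NHWC offset of (n=0,c=2,h=1,w=1): %zu\n", offset_channels_last(0, 2, 1, 1, C, H, W));
  return 0;
}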

debug

gdb plugin - gef

https://gef.readthedocs.io/en/master/

cd odin_asr_data; gdb -ex 'source ./.gdbinit' --args OnDeviceE2EASR_tizen ./OnDeviceE2EASR.ko-KR.conf ./general.scp /tmp/log.log0 0 0
sh-3.2# cat .gdbinit 
source ~/peda/peda.py
source ~/Pwngdb/pwngdb.py
source ~/Pwngdb/angelheap/gdbinit.py

define hook-run
python
import angelheap
angelheap.init_angelheap()
end
end

thread

info threads
thread <tid>
thread apply all bt

break thread

b ruy::Pack8bitNeonOutOfOrder4Cols LWP 21869 if bartab > lim

`.vscode/tasks.json`

{
    // See https://go.microsoft.com/fwlink/?LinkId=733558
    // for the documentation about the tasks.json format
    "version": "2.0.0",
    "tasks": [
        {
            "label": "nncc configure",
            "type": "shell",
            "command": "${workspaceFolder}/nncc",
            "args": ["configure"]
        },
        {
            "label": "nncc build",
            "type": "shell",
            "command": "${workspaceFolder}/nncc",
            "args": ["build"],
            "dependsOn": ["nncc configure"]
        }
    ]
}

about tensorflow

$ python3 -c "import tensorflow as tf; print('TensorFlow version:', tf.__version__)"
...
TensorFlow version: 2.15.0

Investigate tflite v2.3.0

What -DTFLITE_WITH_RUY_GEMV means

  • Without it, the ruy kernel is applied only when batch_size >= 16.
  • With it, ruy is applied regardless of batch_size; the ASR models all have small batch sizes, so this flag was needed.
  • From a vanilla tflite point of view, not having it is the natural default.

profile tool - valgrind

massif

tools

sudo apt install valgrind
sudo apt install massif-visualizer

run

BACKENDS=cpu valgrind --tool=massif --pages-as-heap=no --detailed-freq=1 ./Product/out/bin/nnpackage_run benchmark_nnpkg_models/inception_v3
valgrind --tool=massif --pages-as-heap=yes --detailed-freq=1 ./Product/out/bin/tflite_run benchmark_nnpkg_models/inception_v3/inception_v3.tflite

options

--quiet --massif-out-file=<file>
--max-snapshots=1000

build ONE/tflite

xu4

BUILD_TYPE=Release TARGET_ARCH=armv7l CROSS_BUILD=1 make -f Makefile.template configure build install

android

#4

OPTIONS="-DDOWNLOAD_BOOST=ON -DBUILD_BOOST=ON -DBUILD_NNPACKAGE_RUN=ON -DBUILD_TFLITE_RUN=ON" \
BUILD_TYPE=Release TARGET_OS=android CROSS_BUILD=1 \
NDK_DIR=`pwd`/tools/cross/ndk/r20/ndk \
EXT_HDF5_DIR=`pwd`/hdf5_prebuilt \
make -f Makefile.template configure build install

x86-64

BUILD_TYPE=Release make -f Makefile.template configure build install

nnas build

NNAS_BUILD_PREFIX=build/release \
NNCC_WORKSPACE=build/release \
BUILD_TYPE=Release \
NPROC=4 \
./nnas create-package --preset 20220323 --prefix build/install

LM/AM

ViT(Vision Transformer)
MSA(Multi-head Self Attention)

onert_train

$ file ./Product/out/bin/onert_train
./Product/out/bin/onert_train: ELF 64-bit LSB pie executable, x86-64, version 1 (GNU/Linux), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=7481670079cd705859d016d652d5e9dbd17c42b6, for GNU/Linux 3.2.0, with debug_info, not stripped
$ ./Product/out/bin/onert_train
E: Require one of options modelfile, nnpackage, or path.
$ ./Product/out/bin/onert_train --help
onert_train

Usage: ./Product/out/bin/onert_train[model path] [<options>]


General options:
  -h [ --help ]                         Print available options
  --version                             Print version and exit immediately
  --nnpackage arg                       NN Package file(directory) name
  --modelfile arg                       NN Model filename
  --path arg                            NN Package or NN Modelfile path
  --export_path arg                     Path to export circle
  --load_input:raw arg                  NN Model Raw Input data file
                                        The datafile must have data for each input number.
                                        If there are 3 inputs, the data of input0 must exist as
                                        much as data_length, and the data for input1 and input2
                                        must be held sequentially as data_length.

  --load_expected:raw arg               NN Model Raw Expected data file
                                        (Same data policy with load_input:raw)

  -m [ --mem_poll ] arg (=0)            Check memory polling
  --epoch arg (=5)                      Epoch number (default: 5)
  --batch_size arg (=32)                Batch size (default: 32)
  --learning_rate arg (=0.00100000005)  Learning rate (default: 0.001)
  --loss arg (=0)                       Loss type
                                        0: MEAN_SQUARED_ERROR (default)
                                        1: CATEGORICAL_CROSSENTROPY

  --loss_reduction_type arg (=0)        Loss Reduction type
                                        0: AUTO (default)
                                        1: SUM_OVER_BATCH_SIZE
                                        2: SUM

  --optimizer arg (=0)                  Optimizer type
                                        0: SGD (default)
                                        1: Adam

  --metric arg (=-1)                    Metricy type
                                          Simply calculates the metric value using the variables
                                        (default: none)
                                        0: CATEGORICAL_ACCURACY

  --validation_split arg (=0)           Float between 0 and 1. Fraction of the training data to be
                                        used as validation data.
  -v [ --verbose_level ] arg (=0)       Verbose level
                                        0: prints the only result. Messages btw run don't print
                                        1: prints result and message btw run
                                        2: prints all of messages to print

  --output_sizes arg                    The output buffer size in JSON 1D array
                                        If not given, the model's output sizes are used
                                        e.g. '[0, 40, 2, 80]' to set 0th tensor to 40 and 2nd
                                        tensor to 80.
$ ./Product/out/bin/onert_train --version
onert_train (nnfw runtime: v1.27.0)

transformer

ONE test

nnapi

BACKENDS="cpu" ./Product/out/unittest/nnapi_gtest  --gtest_filter=-$(grep -v '#' "./Product/out/unittest/nnapi_gtest.skip.armv7l-linux.cpu" | tr '\n' ':')

tflite benchmark sleep/delay

tensorflow/lite/tools/benchmark/

  params.AddParam("run_delay", BenchmarkParam::Create<float>(-1.0f));

      CreateFlag<float>("run_delay", &params_, "delay between runs in seconds"),
// benchmark_model.cc
Stat<int64_t> BenchmarkModel::Run(int min_num_times, float min_secs,
                                  float max_secs, RunType run_type,
                                  TfLiteStatus* invoke_status) {
    // ...
  for (int run = 0; (run < min_num_times || now_us < min_finish_us) &&
                    now_us <= max_finish_us;
       run++) {
    ResetInputsAndOutputs();
    listeners_.OnSingleRunStart(run_type);
    int64_t start_us = profiling::time::NowMicros();
    TfLiteStatus status = RunImpl();
    int64_t end_us = profiling::time::NowMicros();
    listeners_.OnSingleRunEnd();

    run_stats.UpdateStat(end_us - start_us);
    util::SleepForSeconds(params_.Get<float>("run_delay")); // HERE
    // ...
  }
  // ...
}
// benchmark_utils.h
// A convenient function that wraps tflite::profiling::time::SleepForMicros and
// simply return if 'sleep_seconds' is negative.
void SleepForSeconds(double sleep_seconds);

// benchmark_utils.cc
void SleepForSeconds(double sleep_seconds) {
  if (sleep_seconds <= 0.0) {
    return;
  }
  // If requested, sleep between runs for an arbitrary amount of time.
  // This can be helpful to determine the effect of mobile processor
  // scaling and thermal throttling.
  tflite::profiling::time::SleepForMicros(
      static_cast<uint64_t>(sleep_seconds * 1e6));
}

Investigate tflite benchmark_model

binary

  • benchmark_model == tflite_run or nnpackage_run
  • benchmark_performance == test nnapi/gpu delegate

tools/benchmark/BUILD

"benchmark_model_main"

  • srcs: "benchmark_main.cc"
  • deps: benchmark_tflite_model_lib"

"benchmark_model"

  • deps: benchmark_model_main"

"benchmark_model_performance_options"

  • srcs: "benchmark_tflite_performance_options_main.cc"
  • deps: benchmark_performance_options", benchmark_tflite_model_lib"

...

Apply MMapedData into Tensor on cpu backend

ITensor: has uint8_t *_buffer

  • <- IPortableTensor: has no padding
    • <- cpu_common::Tensor
    • <- controlflow::UserTensor

cpu_common::Tensor // const_tensor or io_tensor

  • ir::OperandInfo: shape, static/dynamic, datatype, isconst
  • ir::Layout
  • uint8_t *_buffer or std::shared_ptr

If a class inherits IPortableTensor, it has no padding. The cpu backend supports only the NHWC layout.

ITensorRegistry: has ITensor

  • <- PortableTensorRegistryTemplate<T_Tensor>
    • has T_Tensor and IPortableTensor
    • managed: T_Tensor
    • external: IPortableTensor

ITensorBuilder

  • <- cpu::TensorBuilder
    • cpu_common::TensorRegistry == PortableTensorRegistryTemplate<cpu_common::Tensor>
    • cpu_common::StaticTensorManager
    • cpu_common::DynamicTensorManager

cpu_common::DynamicTensorManager -> IDynamicTensorManager # for dynamic tensor

  • DynamicMemoryManager _dynamic_mem_mgr
  • TensorRegistry _tensors

cpu_common::StaticTensorManager -> ITensorManager # for static tensor

  • DynamicMemoryManager _const_mgr
    • Map<Index,Allocator> _mem_alloc_map
  • MemoryManager _nonconst_mgr
    • Map<Index,Block> _tensor_mem_map
    • IMemoryPlanner _mem_planner
    • Allocator _mem_alloc
  • TensorRegistry _tensors

cpu_common::Allocator

  • uint8_t[] _base
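
A minimal sketch of the mmap side, assuming a hypothetical MMapedData helper whose mapped pointer would be handed to a const tensor as its external buffer (this is not the onert implementation; the names are made up for illustration):

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical helper: expose a read-only file as a uint8_t buffer, the way a
// const tensor on the cpu backend could point at mmap-ed weight data instead
// of a heap copy.
struct MMapedData {
  const uint8_t *base = nullptr;
  size_t size = 0;
  int fd = -1;

  bool open_file(const char *path) {
    fd = ::open(path, O_RDONLY);
    if (fd < 0) return false;
    struct stat st;
    if (fstat(fd, &st) != 0) return false;
    size = static_cast<size_t>(st.st_size);
    void *p = ::mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (p == MAP_FAILED) return false;
    base = static_cast<const uint8_t *>(p);
    return true;
  }
  void close_file() {
    if (base) ::munmap(const_cast<uint8_t *>(base), size);
    if (fd >= 0) ::close(fd);
  }
};

int main(int argc, char **argv) {
  if (argc < 2) return 1;
  MMapedData data;
  if (!data.open_file(argv[1])) return 1;
  // Pages stay backed by the file cache: counted as virtual memory,
  // but only touched pages contribute to RSS.
  std::printf("mapped %zu bytes at %p\n", data.size, static_cast<const void *>(data.base));
  data.close_file();
  return 0;
}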

Conv vs DepthwiseConv

tensorflow

https://www.tensorflow.org/api_docs/python/tf/nn/conv2d

conv2d

Computes a 2-D convolution given input and 4-D filters tensors.

tf.nn.conv2d(
    input, filters, strides, padding, data_format='NHWC', dilations=None, name=None
)

input: batch_shape + [in_height, in_width, in_channels]
kernel: [filter_height, filter_width, in_channels, out_channels]

1. Flattens the filter to a 2-D matrix with shape [filter_height * filter_width * in_channels, output_channels].
2. Extracts image patches from the input tensor to form a virtual tensor of shape [batch, out_height, out_width, filter_height * filter_width * in_channels].
3. For each patch, right-multiplies the filter matrix and the image patch vector.

output[b, i, j, k] =
    sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
                    filter[di, dj, q, k]

depthwise conv2d

https://www.tensorflow.org/api_docs/python/tf/nn/depthwise_conv2d

tf.nn.depthwise_conv2d(
    input, filter, strides, padding, data_format=None, dilations=None, name=None
)

Given a 4D input tensor and a filter tensor of shape [filter_height, filter_width, in_channels, channel_multiplier] containing in_channels convolutional filters of depth 1, depthwise_conv2d applies a different filter to each input channel (expanding from 1 channel to channel_multiplier channels for each), then concatenates the results together. The output has in_channels * channel_multiplier channels.

output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
     filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
                                     strides[2] * j + rate[1] * dj, k]

nnapi

conv2d

The CONV_2D op sweeps a 2-D filter that can mix channels together over a batch of images, applying the filter to each window of each image of the appropriate size.

The output dimensions are functions of the filter dimensions, stride, and padding.

The values in the output tensor are computed as:

output[b, i, j, channel] =
    sum_{di, dj, k} (
        input[b, strides[1] * i + di, strides[2] * j + dj, k] *
        filter[channel, di, dj, k]
    ) + bias[channel]

depthwise conv2d

Given an input tensor of shape [batches, height, width, depth_in] and a filter tensor of shape [1, filter_height, filter_width, depth_out] containing depth_out convolutional filters of depth 1, DEPTHWISE_CONV applies a different filter to each input channel (expanding from 1 channel to channel_multiplier channels for each), then concatenates the results together.

The output has depth_out = depth_in * depth_multiplier channels. The output dimensions are functions of the filter dimensions, stride, and padding.

output[b, i, j, k * channel_multiplier + q] =
    sum_{di, dj} (
        input[b, strides[1] * i + di, strides[2] * j + dj, k] *
        filter[1, di, dj, k * channel_multiplier + q]
    ) + bias[k * channel_multiplier + q]
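
To make the difference concrete, naive NHWC reference loops for both ops (illustrative sketch only: VALID padding, no dilation, no bias; the filter layouts follow the TF docs above, not the NNAPI layouts):

// CONV_2D (NHWC): every output channel k mixes all input channels q.
// input [N, H, W, Cin], filter [Fh, Fw, Cin, Cout], output [N, Ho, Wo, Cout]
void Conv2DRef(const float *in, const float *flt, float *out,
               int N, int H, int W, int Cin, int Fh, int Fw, int Cout,
               int Ho, int Wo, int stride) {
  for (int b = 0; b < N; ++b)
    for (int i = 0; i < Ho; ++i)
      for (int j = 0; j < Wo; ++j)
        for (int k = 0; k < Cout; ++k) {
          float acc = 0.f;
          for (int di = 0; di < Fh; ++di)
            for (int dj = 0; dj < Fw; ++dj)
              for (int q = 0; q < Cin; ++q)
                acc += in[((b * H + stride * i + di) * W + stride * j + dj) * Cin + q] *
                       flt[((di * Fw + dj) * Cin + q) * Cout + k];
          out[((b * Ho + i) * Wo + j) * Cout + k] = acc;
        }
}

// DEPTHWISE_CONV_2D (NHWC): output channel k*M+q only ever sees input channel k.
// input [N, H, W, Cin], filter [Fh, Fw, Cin, M], output [N, Ho, Wo, Cin * M]
void DepthwiseConv2DRef(const float *in, const float *flt, float *out,
                        int N, int H, int W, int Cin, int Fh, int Fw, int M,
                        int Ho, int Wo, int stride) {
  for (int b = 0; b < N; ++b)
    for (int i = 0; i < Ho; ++i)
      for (int j = 0; j < Wo; ++j)
        for (int k = 0; k < Cin; ++k)
          for (int q = 0; q < M; ++q) {
            float acc = 0.f;
            for (int di = 0; di < Fh; ++di)
              for (int dj = 0; dj < Fw; ++dj)
                acc += in[((b * H + stride * i + di) * W + stride * j + dj) * Cin + k] *
                       flt[((di * Fw + dj) * Cin + k) * M + q];
            out[((b * Ho + i) * Wo + j) * (Cin * M) + k * M + q] = acc;
          }
}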

android

build

prebuilt hdf5: http://npuarchive.mooo.com/archive/hdf_android/hdf5-1.10.0-android-aarch64.tar.gz

build nnpackage_run for android

OPTIONS="-DDOWNLOAD_BOOST=ON -DBUILD_BOOST=ON -DBUILD_NNPACKAGE_RUN=ON -DBUILD_TFLITE_RUN=ON" \
BUILD_TYPE=Release TARGET_OS=android CROSS_BUILD=1 \
NDK_DIR=`pwd`/tools/cross/ndk/r20/ndk \
EXT_HDF5_DIR=`pwd`/hdf5_prebuilt \
make -f Makefile.template configure build install

What makes the difference between ONE and tflite on android

INFO: Found applicable config definition build:v2 in file /home/dragon/Works/github/tensorflow/.bazelrc: \
--define=tf_api_version=2 \
--action_env=TF2_BEHAVIOR=1

INFO: Found applicable config definition build:xla in file /home/dragon/Works/github/tensorflow/.bazelrc: \
--action_env=TF_ENABLE_XLA=1 \
--define=with_xla_support=true

INFO: Found applicable config definition build:android_arm64 in file /home/dragon/Works/github/tensorflow/.bazelrc: \
--config=android \
--cpu=arm64-v8a \
--fat_apk_cpu=arm64-v8a

INFO: Found applicable config definition build:android in file /home/dragon/Works/github/tensorflow/.bazelrc: \
--crosstool_top=//external:android/crosstool \
--host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
--noenable_platform_specific_config \
--copt=-w --cxxopt=-std=c++14 \
--host_cxxopt=-std=c++14

Tasks - Profiling

  • current onert's tracer - ChromeTracer #19 (see the trace-format sketch after this list)
  • atrace, ttrace, ...
  • tflite's profiling
  • LTTng == kprobes, Perf
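
For reference, a ChromeTracer-style output is just JSON in the Chrome trace-event format. A minimal sketch that writes one complete ("X") event loadable in chrome://tracing or Perfetto (not the onert ChromeTracer code):

#include <chrono>
#include <cstdio>

// Emit a JSON array of "X" (complete) events that chrome://tracing / Perfetto can load.
static int64_t NowUs() {
  using namespace std::chrono;
  return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
}

int main() {
  std::FILE *f = std::fopen("trace.json", "w");
  if (!f) return 1;
  std::fprintf(f, "[\n");
  int64_t t0 = NowUs();
  // ... run the thing being measured here ...
  int64_t t1 = NowUs();
  // name: event label, ph "X": complete event, ts/dur in microseconds, pid/tid: trace lanes.
  std::fprintf(f, "  {\"name\": \"Invoke\", \"ph\": \"X\", \"ts\": %lld, \"dur\": %lld,"
                  " \"pid\": 1, \"tid\": 1}\n",
               (long long)t0, (long long)(t1 - t0));
  std::fprintf(f, "]\n");
  std::fclose(f);
  return 0;
}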

Thinking about the onert_train process (steps)

Consider contributing to onert in order to train some model.

  • The point is...
  • doing it efficiently and systematically,
  • because ops and the like will keep being added -> this is the part that most needs to be recognized.
