yongseopkim / one
This project is forked from samsung/one.
On-device Neural Engine
License: Other
Let's profile
sudo dpkg-reconfigure ca-certificates
sudo update-ca-certificates
git config --global http.proxy $PROXY
git config --global https.proxy $PROXY
git config --system http.sslcainfo /etc/ssl/certs/SRnD_Web_Proxy_new.crt
git config --global https.sslVerify false
TODO
TBD
fc_hybrid
Operator 0: FULLY_CONNECTED
Fused Activation: NONE
Input Tensors[0, 41, 25]
Tensor 0 : buffer 43 | Empty | FLOAT32 | Memory 15.6K | Shape [4, 1000] (Placeholder_10)
Tensor 41 : buffer 14 | Filled | UINT8 | Memory 93.8K | Shape [96, 1000] (s_transformed/kernel/u/transpose)
Tensor 25 : buffer 1 | Filled | FLOAT32 | Memory 384.0B | Shape [96] (decoder_1/enc_ctx/Tensordot/MatMul_bias)
Output Tensors[36]
Tensor 36 : buffer 21 | Empty | FLOAT32 | Memory 1.5K | Shape [4, 96] (decoder_1/s_transformed/MatMul)
Operator 2: FULLY_CONNECTED
Fused Activation: NONE
Input Tensors[1, 26, 25]
Tensor 1 : buffer 23 | Empty | FLOAT32 | Memory 48.0K | Shape [4, 2, 1536] (Placeholder_9)
Tensor 26 : buffer 6 | Filled | UINT8 | Memory 144.0K | Shape [96, 1536] (decoder_1/enc_ctx/Tensordot/Reshape_1/transpose)
Tensor 25 : buffer 1 | Filled | FLOAT32 | Memory 384.0B | Shape [96] (decoder_1/enc_ctx/Tensordot/MatMul_bias)
Output Tensors[24]
Tensor 24 : buffer 13 | Empty | FLOAT32 | Memory 3.0K | Shape [8, 96] (decoder_1/enc_ctx/Tensordot/MatMul)
// CPU layout = NHWC (or NHW, HW for lower ranks)
// Y = aX + b
// output: h=8, w=96
// input: batch(n)=4, h=2, w=1536 (flattened to 8 rows of 1536)
// weight: h=96, w=1536 -> (later, inside ruy) h=1536, w=96
// bias: h=96
// [8,96] = [4,2,1536] * [96,1536] + [96]
// [8,96] = 4x[2,1536] * [1536,96] + [96]
// [8,96] = 4x[2,96] + [96]
// [8,96] = [8,96] + [96] = 8x[96] + [96]
// as ruy sees it: weight * input = output
// [96,1536] * [1536,8] = [96,8] (column-major [n_output, n_batch])
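To sanity-check the algebra, here is a minimal float reference of the same computation (my sketch; ReferenceFullyConnected is a hypothetical helper, not runtime code):

// sketch: out[b][u] = sum_i in[b][i] * w[u][i] + bias[u]
#include <cstddef>
#include <vector>

void ReferenceFullyConnected(const std::vector<float> &input,  // [batch, input_size]
                             const std::vector<float> &weight, // [num_units, input_size]
                             const std::vector<float> &bias,   // [num_units]
                             std::vector<float> &output,       // [batch, num_units]
                             size_t batch, size_t input_size, size_t num_units)
{
  for (size_t b = 0; b < batch; ++b)
    for (size_t u = 0; u < num_units; ++u)
    {
      float acc = bias[u];
      for (size_t i = 0; i < input_size; ++i)
        acc += input[b * input_size + i] * weight[u * input_size + i];
      output[b * num_units + u] = acc;
    }
}
// For the dump above: batch = 4 * 2 = 8, input_size = 1536, num_units = 96.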
// FullyConnectedLayer.cc
void FullyConnectedLayer::fullyConnectedHybrid() {
  // ...
  nnfw::cker::FullyConnectedHybrid(
      op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
      getTensorShape(_weights), reinterpret_cast<const int8_t *>(_weights->buffer()),
      getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
      getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena);
}
// FullyConnected.h
inline void FullyConnectedHybrid(const FullyConnectedParams &params,
                                 const Shape &input_shape, const float *input_data,
                                 const Shape &filter_shape, const int8_t *filter_data, // weights
                                 const Shape &, const float *bias_data,
                                 const Shape &output_shape, float *output_data,
                                 FCTempArena &temp_arena) {
  int total_input_size = input_shape.FlatSize();
  const int input_size = filter_shape.Dims(1);
  const int batch_size = total_input_size / input_size;
  const int num_units = filter_shape.Dims(0);
  // ...
  // Quantize input from float to int8 + quantization params (scaling factor).
  float *scaling_factors_ptr = temp_arena.scaling_factors.data();
  int8_t *quant_data = temp_arena.input_quantized.data();
  // ...
  auto output_size = output_shape.FlatSize();
  temp_arena.accum_scratch.resize(output_size);
  int32_t *scratch = temp_arena.accum_scratch.data();
  MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
                                      scaling_factors_ptr, batch_size, scratch, output_data,
                                      /*result_stride=*/1);
}
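The elided "// ..." before the GEMM is the hybrid trick: each input row is quantized to int8 with its own scaling factor. A minimal sketch of that step, modeled on TFLite's SymmetricQuantizeFloats (SymmetricQuantizeRow is my hypothetical name):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Called once per batch row; the row's scale goes into scaling_factors_ptr[b].
void SymmetricQuantizeRow(const float *values, int size, int8_t *quantized,
                          float *scaling_factor) {
  float max_abs = 0.f;
  for (int i = 0; i < size; ++i)
    max_abs = std::max(max_abs, std::fabs(values[i]));
  if (max_abs == 0.f) {
    *scaling_factor = 1.f;
    std::fill(quantized, quantized + size, 0);
    return;
  }
  const float kScale = 127.f;
  *scaling_factor = max_abs / kScale; // dequantize later as q * scaling_factor
  for (int i = 0; i < size; ++i) {
    int v = static_cast<int>(std::round(values[i] / *scaling_factor));
    quantized[i] = static_cast<int8_t>(std::min(127, std::max(-128, v)));
  }
}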
// NeonTensorUtils.h
void NeonMatrixBatchVectorMultiplyAccumulate(
    const int8_t *__restrict__ matrix,  // weights or filter_data
    const int m_rows,                   // rows of weight = num_units
    const int m_cols,                   // cols of weight = input_size
    const int8_t *__restrict__ vectors, // quant_data from inputs
    const float *scaling_factors,
    int n_batch,
    int32_t *scratch,                   // temp or intermediate output
    float *__restrict__ result,         // output_data
    int result_stride) {
  if (m_rows % 4 == 0 && result_stride == 1) {
    const int32_t *bias = static_cast<const int32_t *>(nullptr);
    NeonCpuBackendGemm(vectors, bias, matrix, n_batch, m_cols, m_rows,
                       /*output_zp=*/0, scratch);
    // ...
  }
}
// NeonTensorUtils.h
void NeonCpuBackendGemm(const int8_t *input, // quant_data (vectors) from inputs
                        const int32_t *bias,
                        const int8_t *input_to_gate_weights, // weights (filter_data, matrix)
                        int32_t n_batch,
                        int32_t n_input,  // cols of weights = input_size
                        int32_t n_output, // rows of weights = num_units
                        int32_t, int32_t *scratch) { // scratch = weight * input
  // ...
  // [n_output,n_batch] = [n_output,n_input] * [n_input,n_batch]
  ruy::Matrix<int8_t> ruy_lhs;  // weights, int8, RowMajor, [n_output,n_input] == [rows_of_weight, cols_of_weight]
  ruy::Matrix<int8_t> ruy_rhs;  // input, int8, ColMajor, [n_input,n_batch] == [cols_of_weight, batch]
  ruy::Matrix<int32_t> ruy_dst; // output, int32, ColMajor, [n_output,n_batch] == [rows_of_weight, batch]
  ruy_support::MakeRuyMatrix(lhs_params, input_to_gate_weights, &ruy_lhs);
  ruy_support::MakeRuyMatrix(rhs_params, input, &ruy_rhs);
  ruy_support::MakeRuyMatrix(dst_params, scratch, &ruy_dst);
  ruy::BasicSpec<int32_t, int32_t> ruy_spec;
  ruy_support::MakeRuySpec(gemm_params, &ruy_spec);
  constexpr ruy::Path kRuyPath = ruy::kAllPaths;
  ruy::Mul<kRuyPath>(ruy_lhs, ruy_rhs, ruy_spec, ruy_context, &ruy_dst);
}
// ruy/dispatch.h
template <...>
void DispatchMul(const Matrix<LhsScalar> &lhs, // [n_output,n_input], RowMajor, int8
                 const Matrix<RhsScalar> &rhs, // [n_input,n_batch], ColMajor, int8
                 const Spec &spec, Context *context,
                 Matrix<DstScalar> *dst) {     // [n_output,n_batch], ColMajor, int32
  // ...
  Matrix<LhsScalar> transposed_lhs(lhs);
  Transpose(&transposed_lhs); // [n_input,n_output], ColMajor, int8
  TrMulParams params;
  CreateTrMulParams<TrMulCompiledPaths>(transposed_lhs, rhs, spec, dst,
                                        the_path, &params);
  SidePair<bool> cacheable(lhs.cacheable, rhs.cacheable);
  HandlePrepackedCaching(&params, cacheable, context);
  TrMul(&params, context);
}
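Why Transpose(&transposed_lhs)? As I read ruy, TrMul consumes both sides column-major and computes dst = transpose(lhs_t) * rhs. A naive reference of that contract (my sketch, not ruy code):

#include <cstdint>

void ReferenceTrMul(const int8_t *lhs_t, // [depth, rows] col-major ("transposed lhs")
                    const int8_t *rhs,   // [depth, cols] col-major
                    int32_t *dst,        // [rows, cols] col-major
                    int rows, int cols, int depth) {
  for (int c = 0; c < cols; ++c)
    for (int r = 0; r < rows; ++r) {
      int32_t acc = 0;
      for (int k = 0; k < depth; ++k)
        acc += static_cast<int32_t>(lhs_t[r * depth + k]) * rhs[c * depth + k];
      dst[c * rows + r] = acc;
    }
}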
// ruy/dispatch.h
inline void HandlePrepackedCaching(TrMulParams* params,
                                   const SidePair<bool>& cacheable,
                                   Context* context) {
  if (context->cache_policy == CachePolicy::kNoCache) {
    return;
  }
  if (context->cache_policy == CachePolicy::kCacheLHSOnNarrowMul) {
    // dst: [n_output,n_batch] -> n_batch
    if (!cacheable[Side::kLhs] || params->dst.layout.cols > 4) {
      return;
    }
    PrepackedCache* prepacked_cache = context->GetPrepackedCache();
    // cache_key is the data address!!!
    auto cache_key = std::make_pair(reinterpret_cast<void*>(params->run_kernel),
                                    params->src[Side::kLhs].data);
    auto it = prepacked_cache->FindAndUpdate(cache_key);
    if (it != prepacked_cache->cend()) {
      params->packed[Side::kLhs].data = it->second.first.data;
      params->packed[Side::kLhs].sums = it->second.first.sums;
      params->is_prepacked[Side::kLhs] = true;
      return;
    }
    // Allocate the prepacked matrix.
    PrepackedMatrix prepacked_lhs;
    prepacked_lhs.data_size = DataSize(params->packed[Side::kLhs]);
    prepacked_lhs.sums_size = SumsSize(params->packed[Side::kLhs]);
    prepacked_cache->AllocatePrepackedMatrix(&prepacked_lhs);
    params->packed[Side::kLhs].data = prepacked_lhs.data;
    params->packed[Side::kLhs].sums = prepacked_lhs.sums;
    params->is_prepacked[Side::kLhs] = true;
    Tuning tuning = context->GetMainThreadTuning();
    params->RunPack(Side::kLhs, tuning, 0,
                    params->packed[Side::kLhs].layout.cols);
    prepacked_cache->Insert(cache_key, prepacked_lhs);
    return;
  }
}
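Note the "!!!" comment: the cache key pairs the kernel function pointer with the raw lhs data address. A toy example (hypothetical, not ruy code) of why a pointer-keyed cache goes stale once the original buffer is freed:

#include <cstdio>
#include <cstdlib>
#include <map>

int main() {
  std::map<const void *, int> cache; // key: weight data address
  void *weights = std::malloc(1024);
  cache[weights] = 42; // prepack result for these weights
  std::free(weights);  // runtime frees the original weights (ref count hit 0)
  void *other = std::malloc(1024); // allocator may hand the same address back
  if (cache.count(other))
    std::printf("stale hit for %p\n", other); // looks cached, but it's garbage
  return 0;
}

Keep this in mind for the segfault below.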
How does Bazel work?
When running a build or a test, Bazel does the following:
Loads the BUILD files relevant to the target.
Analyzes the inputs and their dependencies, applies the specified build rules, and produces an action graph.
Executes the build actions on the inputs until the final build outputs are produced.
Since all previous build work is cached, Bazel can identify and reuse cached artifacts and only rebuild or retest what’s changed. To further enforce correctness, you can set up Bazel to run builds and tests hermetically through sandboxing, minimizing skew and maximizing reproducibility.
https://docs.bazel.build/versions/master/build-ref.html
In a repository - WORKSPACE
https://docs.bazel.build/versions/master/platforms-intro.html#android
Android
Bazel’s Android rules do not yet support platforms to select Android toolchains.
They do support setting --platforms to select NDK toolchains: see here.
Most importantly, --fat_apk_cpu, which builds multi-architecture fat APKs, does not work with platform-enabled C++. This is because it sets legacy flags like --cpu and --crosstool_top, which platform-enabled C++ rules don’t read. Until this is migrated, using --fat_apk_cpu with --platforms requires platform mappings.
https://docs.bazel.build/versions/master/android-ndk.html#integration-with-platforms-and-toolchains
.bazelrc
https://docs.bazel.build/versions/master/guide.html#bazelrc-the-bazel-configuration-file
odroid@odroid:/home/dragon/Works/github/ONE$ USE_NNAPI=1 BACKENDS="cpu" ./Product/out/bin/tflite_run ./nnpkg_asr_models_multi_threaded/dec2/dec2.tflite -w0 -r2 -m0
nnapi function 'ANeuralNetworksModel_create' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
nnapi function 'ANeuralNetworksModel_addOperand' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
nnapi function 'ANeuralNetworksModel_setOperandValue' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
nnapi function 'ANeuralNetworksModel_addOperation' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
nnapi function 'ANeuralNetworksModel_identifyInputsAndOutputs' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
nnapi function 'ANeuralNetworksModel_finish' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
nnapi function 'ANeuralNetworksCompilation_create' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
nnapi function 'ANeuralNetworksCompilation_finish' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x752e58
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x752e58 hybrid
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x752e58 batch_size > 4
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x74c858
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x74c858 hybrid
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x74c858 batch_size > 4
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x752570
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x752570 hybrid
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x752570 cached: 0x754a28
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x753080
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x753080 hybrid
virtual void onert::backend::cpu::ops::FullyConnectedLayer::prepare() 0x753080 cached: 0xb541f008
input tensor indices = [1,0,]
nnapi function 'ANeuralNetworksExecution_create' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
nnapi function 'ANeuralNetworksExecution_setInput' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
nnapi function 'ANeuralNetworksExecution_setOutput' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
nnapi function 'ANeuralNetworksExecution_startCompute' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
nnapi function 'ANeuralNetworksEvent_wait' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 hybrid cache: 0x754a28
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 hybrid cache: 0x754a28 is done
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 tensor->decrease_ref() will run
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 tensor->ref is 0
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 tensor->decrease_ref() is done
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 hybrid cache: 0xb541f008
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 hybrid cache: 0xb541f008 is done
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 tensor->decrease_ref() will run
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 tensor->ref is 0
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 tensor->decrease_ref() is done
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752e58
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752e58 hybrid cache: (nil)
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752e58 hybrid cache: (nil) is done
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752e58 cached weight is nullptr
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x74c858
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x74c858 hybrid cache: (nil)
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x74c858 hybrid cache: (nil) is done
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x74c858 cached weight is nullptr
nnapi function 'ANeuralNetworksEvent_free' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
nnapi function 'ANeuralNetworksExecution_free' is loaded from '/home/dragon/Works/github/ONE/Product/armv7l-linux.debug/out/bin/../lib/libneuralnetworks.so'
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 hybrid cache: 0x754a28
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 hybrid cache: 0x754a28 is done
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x752570 weight is already freed
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080
void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid() 0x753080 hybrid cache: 0xb541f008
Segmentation fault
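Reading the log, this looks like a use-after-free: during the first inference each hybrid FC frees its weight tensor once tensor->ref reaches 0, while ruy's prepacked cache still keys entries on those weight data addresses (see HandlePrepackedCaching above). On the second inference 0x752570 notices its weight is already freed and survives, but 0x753080 goes through the stale 0xb541f008 entry and the process crashes.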
Let's investigate
sudo dpkg-reconfigure tzdata
sudo apt-get install ntp
[host]
mkdir ~/nfs
chmod 777 ~/nfs
sudo vi /etc/exports
/home/dragon/nfs 10.113.76.107(rw,all_squash,async,no_subtree_check) # no space between host and options; a space exports to the world
service nfs-kernel-server restart
service rpcbind restart
[client]
mkdir -p nfs
mkdir -p nnfw
mount -t nfs 10.113.79.41:/home/dragon/NNFW/nfs nfs
mount -t nfs 10.113.79.41:/home/dragon/NNFW/nnfw nnfw
/etc/fstab
10.113.79.41:/home/dragon/NNFW/nfs /root/nfs nfs auto,nofail,noatime,nolock,intr,tcp,actimeo=1800 0 0
10.113.79.41:/home/dragon/NNFW/nnfw /root/nnfw nfs auto,nofail,noatime,nolock,intr,tcp,actimeo=1800 0 0
mount -a
root@odroid:~# mount -a
mount.nfs: requested NFS version or transport protocol is not supported
check host's nfs server
dragon@loki:~/Works/github/ONE$ sudo systemctl status nfs-kernel-server
● nfs-server.service - NFS server and services
Loaded: loaded (/lib/systemd/system/nfs-server.service; enabled; vendor preset: enabled)
Active: failed (Result: exit-code) since Fri 2020-07-24 15:21:29 KST; 42s ago
Process: 15890 ExecStopPost=/usr/sbin/exportfs -f (code=exited, status=0/SUCCESS)
Process: 15889 ExecStopPost=/usr/sbin/exportfs -au (code=exited, status=0/SUCCESS)
Process: 15888 ExecStartPre=/usr/sbin/exportfs -r (code=exited, status=1/FAILURE)
7월 24 15:21:29 loki exportfs[15888]: exportfs: /etc/exports [2]: Neither 'subtree_check' or 'no_subtree_check' specified for export "10.113.221.160:/home/dragon/Works/armnn-tflite".
7월 24 15:21:29 loki exportfs[15888]: Assuming default behaviour ('no_subtree_check').
7월 24 15:21:29 loki exportfs[15888]: NOTE: this default has changed since nfs-utils version 1.0.x
7월 24 15:21:29 loki exportfs[15888]: exportfs: /etc/exports [3]: Neither 'subtree_check' or 'no_subtree_check' specified for export "10.113.221.160:/home/dragon/Works/github/ONE".
7월 24 15:21:29 loki exportfs[15888]: Assuming default behaviour ('no_subtree_check').
7월 24 15:21:29 loki exportfs[15888]: NOTE: this default has changed since nfs-utils version 1.0.x
7월 24 15:21:29 loki exportfs[15888]: exportfs: Failed to stat /home/dragon/Works/github.sec/nnfw: No such file or directory
7월 24 15:21:29 loki systemd[1]: nfs-server.service: Control process exited, code=exited status=1
7월 24 15:21:29 loki systemd[1]: nfs-server.service: Failed with result 'exit-code'.
7월 24 15:21:29 loki systemd[1]: Stopped NFS server and services.
dragon@loki:~/Works/github/ONE$ ls ../../github.sec
flatbuffers microbixby_e2easr NE10 nnfw_ci ODIN-ASR rss.sh tensorflow tensorflow_asr tflite_run tv_utils
dragon@loki:~/Works/github/ONE$ sudo vi /etc/exports # do something
dragon@loki:~/Works/github/ONE$ sudo service nfs-kernel-server restart
BUILD_TYPE=Release TARGET_ARCH=armv7l CROSS_BUILD=1 make -f Makefile.template configure build install
OPTIONS="-DEXPERIMENTAL_RUY_FEATURE=1" \
BUILD_TYPE=Release TARGET_ARCH=armv7l CROSS_BUILD=1 make -f Makefile.template configure build install
master: https://github.com/YongseopKim/ONE/tree/pr/benchmark_additional_info
draft: https://github.com/YongseopKim/ONE/tree/test/use_external_data_pulled
http://core-analyzer.sourceforge.net/index_files/Page335.html
The main arena calls sbrk(), which usually starts immediately after the executable’s data section. A dynamic arena, on the other hand, uses mmap() with a fixed size, e.g. 64MB, and ensures the starting address is aligned on a multiple of that size. If the first heap is used up, a new fixed-size heap is mmap-ed and linked to the dynamic arena’s heap data "struct malloc_state".
If the user’s request exceeds a threshold, e.g. 128KB (which may be changed dynamically), ptmalloc calls mmap() to allocate the memory and munmap() when it is freed. This class of memory blocks is not linked to any heap data.
A non-main thread's stack is also allocated by mmap(), so it is measured as heap by valgrind --tool=massif --pages-as-heap=yes.
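To see the threshold behavior directly (a sketch; mallopt/malloc_stats are glibc-specific, and the 128KB default varies by version):

#include <cstdlib>
#include <malloc.h>

int main() {
  mallopt(M_MMAP_THRESHOLD, 64 * 1024); // requests >= 64 KB now go to mmap()
  void *small = std::malloc(4 * 1024);   // served from an arena (sbrk/heap)
  void *large = std::malloc(256 * 1024); // served by mmap(), munmap()ed on free
  malloc_stats(); // prints arena vs mmap usage to stderr
  std::free(small);
  std::free(large);
  return 0;
}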
Makefile
-> ./nnfw {configure|build} # if configure
-> infra/nnfw/command/configure
-> cmake infra/nnfw
-> infra/nnfw/CMakeLists.txt
infra/nnfw/CMakeLists.txt
examples:
compute/cker/CMakeLists.txt
runtime/libs/tflite/CMakeLists.txt
compiler/nnkit-tflite/CMakeLists.txt
The debuginfo packages provide debugging information needed to provide human-readable names for binary code features. These packages contain .debug files, which contain DWARF debugging information. These files are installed to the /usr/lib/debug directory.
The debugsource packages contain the source files used for compiling the binary code. With both respective debuginfo and debugsource package installed, debuggers such as GDB or LLDB can relate the execution of binary code to the source code. The source code files are installed to the /usr/src/debug directory.
Linux Standard Base Specification
ld: linker
vdso & vvar: https://lwn.net/Articles/615809/
vectors
Efficient PyTorch: Tensor Memory Format Matters
valgrind + cachegrind
memory format
google benchmark https://github.com/google/benchmark
https://gef.readthedocs.io/en/master/
cd odin_asr_data; gdb -ex 'source ./.gdbinit' --args OnDeviceE2EASR_tizen ./OnDeviceE2EASR.ko-KR.conf ./general.scp /tmp/log.log0 0 0
sh-3.2# cat .gdbinit
source ~/peda/peda.py
source ~/Pwngdb/pwngdb.py
source ~/Pwngdb/angelheap/gdbinit.py
define hook-run
python
import angelheap
angelheap.init_angelheap()
end
end
info threads
thread <tid>
thread apply all bt
break thread
b ruy::Pack8bitNeonOutOfOrder4Cols LWP 21869 if bartab > lim
{
  // See https://go.microsoft.com/fwlink/?LinkId=733558
  // for the documentation about the tasks.json format
  "version": "2.0.0",
  "tasks": [
    {
      "label": "nncc configure",
      "type": "shell",
      "command": "${workspaceFolder}/nncc",
      "args": ["configure"]
    },
    {
      "label": "nncc build",
      "type": "shell",
      "command": "${workspaceFolder}/nncc",
      "args": ["build"],
      "dependsOn": ["nncc configure"]
    }
  ]
}
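Run these from VS Code via Terminal > Run Task; thanks to dependsOn, running "nncc build" triggers "nncc configure" first.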
TBD
$ python3 -c "import tensorflow as tf; print('TensorFlow version:', tf.__version__)"
...
TensorFlow version: 2.15.0
What -DTFLITE_WITH_RUY_GEMV means
sudo apt install valgrind
sudo apt install massif-visualizer
BACKENDS=cpu valgrind --tool=massif --pages-as-heap=no --detailed-freq=1 ./Product/out/bin/nnpackage_run benchmark_nnpkg_models/inception_v3
valgrind --tool=massif --pages-as-heap=yes --detailed-freq=1 ./Product/out/bin/tflite_run benchmark_nnpkg_models/inception_v3/inception_v3.tflite
options
--quiet --massif-out-file=<file>
--max-snapshots=1000
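Inspect the results with ms_print massif.out.<pid>, or open the file in massif-visualizer.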
BUILD_TYPE=Release TARGET_ARCH=armv7l CROSS_BUILD=1 make -f Makefile.template configure build install
OPTIONS="-DDOWNLOAD_BOOST=ON -DBUILD_BOOST=ON -DBUILD_NNPACKAGE_RUN=ON -DBUILD_TFLITE_RUN=ON" \
BUILD_TYPE=Release TARGET_OS=android CROSS_BUILD=1 \
NDK_DIR=`pwd`/tools/cross/ndk/r20/ndk \
EXT_HDF5_DIR=`pwd`/hdf5_prebuilt \
make -f Makefile.template configure build install
BUILD_TYPE=Release make -f Makefile.template configure build install
Install git sdk on Windows https://github.com/git-for-windows/build-extra/releases
Install cmake on Windows
v3.16.3 works: https://github.com/Kitware/CMake/releases?page=7
v3.22 doesn't work: https://cmake.org/download/
NNAS_BUILD_PREFIX=build ./nnas create-package --preset 20210910_windows --prefix install 2>&1 | tee log3
NNAS_BUILD_PREFIX=build/release \
NNCC_WORKSPACE=build/release \
BUILD_TYPE=Release \
NPROC=4 \
./nnas create-package --preset 20220323 --prefix build/install
CLOG?
ViT(Vision Transformer)
MSA(Multi-head Self Attention)
Relevant issue: #7
$ file ./Product/out/bin/onert_train
./Product/out/bin/onert_train: ELF 64-bit LSB pie executable, x86-64, version 1 (GNU/Linux), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=7481670079cd705859d016d652d5e9dbd17c42b6, for GNU/Linux 3.2.0, with debug_info, not stripped
$ ./Product/out/bin/onert_train
E: Require one of options modelfile, nnpackage, or path.
$ ./Product/out/bin/onert_train --help
onert_train
Usage: ./Product/out/bin/onert_train[model path] [<options>]
General options:
-h [ --help ] Print available options
--version Print version and exit immediately
--nnpackage arg NN Package file(directory) name
--modelfile arg NN Model filename
--path arg NN Package or NN Modelfile path
--export_path arg Path to export circle
--load_input:raw arg NN Model Raw Input data file
The datafile must have data for each input number.
If there are 3 inputs, the data of input0 must exist as
much as data_length, and the data for input1 and input2
must be held sequentially as data_length.
--load_expected:raw arg NN Model Raw Expected data file
(Same data policy with load_input:raw)
-m [ --mem_poll ] arg (=0) Check memory polling
--epoch arg (=5) Epoch number (default: 5)
--batch_size arg (=32) Batch size (default: 32)
--learning_rate arg (=0.00100000005) Learning rate (default: 0.001)
--loss arg (=0) Loss type
0: MEAN_SQUARED_ERROR (default)
1: CATEGORICAL_CROSSENTROPY
--loss_reduction_type arg (=0) Loss Reduction type
0: AUTO (default)
1: SUM_OVER_BATCH_SIZE
2: SUM
--optimizer arg (=0) Optimizer type
0: SGD (default)
1: Adam
--metric arg (=-1) Metricy type
Simply calculates the metric value using the variables
(default: none)
0: CATEGORICAL_ACCURACY
--validation_split arg (=0) Float between 0 and 1. Fraction of the training data to be
used as validation data.
-v [ --verbose_level ] arg (=0) Verbose level
0: prints the only result. Messages btw run don't print
1: prints result and message btw run
2: prints all of messages to print
--output_sizes arg The output buffer size in JSON 1D array
If not given, the model's output sizes are used
e.g. '[0, 40, 2, 80]' to set 0th tensor to 40 and 2nd
tensor to 80.
$ ./Product/out/bin/onert_train --version
onert_train (nnfw runtime: v1.27.0)
let's fill
I've forgotten a lot, so let's take another look.
Revisit
ViT
karpathy
nnapi
BACKENDS="cpu" ./Product/out/unittest/nnapi_gtest --gtest_filter=-$(grep -v '#' "./Product/out/unittest/nnapi_gtest.skip.armv7l-linux.cpu" | tr '\n' ':')
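Here grep -v '#' drops comment lines from the skip file and tr '\n' ':' joins the remaining test names into one colon-separated pattern; the leading '-' in --gtest_filter turns the whole list into an exclusion.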
tensorflow/lite/tools/benchmark/
params.AddParam("run_delay", BenchmarkParam::Create<float>(-1.0f));
CreateFlag<float>("run_delay", &params_, "delay between runs in seconds"),
// benchmark_model.cc
Stat<int64_t> BenchmarkModel::Run(int min_num_times, float min_secs,
                                  float max_secs, RunType run_type,
                                  TfLiteStatus *invoke_status) {
  // ...
  for (int run = 0; (run < min_num_times || now_us < min_finish_us) &&
                    now_us <= max_finish_us;
       run++) {
    ResetInputsAndOutputs();
    listeners_.OnSingleRunStart(run_type);
    int64_t start_us = profiling::time::NowMicros();
    TfLiteStatus status = RunImpl();
    int64_t end_us = profiling::time::NowMicros();
    listeners_.OnSingleRunEnd();
    run_stats.UpdateStat(end_us - start_us);
    util::SleepForSeconds(params_.Get<float>("run_delay")); // HERE
    // ...
  }
  // ...
}
// benchmark_utils.h
// A convenient function that wraps tflite::profiling::time::SleepForMicros and
// simply return if 'sleep_seconds' is negative.
void SleepForSeconds(double sleep_seconds);
// benchmark_utils.cc
void SleepForSeconds(double sleep_seconds) {
  if (sleep_seconds <= 0.0) {
    return;
  }
  // If requested, sleep between runs for an arbitrary amount of time.
  // This can be helpful to determine the effect of mobile processor
  // scaling and thermal throttling.
  tflite::profiling::time::SleepForMicros(
      static_cast<uint64_t>(sleep_seconds * 1e6));
}
binary
tools/benchmark/BUILD
"benchmark_model_main"
"benchmark_model"
"benchmark_model_performance_options"
...
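So passing e.g. --run_delay=1.0 to benchmark_model sleeps one second between runs, which helps separate CPU frequency scaling and thermal throttling from the model's own latency.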
ITensor: has uint8_t *_buffer
cpu_common::Tensor // const_tensor or io_tensor
Inheriting from IPortableTensor means no padding; the cpu backend supports only the NHWC layout.
ITensorRegistry: has ITensor
ITensorBuilder
cpu_common::DynamicTensorManager -> IDynamicTensorManager # for dynamic tensor
cpu_common::StaticTensorManager -> ITensorManager # for static tensor
cpu_common::Allocator
https://www.tensorflow.org/api_docs/python/tf/nn/conv2d
Computes a 2-D convolution given 4-D input and filters tensors.
tf.nn.conv2d(
input, filters, strides, padding, data_format='NHWC', dilations=None, name=None
)
input: batch_shape + [in_height, in_width, in_channels]
kernel: [filter_height, filter_width, in_channels, out_channels]
1. Flattens the filter to a 2-D matrix with shape [filter_height * filter_width * in_channels, output_channels].
2. Extracts image patches from the input tensor to form a virtual tensor of shape [batch, out_height, out_width, filter_height * filter_width * in_channels].
3. For each patch, right-multiplies the filter matrix and the image patch vector.
output[b, i, j, k] =
sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
filter[di, dj, q, k]
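A direct transcription of that sum into C++ (my sketch: NHWC input, HWIO filter, VALID padding, dilation 1; NaiveConv2D is a hypothetical name):

#include <cstddef>
#include <vector>

// output[b][i][j][k] = sum_{di,dj,q} input[b][i*sh+di][j*sw+dj][q] * filter[di][dj][q][k]
void NaiveConv2D(const std::vector<float> &in, int batch, int in_h, int in_w, int in_c,
                 const std::vector<float> &filter, int f_h, int f_w, int out_c,
                 int stride_h, int stride_w, std::vector<float> &out) {
  const int out_h = (in_h - f_h) / stride_h + 1; // VALID padding
  const int out_w = (in_w - f_w) / stride_w + 1;
  out.assign(static_cast<size_t>(batch) * out_h * out_w * out_c, 0.f);
  for (int b = 0; b < batch; ++b)
    for (int i = 0; i < out_h; ++i)
      for (int j = 0; j < out_w; ++j)
        for (int k = 0; k < out_c; ++k) {
          float acc = 0.f;
          for (int di = 0; di < f_h; ++di)
            for (int dj = 0; dj < f_w; ++dj)
              for (int q = 0; q < in_c; ++q)
                acc += in[((b * in_h + i * stride_h + di) * in_w + j * stride_w + dj) * in_c + q] *
                       filter[((di * f_w + dj) * in_c + q) * out_c + k];
          out[((b * out_h + i) * out_w + j) * out_c + k] = acc;
        }
}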
https://www.tensorflow.org/api_docs/python/tf/nn/depthwise_conv2d
tf.nn.depthwise_conv2d(
input, filter, strides, padding, data_format=None, dilations=None, name=None
)
Given a filter tensor of shape [filter_height, filter_width, in_channels, channel_multiplier] containing in_channels convolutional filters of depth 1, depthwise_conv2d applies a different filter to each input channel (expanding from 1 channel to channel_multiplier channels for each), then concatenates the results together. The output has in_channels * channel_multiplier channels.
output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
strides[2] * j + rate[1] * dj, k]
The CONV_2D op sweeps a 2-D filter that can mix channels together over a batch of images, applying the filter to each window of each image of the appropriate size.
The output dimensions are functions of the filter dimensions, stride, and padding.
The values in the output tensor are computed as:
output[b, i, j, channel] =
sum_{di, dj, k} (
input[b, strides[1] * i + di, strides[2] * j + dj, k] *
filter[channel, di, dj, k]
) + bias[channel]
Given an input tensor of shape [batches, height, width, depth_in] and a filter tensor of shape [1, filter_height, filter_width, depth_out] containing depth_out convolutional filters of depth 1, DEPTHWISE_CONV applies a different filter to each input channel (expanding from 1 channel to channel_multiplier channels for each), then concatenates the results together.
The output has depth_out = depth_in * depth_multiplier channels. The output dimensions are functions of the filter dimensions, stride, and padding.
output[b, i, j, k * channel_multiplier + q] =
sum_{di, dj} (
input[b, strides[1] * i + di, strides[2] * j + dj, k] *
filter[1, di, dj, k * channel_multiplier + q]
) + bias[k * channel_multiplier + q]
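And the DEPTHWISE_CONV formula transcribed the same way (sketch: NHWC input, filter laid out [1, f_h, f_w, in_c * mult], VALID padding; NaiveDepthwiseConv2D is a hypothetical name):

#include <cstddef>
#include <vector>

// output[b][i][j][k*mult+q] = sum_{di,dj} input[b][i*sh+di][j*sw+dj][k]
//                             * filter[0][di][dj][k*mult+q] + bias[k*mult+q]
void NaiveDepthwiseConv2D(const std::vector<float> &in, int batch, int in_h, int in_w, int in_c,
                          const std::vector<float> &filter, int f_h, int f_w, int mult,
                          const std::vector<float> &bias, int stride_h, int stride_w,
                          std::vector<float> &out) {
  const int out_h = (in_h - f_h) / stride_h + 1; // VALID padding
  const int out_w = (in_w - f_w) / stride_w + 1;
  const int out_c = in_c * mult; // depth_out = depth_in * depth_multiplier
  out.assign(static_cast<size_t>(batch) * out_h * out_w * out_c, 0.f);
  for (int b = 0; b < batch; ++b)
    for (int i = 0; i < out_h; ++i)
      for (int j = 0; j < out_w; ++j)
        for (int k = 0; k < in_c; ++k)
          for (int q = 0; q < mult; ++q) {
            float acc = bias[k * mult + q];
            for (int di = 0; di < f_h; ++di)
              for (int dj = 0; dj < f_w; ++dj)
                acc += in[((b * in_h + i * stride_h + di) * in_w + j * stride_w + dj) * in_c + k] *
                       filter[((di * f_w + dj) * in_c + k) * mult + q];
            out[((b * out_h + i) * out_w + j) * out_c + k * mult + q] = acc;
          }
}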
prebuilt hdf5: http://npuarchive.mooo.com/archive/hdf_android/hdf5-1.10.0-android-aarch64.tar.gz
build nnpackage_run for android
OPTIONS="-DDOWNLOAD_BOOST=ON -DBUILD_BOOST=ON -DBUILD_NNPACKAGE_RUN=ON -DBUILD_TFLITE_RUN=ON" \
BUILD_TYPE=Release TARGET_OS=android CROSS_BUILD=1 \
NDK_DIR=`pwd`/tools/cross/ndk/r20/ndk \
EXT_HDF5_DIR=`pwd`/hdf5_prebuilt \
make -f Makefile.template configure build install
INFO: Found applicable config definition build:v2 in file /home/dragon/Works/github/tensorflow/.bazelrc: \
--define=tf_api_version=2 \
--action_env=TF2_BEHAVIOR=1
INFO: Found applicable config definition build:xla in file /home/dragon/Works/github/tensorflow/.bazelrc: \
--action_env=TF_ENABLE_XLA=1 \
--define=with_xla_support=true
INFO: Found applicable config definition build:android_arm64 in file /home/dragon/Works/github/tensorflow/.bazelrc: \
--config=android \
--cpu=arm64-v8a \
--fat_apk_cpu=arm64-v8a
INFO: Found applicable config definition build:android in file /home/dragon/Works/github/tensorflow/.bazelrc: \
--crosstool_top=//external:android/crosstool \
--host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
--noenable_platform_specific_config \
--copt=-w --cxxopt=-std=c++14 \
--host_cxxopt=-std=c++14
https://pip.pypa.io/en/stable/topics/configuration/
[global]
trusted-host = pypi.org
files.pythonhosted.org
pypi.python.org
Let's think about contributing to onert so that it can train a model.