mlverse / torch Goto Github PK
View Code? Open in Web Editor NEWR Interface to Torch
Home Page: https://torch.mlverse.org
License: Other
R Interface to Torch
Home Page: https://torch.mlverse.org
License: Other
#48 implemented basic datasets support. In the end we have something like this:
# Minimal example of the current dataset / data loader API.
# Two random tensors with 1000 rows each serve as inputs (x) and targets (y).
x <- torch_randn(1000, 100)
y <- torch_randn(1000, 1)
# Combine both tensors into a single dataset object.
dataset <- utils_dataset_tensor(x, y)
# Data loader over that dataset with a batch size of 32.
dl <- DataLoader$new(dataset = dataset, batch_size = 32)
# Iteration is explicit: create an iterator, then ask it for the next element
# (presumably the first batch of x/y -- confirm against the DataLoader code).
iter <- dl$.iter()
b <- iter$.next()
In python, you can easily iterate over a class that has an __iter__
method, so the API looks like just:
for x, y in dl:
...
We need to think about what this API should look like in R.
Would be great if nnf_nll_loss would have a default value for ignore_index
https://github.com/mlverse/torch/blob/master/R/nnf-loss.R#L343
e.g., these should be shifted by one:
> x <- torch_arange(0, 4)$view(c(2,2))
> x
torch_tensor
0 1
2 3
[ CPUFloatType{2,2} ]
> x$sum(dim = 0)
torch_tensor
2
4
[ CPUFloatType{2} ]
> x$sum(dim = 1)
torch_tensor
1
5
[ CPUFloatType{2} ]
> t <- torch_tensor(2, device = torch_device("cuda"))
> t$to(torch_device("cpu"))
Error: cpp_torch_method_to_self_Tensor_device_Device does not exist
Run `rlang::last_error()` to see where the error occurred.
7.
stop(fallback)
6.
signal_abort(cnd)
5.
rlang::abort(glue::glue(..., .envir = env), class = "value_error") at conditions.R#2
4.
value_error("{fun_name} does not exist") at codegen-utils.R#196
3.
call_c_function(fun_name = "to", args = args, expected_types = expected_types,
nd_args = nd_args, return_types = return_types, fun_type = "method") at gen-method.R#4556
2.
o(x, x$private, ...) at R7.R#69
1.
t$to(torch_device("cpu"))
I've been trying to reproduce this https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
Using the development version of the sentencepiece package to get embeddings
That seems to work. What is still missing before I can build this properly is functionality to increase the batch_size. For this I think what is needed is pack_padded_sequence / pad_packed_sequence?
Any chance to put this on your timeline?
library(sentencepiece) #remotes::install_github("bnosac/sentencepiece")
library(word2vec) # from CRAN
library(torch)
## Download the English sentencepiece model (vocabulary size 25000, embedding
## dimension 25) into the current working directory, then load it together
## with its word2vec-format GloVe embeddings.
spm_files <- sentencepiece_download_model(
  "english",
  vocab_size = 25000,
  dim = 25,
  model_dir = getwd()
)
bpemb <- BPEembed(
  file_sentencepiece = spm_files$file_model,
  file_word2vec = spm_files$glove.bin$file_model
)
##
## DOCS AT https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
##
## Token tagger: averaged subword embeddings -> bidirectional LSTM -> linear
## layer -> log-softmax scores, one row per token.
ner_bilstm <- nn_module(
  classname = "ner_bilstm",
  ## Store the embedding model and the layer sizes, then create the layers.
  ##   bpemb        embedding model; its `dim` element is used as the LSTM input size
  ##   tagset_size  number of labels to predict
  ##   hidden_size  hidden size of the LSTM (per direction)
  ##   num_layers   number of stacked LSTM layers
  ##   seq_len      stored on the module; not used by forward() at the moment
  initialize = function(bpemb, tagset_size, hidden_size = 32, num_layers = 1, seq_len = 100) {
    self$bpemb <- bpemb
    self$embedding_dim <- bpemb$dim
    self$seq_len <- seq_len
    self$hidden_size <- as.integer(hidden_size)
    self$num_layers <- as.integer(num_layers)
    self$tagset_size <- as.integer(tagset_size)
    ## Recurrent layer
    self$rnn <- nn_rnn(
      mode = "LSTM",
      input_size = self$embedding_dim,
      hidden_size = self$hidden_size,
      num_layers = self$num_layers,
      dropout = 0,
      bidirectional = TRUE,
      nonlinearity = "tanh",
      batch_first = TRUE
    )
    ## Linear layer from the LSTM features (two directions, hence * 2) to tag space
    self$linear <- nn_linear(
      in_features = hidden_size * 2,
      out_features = tagset_size,
      bias = TRUE
    )
  },
  ## Score every token of one token sequence.
  ##   tokensequence  the tokens of a single document
  ## Returns the log-softmax output with one row per token and one column per label.
  forward = function(tokensequence) {
    ## The implementation handles exactly one token sequence per call
    batch_size <- 1L
    ## sentencepiece splits each token into subwords; a token is represented
    ## by the average embedding of its subwords
    subword_emb <- predict(self$bpemb, newdata = tokensequence, type = "encode")
    token_emb <- do.call(rbind, lapply(subword_emb, colMeans))
    ## No zero-padding up to seq_len is applied: the sequence keeps its own length
    n_tokens <- nrow(token_emb)
    ## Float tensor laid out as (batch, token, embedding dimension)
    inputs <- torch_tensor(token_emb, dtype = torch_float())
    inputs <- torch_reshape(inputs, list(batch_size, -1, self$embedding_dim))
    ## Run the LSTM with an explicit all-zero state tensor and keep the first
    ## element of what it returns
    zero_state <- torch_zeros(
      self$num_layers * 2, batch_size, self$hidden_size,
      dtype = inputs$dtype()
    )
    features <- self$rnn(inputs, zero_state)[[1]]
    ## One row of LSTM features per token, then project to tag space
    features <- torch_reshape(features, list(n_tokens, -1))
    tag_space <- self$linear(features)
    ## One row per token, one column per label
    tag_scores <- torch_reshape(tag_space, list(-1, self$tagset_size))
    nnf_log_softmax(tag_scores, dim = 1L)
  }
)
## Toy training set: one data.frame per document, one row per token, with the
## entity label both as text (entity) and as a numeric code (entity_nr).
traindata <- lapply(
  list(
    list(
      doc_id = 1,
      token = c("the", "dog", "ate", "the", "apple"),
      entity = c("DET", "NN", "V", "DET", "NN"),
      entity_nr = c(0, 1, 2, 0, 1)
    ),
    list(
      doc_id = 2,
      token = c("everybody", "read", "that", "book"),
      entity = c("NN", "V", "DET", "NN"),
      entity_nr = c(1, 2, 0, 1)
    )
  ),
  ## Same call as data.frame(doc_id = ..., token = ..., ..., stringsAsFactors = FALSE)
  function(columns) do.call(data.frame, c(columns, list(stringsAsFactors = FALSE)))
)
## Tagger with 3 labels and a hidden size of 8, trained with SGD + momentum
model <- ner_bilstm(bpemb = bpemb, tagset_size = 3, hidden_size = 8)
optimizer <- optim_sgd(model$parameters, lr = 0.1, momentum = 0.9)
## 10 epochs; every document is its own batch and gets its own optimizer step
for (epoch in seq_len(10)) {
  cat(sprintf("%s epoch %s", Sys.time(), epoch), sep = "\n")
  for (doc in traindata) {
    ## Clear the gradients before processing this document
    optimizer$zero_grad()
    ## Label codes as a long tensor, token scores from the model
    targets <- torch_tensor(doc[["entity_nr"]], dtype = torch_long())
    tag_scores <- model(doc[["token"]])
    ## ignore_index is passed explicitly on every call
    loss <- nnf_nll_loss(tag_scores, targets, ignore_index = -1)
    loss$backward()
    optimizer$step()
  }
}
Implement multiworker data loaders
Explain creation functions. see: https://pytorch.org/cppdocs/notes/tensor_creation.html
x <- torch_randn(4, 3)
y <- x[ , 1, drop = FALSE]
torch_size(y) # (4,1)
loss = (y_pred - y).pow(2).sum().item()
The bucket we are uploading lantern builds should be public for downloads, so it can be used by users when first installing torch.
We could upload to github releases: see for instance: https://github.com/svenstaro/upload-release-action
library(torch)
x <- torch_randn(1000, 10)
index <- 1:10
x[index,1:2]
We currently only support cpu builds of lantern
From R CHECK:
Found the following executable file:
src/torchpkg.dll
Source packages should not contain undeclared executable files.
Looks like that needs to be cleaned.
export CUDNN_INCLUDE_DIR=/usr/include/cuda
export TORCH_BACKEND=CUDA
export CUDA_HOME=/usr
export CUDA_TOOLKIT_ROOT_DIR=/usr
Which @skeydan uses in her build.
x <- torch_randn(n, d_in)
y <- x[ : , 1, NULL]
This works for me when building example-app
for the new API binaries (with cuda which is not the point - yet...)
/usr/bin/c++ -D_GLIBCXX_USE_CXX11_ABI=1
-rdynamic CMakeFiles/example-app.dir/example-app.cpp.o -o example-app
-L/usr/lib64 -Wl,-rpath,/home/key/libtorch/lib:/usr/lib64
/home/key/libtorch/lib/libtorch.so /home/key/libtorch/lib/libc10.so
-lcuda -lnvrtc -lnvToolsExt -lcudart /home/key/libtorch/lib/libc10_cuda.so
-lpthread /home/key/libtorch/lib/libc10_cuda.so /home/key/libtorch/lib/libc10.so
-lnvToolsExt -lcudart -lcufft /usr/lib64/libcurand.so /usr/lib64/libcudnn.so
/usr/lib64/libculibos.a -ldl /usr/lib64/libculibos.a -ldl -lcublas
-Wl,--no-as-needed,/home/key/libtorch/lib/libtorch.so -Wl,--as-needed -lcudart
Is it ok for me to change MakeVars.in
to say
-D_GLIBCXX_USE_CXX11_ABI=1
and remove the compiler warning?
#67 added support for packed sequences.
Now we can add support for packed sequences in the RNN layers. See https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/rnn.py#L209-L218
I think for OS X, we would want to install via Homebrew like V8, arrow, etc. Otherwise, without updating XCode I'm getting the following error while building the package.
2020-01-10 11:09:03 URL:https://github-production-release-asset-2e65be.s3.amazonaws.com/58414589/5faa9200-9c0e-11e9-8023-06d3ddb1dc9a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200110T190901Z&X-Amz-Expires=300&X-Amz-Signature=6caa76afd7b279272addf378e0992ae921d0dc6ba24f5225049172e9e248d18b&X-Amz-SignedHeaders=host&actor_id=0&response-content-disposition=attachment%3B%20filename%3Dmklml_mac_2019.0.5.20190502.tgz&response-content-type=application%2Foctet-stream [28288113/28288113] -> "/Users/javierluraschi/libtorch/mkl.tgz" [1]
unzipping mkl library
TORCH_BACKEND!='CUDA' - using 'CPU' backend
----------------------------------------------------------------
Using PKG_LIBS=-L/Users/javierluraschi/libtorch/lib -Wl,-rpath,/Users/javierluraschi/libtorch/lib -ltorch
Using PKG_CFLAGS=-I/Users/javierluraschi/libtorch/include -I/Users/javierluraschi/libtorch/include/torch/csrc/api/include
----------------------------------------------------------------
** libs
xcrun: error: invalid active developer path (/Library/Developer/CommandLineTools), missing xcrun at: /Library/Developer/CommandLineTools/usr/bin/xcrun
ERROR: compilation failed for package ‘torch’
* removing ‘/Library/Frameworks/R.framework/Versions/3.5/Resources/library/torch’
Exited with status 1.
Eg:
y <- torch_randn(1000)
y[1:10,..]
Currently the GPU CI is not marked as 'error' even if it fails.
Maybe using the -- error
option in docker run
will make it work.
In Python, when converting an integer to a tensor we get:
>>> import torch
>>> torch.tensor([1]).dtype
torch.int64
Most operations will expect torch_long()
and not int32
types - see for example #53 .
Great work!
Just a message to indicate that I really look forward to having access to libtorch directly from R. Especially the lstm, gru, rnn functionalities, which are very useful in NLP.
For developers, should be easy to install torch without re-downloading libtorch
Implement Iterable Dataset class and Data loaders for it
Provide wrappers for the cuda module: see https://pytorch.org/docs/stable/cuda.html
x <- torch_randn(5, 10)
x[1,]$shape
I'm mainly developing on Windows using a cpu.
At library(torchr)
this downloaded https://github.com/mlverse/torch/blob/master/R/lantern_install.R#L14 the precompiled dll's from libtorch.
When loading the package again library(torchr)
it complained about VCRUNTIME140_1.dll not being available on my computer.
Had to install the latest version of Microsoft Visual C++ Redistributable for Visual Studio 2019 to get rid of the error message (I deleted it 3 weeks ago while doing some computer cleanup) and get started.
After successful compilation:
> library(torchr)
You need to install libtorch in order to use torchr.
Do you want to download it now? ~100Mb (yes/no)yes
trying URL 'https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip'
Content type 'application/zip' length 103918316 bytes (99.1 MB)
==================================================
downloaded 99.1 MB
trying URL 'https://storage.googleapis.com/lantern-builds/refs/heads/master/latest/Linux.zip'
Content type 'application/zip' length 868607 bytes (848 KB)
==================================================
downloaded 848 KB
Torch was successfully installed.
Please restar your R session now.
Restarting R session...
> devtools::load_all(".")
Loading torchr
You need to install libtorch in order to use torchr.
Do you want to download it now? ~100Mb (yes/no)no
From the above I would have 3 questions/problems:
What if I already have a libtorch
I want to use? (I would hope that in this case, only lantern
the R package would need to be installed?)
If I agree to install torch, should I be able to choose between cpu and gpu builds?
I got positive feedback ("torch installed") but evidently it was not ...
This just to start the issue, will try to find out some stuff and comment below...
See comment in #51
Just a note for the documentation.
I had another machine running Ubuntu 14.04 (Trusty), but its system libraries are apparently too outdated for the libtorch build.
> library(torch)
Error : .onLoad failed in loadNamespace() for 'torch', details:
call: cpp_lantern_init(lantern_install_path())
error: /home/jwijffels/R/x86_64-pc-linux-gnu-library/3.2/torch/deps/liblantern.so - /usr/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.21' not found (required by /home/jwijffels/R/x86_64-pc-linux-gnu-library/3.2/torch/deps/liblantern.so)
Error: package or namespace load failed for ‘torch’
I inspected and my machine had GLIBCXX_3.4.19, apparently too old for the shared library.
t <- torch_tensor(2, device = torch_device("cuda"))
as_array(t)
I know this is not supposed to work (need to move to cpu first) but it would probably be nice not to segfault :-)
We need to autogenerate documentation for the torch_tensor
class.
Whenever we do it we should also add support for parsing the docs in the help_handlers
functions. see #77 .
Currently, with tensor modification, our strategy seems to be:
- provide functions (torch_) only when a fresh object is returned
- provide methods (x$what()) only when the existing object is modified
e.g. tx$view()
but torch_reshape
I assume this is why I don't see an x$clone()
(also I notice we have x$t_()
but not x$t()
)
But, a person coming from Python might expect $
access to work independently of whether there are side effects or not (there is an x.clone()
, etc.)
Not saying we have to follow this ... but partly we do: For example, there is x$mm
as well as x$mm_
in R ...
What do you think?
FYI
current version on windows fails to load appropriately the dll
It says it cannot find lantern_TensorIndex_new, which is in lantern.h
R version 4.0.1 (2020-06-06) -- "See Things Now"
Copyright (C) 2020 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64 (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> library(torch)
Error: package or namespace load failed for ‘torch’:
.onLoad failed in loadNamespace() for 'torch', details:
call: cpp_lantern_init(lantern_install_path())
error: lantern_TensorIndex_new - Kan opgegeven procedure niet vinden.
Raw steps that may be needed to successfully compile a torch program (starting from what is needed for me - will keep issue updated):
Example CMakeLists.txt
:
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
project(example-app)
find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
add_executable(example-app example-app.cpp)
target_link_libraries(example-app "${TORCH_LIBRARIES}")
set_property(TARGET example-app PROPERTY CXX_STANDARD 14)
1) Step 1 (cmake): cannot find cudnn
[key@snowi build]$ cmake -DCMAKE_PREFIX_PATH=~/libtorch ..
-- Caffe2: CUDA detected: 10.1
-- Caffe2: CUDA nvcc is: /usr/bin/nvcc
-- Caffe2: CUDA toolkit directory: /usr
-- Caffe2: Header version is: 10.1
-- Could NOT find CUDNN (missing: CUDNN_INCLUDE_PATH) `
Solution:
add to .bashrc
:
export CUDNN_INCLUDE_DIR=/usr/include/cuda
make
): `/usr/local/cuda/lib64/libnvToolsExt.so``(?!?)make[2]: *** No rule to make target '/usr/local/cuda/lib64/libnvToolsExt.so', needed by 'example-app'. Stop.
see: https://discuss.pytorch.org/t/libtorch-cmake-issues/28246/7
This does not seem to have been solved.
Duplicate entries in CMakeFiles/example-app.dir/build.make
:
example-app: /usr/lib64/libnvToolsExt.so..
...
example-app: /usr/local/cuda/lib64/libnvToolsExt.so
same for libcudart
two wrong for libculibos
example-app: /usr/local/cuda/lib64/libculibos.a
example-app: /usr/local/cuda/lib64/libculibos.a
Also need to edit CMakeFiles/example-app.dir/link.txt
:
Pre:
/usr/bin/c++ -D_GLIBCXX_USE_CXX11_ABI=0 -rdynamic CMakeFiles/example-app.dir/example-app.cpp.o -o example-app -L/usr/local/cuda/lib64 -Wl,-rpath,/home/key/libtorch/lib:/usr/local/cuda/lib64 /home/key/libtorch/lib/libtorch.so /home/key/libtorch/lib/libc10.so -lcuda -lnvrtc -lnvToolsExt -lcudart /home/key/libtorch/lib/libc10_cuda.so -lpthread /home/key/libtorch/lib/libc10_cuda.so /home/key/libtorch/lib/libc10.so -lnvToolsExt -lcudart -lcufft /usr/lib64/libcurand.so /usr/lib64/libcudnn.so /usr/local/cuda/lib64/libculibos.a -ldl /usr/local/cuda/lib64/libculibos.a -ldl -lcublas -Wl,--no-as-needed,/home/key/libtorch/lib/libtorch.so -Wl,--as-needed -lcudart
Post:
/usr/bin/c++ -D_GLIBCXX_USE_CXX11_ABI=0 -rdynamic CMakeFiles/example-app.dir/example-app.cpp.o -o example-app -L/usr/lib64 -Wl,-rpath,/home/key/libtorch/lib:/usr/lib64 /home/key/libtorch/lib/libtorch.so /home/key/libtorch/lib/libc10.so -lcuda -lnvrtc -lnvToolsExt -lcudart /home/key/libtorch/lib/libc10_cuda.so -lpthread /home/key/libtorch/lib/libc10_cuda.so /home/key/libtorch/lib/libc10.so -lnvToolsExt -lcudart -lcufft /usr/lib64/libcurand.so /usr/lib64/libcudnn.so /usr/lib64/libculibos.a -ldl /usr/lib64/libculibos.a -ldl -lcublas -Wl,--no-as-needed,/home/key/libtorch/lib/libtorch.so -Wl,--as-needed -lcudart
This makes it work but I wouldn't know how it fits into the setup from an R package...
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.