After updating the NVIDIA driver to 510.06, CUDA is finally recognized:
(codeGen_env) usr1@mak:~/projects/CodeGen$ python -m torch.utils.collect_env
Collecting environment information...
PyTorch version: 1.7.0
Is debug build: True
CUDA used to build PyTorch: 11.0
ROCM used to build PyTorch: N/A
OS: Ubuntu 20.04.3 LTS (x86_64)
GCC version: (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Clang version: Could not collect
CMake version: Could not collect
Python version: 3.6 (64-bit runtime)
**Is CUDA available: True**
CUDA runtime version: 10.1.243
GPU models and configuration: GPU 0: NVIDIA GeForce GTX 1080
Nvidia driver version: 510.06
cuDNN version: Could not collect
HIP runtime version: N/A
MIOpen runtime version: N/A
Versions of relevant libraries:
[pip3] numpy==1.19.5
[pip3] torch==1.7.0
[pip3] torchaudio==0.7.0a0+ac17b64
[pip3] torchvision==0.8.1
[conda] blas 1.0 mkl
[conda] cudatoolkit 11.0.3 h15472ef_9 conda-forge
[conda] ffmpeg 4.3 hf484d3e_0 pytorch
[conda] libblas 3.9.0 12_linux64_mkl conda-forge
[conda] libcblas 3.9.0 12_linux64_mkl conda-forge
[conda] liblapack 3.9.0 12_linux64_mkl conda-forge
[conda] mkl 2021.4.0 h06a4308_640
[conda] numpy 1.19.5 py36hfc0c790_2 conda-forge
[conda] pytorch 1.7.0 py3.6_cuda11.0.221_cudnn8.0.3_0 pytorch
[conda] pytorch-mutex 1.0 cuda pytorch
[conda] torchaudio 0.7.0 py36 pytorch
[conda] torchvision 0.8.1 py36_cu110 pytorch
(codeGen_env) usr1@mak:~/projects/CodeGen$
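For a quicker sanity check than the full collect_env report, a few lines of standard torch calls confirm the same facts:

```python
import torch

# Quick CUDA sanity check; mirrors the key lines of collect_env above.
print("torch:", torch.__version__)                   # 1.7.0
print("CUDA available:", torch.cuda.is_available())  # True
print("built with CUDA:", torch.version.cuda)        # 11.0
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))  # NVIDIA GeForce GTX 1080
```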
Successfully compiled the CUDA extensions, but only after commenting out the version check below in apex's setup.py, as the error output itself suggests (the system nvcc is 10.1, while the PyTorch binaries were built with CUDA 11.0). This mismatch may (or may not) be the cause of the CUBLAS_STATUS_NOT_INITIALIZED error (see below).
if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
    raise RuntimeError("Cuda extensions are being compiled with a version of Cuda that does " +
                       ...
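For context, the check being commented out (apex's check_cuda_torch_binary_vs_bare_metal, visible in the traceback below) compares the local nvcc release against the CUDA version PyTorch was built with. A minimal sketch of that comparison, using only standard subprocess and torch calls (this is not apex's exact code):

```python
import re
import subprocess

import torch

# Rough sketch of the comparison apex performs: the bare-metal nvcc release
# versus torch's build-time CUDA version.
nvcc_out = subprocess.check_output(["nvcc", "--version"]).decode()
bare_metal = re.search(r"release (\d+)\.(\d+)", nvcc_out).groups()  # here: ('10', '1')
torch_cuda = tuple(torch.version.cuda.split(".")[:2])               # here: ('11', '0')

if bare_metal != torch_cuda:
    print("Mismatch: nvcc %s vs torch CUDA %s"
          % (".".join(bare_metal), ".".join(torch_cuda)))
```

Note that the mismatch here is in the major version (nvcc 10.1 vs. torch CUDA 11.0), so the "minor-version mismatch" caveat in apex's error message does not apply; extensions built this way may well misbehave at runtime. The original failure: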
(codeGen_env) usr1@mak:~/projects/CodeGen/apex$ pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
/home/usr1/anaconda3/envs/codeGen_env/lib/python3.6/site-packages/pip/_internal/commands/install.py:245: UserWarning: Disabling all use of wheels due to the use of --build-option / --global-option / --install-option.
cmdoptions.check_install_build_global(options)
Using pip 21.3.1 from /home/usr1/anaconda3/envs/codeGen_env/lib/python3.6/site-packages/pip (python 3.6)
Processing /home/usr1/projects/CodeGen/apex
Running command python setup.py egg_info
torch.__version__ = 1.7.0
running egg_info
creating /tmp/pip-pip-egg-info-_podlbbz/apex.egg-info
writing /tmp/pip-pip-egg-info-_podlbbz/apex.egg-info/PKG-INFO
writing dependency_links to /tmp/pip-pip-egg-info-_podlbbz/apex.egg-info/dependency_links.txt
writing top-level names to /tmp/pip-pip-egg-info-_podlbbz/apex.egg-info/top_level.txt
writing manifest file '/tmp/pip-pip-egg-info-_podlbbz/apex.egg-info/SOURCES.txt'
reading manifest file '/tmp/pip-pip-egg-info-_podlbbz/apex.egg-info/SOURCES.txt'
adding license file 'LICENSE'
writing manifest file '/tmp/pip-pip-egg-info-_podlbbz/apex.egg-info/SOURCES.txt'
/home/usr1/projects/CodeGen/apex/setup.py:67: UserWarning: Option --pyprof not specified. Not installing PyProf dependencies!
warnings.warn("Option --pyprof not specified. Not installing PyProf dependencies!")
Preparing metadata (setup.py) ... done
Skipping wheel build for apex, due to binaries being disabled for it.
Installing collected packages: apex
Running command /home/usr1/anaconda3/envs/codeGen_env/bin/python3.6 -u -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'/home/usr1/projects/CodeGen/apex/setup.py'"'"'; __file__='"'"'/home/usr1/projects/CodeGen/apex/setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' --cpp_ext --cuda_ext install --record /tmp/pip-record-tezgj0d4/install-record.txt --single-version-externally-managed --compile --install-headers /home/usr1/anaconda3/envs/codeGen_env/include/python3.6m/apex
torch.__version__ = 1.7.0
/home/usr1/projects/CodeGen/apex/setup.py:67: UserWarning: Option --pyprof not specified. Not installing PyProf dependencies!
warnings.warn("Option --pyprof not specified. Not installing PyProf dependencies!")
Compiling cuda extensions with
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243
from /usr/bin
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/home/usr1/projects/CodeGen/apex/setup.py", line 159, in <module>
check_cuda_torch_binary_vs_bare_metal(CUDA_HOME)
File "/home/usr1/projects/CodeGen/apex/setup.py", line 103, in check_cuda_torch_binary_vs_bare_metal
"https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. "
RuntimeError: Cuda extensions are being compiled with a version of Cuda that does not match the version used to compile Pytorch binaries. Pytorch binaries were compiled with Cuda 11.0.
In some cases, a minor-version mismatch will not cause later errors: https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. You can try commenting out this check (at your own risk).
For reference, the input file (csrc.c) passed to the translate command below:

int DBG_change(int level_offset)
{
    int retval = 0;

    if ((debug_level += level_offset) < MSG_DEBUG_LEVEL_NONE)
    {
        debug_level = MSG_DEBUG_LEVEL_NONE;
        debug_status = 0;
    }
    if (! debug_file_ptr && debug_level >= MSG_DEBUG_LEVEL_NONE)
    {
        retval = DBG_setup((char *) NULL, (char *) NULL);
    }
    else if (debug_file_ptr && debug_level <= MSG_DEBUG_LEVEL_NONE)
        DBG_close();

    return (retval);
}

static void DBG_set_level(char *app_name)
{
    char *envptr;
    char debug_env[MAX_ENVVARLEN + 1];

    if ((app_name != NULL) && *app_name)
    {
        strncpy(debug_env, app_name, MAX_ENVVARLEN);
        debug_env[MAX_ENVVARLEN] = '\0';  /* strncpy does not null-terminate on truncation */
        strncat(debug_env, "_DEBUG_LEVEL", MAX_ENVVARLEN - strlen(debug_env));
    }
    else
    {
        strcpy(debug_env, "DEBUG_LEVEL");
    }

    envptr = (char *) getenv(debug_env);
    if (envptr == NULL)
        debug_level = MSG_DEBUG_LEVEL_NONE;
    else if (! strcmp(envptr, "TRUE") || ! strcmp(envptr, "true"))
        debug_level = MSG_DEBUG_LEVEL_ON_MIN;
    else if (! strcmp(envptr, "MIN") || ! strcmp(envptr, "min"))
        debug_level = MSG_DEBUG_LEVEL_ON_MIN;
    else if (! strcmp(envptr, "NORM") || ! strcmp(envptr, "norm"))
        debug_level = MSG_DEBUG_LEVEL_ON_NORM;
    else if (! strcmp(envptr, "MAX") || ! strcmp(envptr, "max"))
        debug_level = MSG_DEBUG_LEVEL_ON_MAX;
    else if (! strcmp(envptr, "FALSE") || ! strcmp(envptr, "false"))
        debug_level = MSG_DEBUG_LEVEL_NONE;
    else if (isdigit((unsigned char) envptr[0]))  /* cast avoids UB for negative char values */
    {
        if ((debug_level = (int) atoi(envptr)) <= MSG_DEBUG_LEVEL_NONE)
            debug_level = MSG_DEBUG_LEVEL_NONE;
    }
    else
    {
        debug_level = MSG_DEBUG_LEVEL_NONE;
    }
}
(codeGen_env) usr1@mak:~/projects/CodeGen$ python -m codegen_sources.model.translate --src_lang cpp --tgt_lang java --model_path TransCoder_model_1.pth --beam_size 10 < csrc.c
adding to path /home/usr1/projects/CodeGen
INFO - 12/13/21 16:50:08 - 0:00:05 - ============ Model Reloading
INFO - 12/13/21 16:50:08 - 0:00:05 - Reloading encoder from TransCoder_model_1.pth ...
WARNING - 12/13/21 16:50:13 - 0:00:09 - Lang cpp_sa matched to pretrained cpp_sa lang embedding.
WARNING - 12/13/21 16:50:13 - 0:00:09 - Lang java_sa matched to pretrained java_sa lang embedding.
WARNING - 12/13/21 16:50:13 - 0:00:09 - Lang python_sa matched to pretrained python_sa lang embedding.
WARNING - 12/13/21 16:50:13 - 0:00:09 - The size of position embeddings in current model is 2048, the size of reloaded is 1024. need to repeat last positions 1024 times.
INFO - 12/13/21 16:50:13 - 0:00:10 - Reloading decoders from TransCoder_model_1.pth ...
WARNING - 12/13/21 16:50:14 - 0:00:11 - Lang cpp_sa matched to pretrained cpp_sa lang embedding.
WARNING - 12/13/21 16:50:14 - 0:00:11 - Lang java_sa matched to pretrained java_sa lang embedding.
WARNING - 12/13/21 16:50:14 - 0:00:11 - Lang python_sa matched to pretrained python_sa lang embedding.
WARNING - 12/13/21 16:50:14 - 0:00:11 - The size of position embeddings in current model is 2048, the size of reloaded is 1024. need to repeat last positions 1024 times.
INFO - 12/13/21 16:50:14 - 0:00:11 - Number of parameters (encoder): 143239641
INFO - 12/13/21 16:50:14 - 0:00:11 - Number of parameters (decoders): 168442329
INFO - 12/13/21 16:50:14 - 0:00:11 - Number of decoders: 1
...
/opt/conda/conda-bld/pytorch_1603729128610/work/aten/src/ATen/native/cuda/Indexing.cu:658: indexSelectLargeIndex: block: [158,0,0], thread: [124,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1603729128610/work/aten/src/ATen/native/cuda/Indexing.cu:658: indexSelectLargeIndex: block: [158,0,0], thread: [125,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1603729128610/work/aten/src/ATen/native/cuda/Indexing.cu:658: indexSelectLargeIndex: block: [158,0,0], thread: [126,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1603729128610/work/aten/src/ATen/native/cuda/Indexing.cu:658: indexSelectLargeIndex: block: [158,0,0], thread: [127,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
Traceback (most recent call last):
File "/home/usr1/anaconda3/envs/codeGen_env/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/usr1/anaconda3/envs/codeGen_env/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/usr1/projects/CodeGen/codegen_sources/model/translate.py", line 276, in <module>
beam_size=params.beam_size,
File "/home/usr1/projects/CodeGen/codegen_sources/model/translate.py", line 192, in translate
enc1 = self.encoder("fwd", x=x1, lengths=len1, langs=langs1, causal=False)
File "/home/usr1/anaconda3/envs/codeGen_env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/usr1/projects/CodeGen/codegen_sources/model/src/model/transformer.py", line 433, in forward
return self.fwd(**kwargs)
File "/home/usr1/projects/CodeGen/codegen_sources/model/src/model/transformer.py", line 526, in fwd
attn = self.attentions[i](tensor, attn_mask, use_cache=use_cache)
File "/home/usr1/anaconda3/envs/codeGen_env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/usr1/projects/CodeGen/codegen_sources/model/src/model/transformer.py", line 243, in forward
q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head)
File "/home/usr1/anaconda3/envs/codeGen_env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/usr1/anaconda3/envs/codeGen_env/lib/python3.6/site-packages/torch/nn/modules/linear.py", line 93, in forward
return F.linear(input, self.weight, self.bias)
File "/home/usr1/anaconda3/envs/codeGen_env/lib/python3.6/site-packages/torch/nn/functional.py", line 1692, in linear
output = input.matmul(weight.t())
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`
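Since CUDA kernels launch asynchronously, the CUBLAS_STATUS_NOT_INITIALIZED raised from `cublasCreate(handle)` is most likely just the first call to notice an already-corrupted CUDA context; the real fault appears to be the `srcIndex < srcSelectDimSize` device asserts above, i.e. an embedding lookup with an out-of-range index (plausibly related to the position-embedding resize warnings during model reload). A minimal sketch of how to localize such errors, assuming nothing about the CodeGen internals:

```python
import os

# Force synchronous kernel launches so the failing op shows up in the Python
# traceback instead of a later, unrelated call. Must be set before the first
# CUDA call in the process.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch
import torch.nn as nn

# Toy reproduction of the device assert: an embedding lookup whose index is
# >= num_embeddings. On CPU this raises a readable IndexError; on GPU it
# trips the same `srcIndex < srcSelectDimSize` assertion as above.
emb = nn.Embedding(num_embeddings=1024, embedding_dim=8)
ids = torch.tensor([0, 1023, 1024])  # 1024 is one past the valid range
emb(ids)
```

Running the translation once on CPU (or comparing the maximum token and position ids against the embedding sizes) should reveal which lookup is out of range.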