I've been trying to initialize a dora experiment on two RTX 4090 GPUs. Specifically, I am training the HTDemucs model from Facebook Research. I first ran into this issue when uploading a new dataset, but now it seems that whenever I initialize dora or run any dora command, I get the following error. @adefossez
File "/home/robertthomas/.local/bin/dora", line 5, in <module>
from dora.__main__ import main
File "/home/robertthomas/.local/lib/python3.10/site-packages/dora/__init__.py", line 66, in <module>
from .explore import Explorer, Launcher
File "/home/robertthomas/.local/lib/python3.10/site-packages/dora/explore.py", line 27, in <module>
from .shep import Shepherd, Sheep
File "/home/robertthomas/.local/lib/python3.10/site-packages/dora/shep.py", line 25, in <module>
from .distrib import get_distrib_spec
File "/home/robertthomas/.local/lib/python3.10/site-packages/dora/distrib.py", line 14, in <module>
import torch
File "/home/robertthomas/.local/lib/python3.10/site-packages/torch/__init__.py", line 1465, in <module>
from . import _meta_registrations
File "/home/robertthomas/.local/lib/python3.10/site-packages/torch/_meta_registrations.py", line 7, in <module>
from torch._decomp import _add_op_to_registry, global_decomposition_table, meta_table
File "/home/robertthomas/.local/lib/python3.10/site-packages/torch/_decomp/__init__.py", line 169, in <module>
import torch._decomp.decompositions
File "/home/robertthomas/.local/lib/python3.10/site-packages/torch/_decomp/decompositions.py", line 10, in <module>
import torch._prims as prims
File "/home/robertthomas/.local/lib/python3.10/site-packages/torch/_prims/__init__.py", line 33, in <module>
from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
File "/home/robertthomas/.local/lib/python3.10/site-packages/torch/_subclasses/__init__.py", line 3, in <module>
from torch._subclasses.fake_tensor import (
File "/home/robertthomas/.local/lib/python3.10/site-packages/torch/_subclasses/fake_tensor.py", line 13, in <module>
from torch._guards import Source
File "/home/robertthomas/.local/lib/python3.10/site-packages/torch/_guards.py", line 14, in <module>
import sympy # type: ignore[import]
File "/home/robertthomas/.local/lib/python3.10/site-packages/sympy/__init__.py", line 30, in <module>
from sympy.core.cache import lazy_function
File "/home/robertthomas/.local/lib/python3.10/site-packages/sympy/core/__init__.py", line 4, in <module>
from .sympify import sympify, SympifyError
File "/home/robertthomas/.local/lib/python3.10/site-packages/sympy/core/sympify.py", line 8, in <module>
from sympy.core.random import choice
File "/home/robertthomas/.local/lib/python3.10/site-packages/sympy/core/random.py", line 25, in <module>
from sympy.utilities.iterables import is_sequence
File "/home/robertthomas/.local/lib/python3.10/site-packages/sympy/utilities/__init__.py", line 4, in <module>
from .iterables import (flatten, group, take, subsets,
File "/home/robertthomas/.local/lib/python3.10/site-packages/sympy/utilities/iterables.py", line 16, in <module>
from sympy.utilities.misc import as_int
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 879, in exec_module
File "<frozen importlib._bootstrap_external>", line 1012, in get_code
File "<frozen importlib._bootstrap_external>", line 672, in _compile_bytecode
ValueError: bad marshal data (invalid reference)
terminate called after throwing an instance of 'c10::Error'
what(): Number of tensor lists has to match the depth.
Exception raised from multi_tensor_apply at ../aten/src/ATen/native/cuda/MultiTensorApply.cuh:92 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb90af9e4d7 in /home/robertthomas/.local/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, char const*) + 0x68 (0x7fb90af68434 in /home/robertthomas/.local/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #2: <unknown function> + 0x17b292d (0x7fb8ac1b292d in /home/robertthomas/.local/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10::impl::OperatorEntry::updateDispatchTableEntry_(c10::Dispatcher const&, c10::DispatchKey) + 0xe0 (0x7fb8d26cc500 in /home/robertthomas/.local/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #4: c10::impl::OperatorEntry::updateDispatchTable_(c10::Dispatcher const&, c10::DispatchKey) + 0xb5 (0x7fb8d26cc655 in /home/robertthomas/.local/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #5: c10::impl::OperatorEntry::deregisterKernel_(c10::Dispatcher const&, c10::optional<c10::DispatchKey>, std::_List_iterator<c10::impl::AnnotatedKernel>) + 0x3ff (0x7fb8d26cdc5f in /home/robertthomas/.local/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #6: c10::Dispatcher::deregisterImpl_(c10::OperatorHandle const&, c10::OperatorName const&, c10::optional<c10::DispatchKey>, std::_List_iterator<c10::impl::AnnotatedKernel>) + 0x59 (0x7fb8d26bf779 in /home/robertthomas/.local/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #7: <unknown function> + 0xe81705 (0x7fb8ab881705 in /home/robertthomas/.local/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #8: <unknown function> + 0x45495 (0x7fb923c45495 in /lib/x86_64-linux-gnu/libc.so.6)
frame #9: on_exit + 0 (0x7fb923c45610 in /lib/x86_64-linux-gnu/libc.so.6)
frame #10: <unknown function> + 0x29d97 (0x7fb923c29d97 in /lib/x86_64-linux-gnu/libc.so.6)
frame #11: __libc_start_main + 0x80 (0x7fb923c29e40 in /lib/x86_64-linux-gnu/libc.so.6)
frame #12: _start + 0x25 (0x55c151d9dba5 in /usr/bin/python3)
Aborted (core dumped)
Any advice or clues on how to debug this are welcome. Thank you.