Comments (8)
Did you also try without these?
--nofollow-import-to=tensorflow --nofollow-import-to=keras --noinclude-default-mode=warning --noinclude-numba-mode=nofollow
It's not a given that code handling their absence is not your issue.
from nuitka.
And yes, very much better, thank you. I am adding the needs_example tag only because I asked a question, and that is how I know that I am waiting for a reply.
from nuitka.
Certainly, I attempted it without those and the error was the same.
I ran another test with this reduced script:
#Version to test Nuitka
import os,sys
from pathlib import Path
import torch
from tqdm import tqdm
import random
"""
import transformers
from torch.utils import tensorboard
from torch.utils.tensorboard import SummaryWriter
"""
def get_config():
    """Return the dictionary of settings used by the script.

    ``root`` is ``os.path.dirname(os.path.realpath(__name__))``. Since
    ``__name__`` is a module name and not a file path, ``realpath`` resolves
    it against the current working directory, so in practice ``root`` is the
    directory the process was started from.
    """
    root_dir = os.path.dirname(os.path.realpath(__name__))
    return dict(
        root=root_dir,
        # input locations
        file_name_source="postgres.txt",
        folder_source="source_ini",
        folder_source_tokenizer="source_tokenizer",
        folder_tokenizer="pretrained_tokenizer",
        folder_tensors="pretrained_tensors",
        # numeric settings
        batch_size=32,
        num_epochs=4,
        lr=1e-5,
        seq_len=350,
        max_len=512,
        # model output locations
        model_folder="MODAI",
        model_pretrained="pretrained_model",
        model_trained="trained_model",
        logging_dir="logging_dir",
        model_basename="tmodel_",
        preload=None,
        tokenizer_file="tokenizer_{0}.json",
        experiment_name="runs/tmodel",
    )
def get_weights_file_path(config, epoch: str):
    """Build the weights-file path ``<model_folder>/<model_basename><epoch>.pt``.

    Reads ``config["model_folder"]`` and ``config["model_basename"]`` and
    returns the joined path as a string, starting from ``Path('.')``.
    """
    folder = config["model_folder"]
    filename = "{}{}.pt".format(config["model_basename"], epoch)
    return str(Path('.').joinpath(folder, filename))
def get_model_folder_base_path(config):
    """Return ``<root>/<model_folder>`` from ``config`` as a string."""
    parts = (config["root"], config["model_folder"])
    return str(Path('.').joinpath(*parts))
def get_model_pretrained_tensors_path(config):
    """Return ``<root>/<folder_tokenizer>/<folder_tensors>`` as a string."""
    parts = (
        config["root"],
        config["folder_tokenizer"],
        config["folder_tensors"],
    )
    return str(Path('.').joinpath(*parts))
# Folder for pretrained model
def get_model_folder_base_pretrained_path(config):
    """Return ``<root>/<model_folder>/<model_pretrained>`` as a string.

    Also prints the configured root folder to stdout.

    Args:
        config: Mapping providing the ``"root"``, ``"model_folder"`` and
            ``"model_pretrained"`` keys.

    Returns:
        The joined path as a string.

    Raises:
        KeyError: If one of the three keys is missing from ``config``.
    """
    model_root = config["root"]
    model_folder = config["model_folder"]
    model_basename = config["model_pretrained"]
    # Build the path once and reuse it for the return value.
    pretrained_path = Path('.') / model_root / model_folder / model_basename
    print(f"Root folder: {model_root}")
    return str(pretrained_path)
# Folder for trained model
def get_model_folder_base_trained_path(config):
    """Return ``<root>/<model_folder>/<model_trained>`` as a string."""
    parts = (
        config["root"],
        config["model_folder"],
        config["model_trained"],
    )
    return str(Path('.').joinpath(*parts))
# Folder for logging
def get_model_folder_base_logging_path(config):
    """Return ``<root>/<model_folder>/<model_trained>/<logging_dir>`` as a string."""
    parts = (
        config["root"],
        config["model_folder"],
        config["model_trained"],
        config["logging_dir"],
    )
    return str(Path('.').joinpath(*parts))
# Folder containing the text file used as training source
def get_folder_source_path(config):
    """Return ``<root>/<folder_source>`` from ``config`` as a string."""
    parts = (config["root"], config["folder_source"])
    return str(Path('.').joinpath(*parts))
# Text file used as training source
def get_file_source_path(config):
    """Return ``<root>/<folder_source>/<file_name_source>`` as a string."""
    parts = (
        config["root"],
        config["folder_source"],
        config["file_name_source"],
    )
    return str(Path('.').joinpath(*parts))
# Source folder holding the files used to train the tokenizer
def get_folder_source_tokenizer_path(config):
    """Return ``<root>/<folder_source_tokenizer>`` from ``config`` as a string."""
    parts = (config["root"], config["folder_source_tokenizer"])
    return str(Path('.').joinpath(*parts))
# Folder of the tokenizer
def get_tokenizer_folder_path(config):
    """Return ``<root>/<folder_tokenizer>`` from ``config`` as a string."""
    parts = (config["root"], config["folder_tokenizer"])
    return str(Path('.').joinpath(*parts))
# JSON file of the tokenizer (vocabulary)
def get_tokenizer_file_path(config, version):
    """Return the path of the tokenizer JSON file for ``version``.

    ``config["tokenizer_file"]`` is a ``str.format`` template that receives
    ``version`` as its positional argument; the result is appended to
    ``<root>/<folder_tokenizer>``.
    """
    root = config["root"]
    tokenizer_folder = config["folder_tokenizer"]
    filename = config["tokenizer_file"].format(version)
    return str(Path('.').joinpath(root, tokenizer_folder, filename))
def mlm(tensor, seed):
    """Overwrite a random ~15% of the eligible token ids in ``tensor`` with 3.

    A position is eligible when its id is not 0, 1, 2 or 4 (presumably the
    tokenizer's special tokens -- confirm against the tokenizer vocabulary).
    The selection is applied with a single boolean-mask assignment, so the
    tensor may have any number of dimensions.

    Args:
        tensor: Tensor of token ids. It is modified in place.
        seed: Value passed to ``torch.manual_seed`` before drawing the random
            numbers, which makes the selection repeatable for a given seed
            and shape. This reseeds torch's default random generator.

    Returns:
        The same ``tensor`` object, with the selected positions set to 3
        (the mask id).
    """
    # create random array of floats with equal dims to tensor
    torch.manual_seed(seed)
    rand = torch.rand(tensor.shape)
    # candidate positions: draw below 15% and id not in {0, 1, 2, 4}
    mask_arr = rand < .15
    for special_id in (0, 1, 2, 4):
        mask_arr &= tensor != special_id
    # one vectorised assignment replaces the former row-by-row loop
    tensor[mask_arr] = 3  # mask id
    return tensor
class Dataset(torch.utils.data.Dataset):
    """Dataset wrapper around a dictionary of encodings.

    The length comes from ``encodings['input_ids'].shape[0]`` and item ``i``
    is a dictionary holding element ``i`` of every entry in ``encodings``.
    """

    def __init__(self, encodings):
        # keep a reference to the encodings mapping
        self.encodings = encodings

    def __len__(self):
        # sample count is the first dimension of the 'input_ids' entry
        input_ids = self.encodings['input_ids']
        return input_ids.shape[0]

    def __getitem__(self, i):
        # pick element i from every entry and return them under the same keys
        sample = {}
        for key, tensor in self.encodings.items():
            sample[key] = tensor[i]
        return sample
def main():
    """Run the reduced pretraining script.

    Seeds torch with 42, prints the tokenizer source folder, and exits through
    ``sys.exit()`` if the pretrained-model folder already exists. Otherwise it
    opens and reads up to 200 ``.txt`` files found recursively under the
    tokenizer source folder, printing each path, and finally prints "Done".
    """
    config = get_config()
    SEED = 42  # By default
    torch.manual_seed(SEED)

    # Folder with the split source files
    tokenizer_source_folder = get_folder_source_tokenizer_path(config)
    print(f"tokenizer_source_folder: {tokenizer_source_folder}")

    # Refuse to continue when the pretrained-model folder is already there
    path_pretrained_model = get_model_folder_base_pretrained_path(config)
    if os.path.exists(path_pretrained_model):
        print(f'The folder model [ {path_pretrained_model} ] already exists!')
        sys.exit()

    # Source text files for training
    paths = [str(x) for x in Path(tokenizer_source_folder).glob('**/*.txt')]

    # open all files and show paths (at most the first 200)
    for path in tqdm(paths[:200]):
        with open(path, 'r', encoding='utf-8') as fp:
            # The content is read but not used any further here.
            fp.read()
        print(path)
    print("Done")
# Run main() only when the file is executed as a script.
if __name__ == "__main__":
    main()
and these conditions:
py -m nuitka --standalone --noinclude-default-mode=warning --noinclude-numba-mode=nofollow --module-parameter=numba-disable-jit=yes --assume-yes-for-downloads --module-parameter=torch-disable-jit=yes --include-data-dir=../Nuitka_test/pretrained_tokenizer=pretrained_tokenizer --include-data-dir=../Nuitka_test/source_tokenizer=source_tokenizer MyScriptTest.py
and the executable works.
In my view, when utilizing the transformers module, the program attempts to write temporary files to a restricted area of the hard disk, resulting in an error due to the lack of administrative privileges on my end.
from nuitka.
Meanwhile, I conducted two additional tests.
In the first test, I copied the distribution generated by the second script (the version labeled #Version to test Nuitka) onto a machine where I have administrator privileges, and the executable worked as expected.
In the second test, I transferred the distribution generated by the main script (with the error discussed in this chat) to the same machine. The executable ran without any error messages, but there were no observable actions—no intermediate messages, no final message indicating "Done," and no regeneration of tensor files.
from nuitka.
Without a reproducer, there is not a lot to see I am afraid. Probably the model has dependencies of some sorts, but I cannot tell that without seeing it.
from nuitka.
I intend to generate the executable on a machine where I have administrator privileges. Thank you for your assistance and dedication to developing this tool. It's a remarkable work!
As a final test, I could provide a sample input text file and attempt to generate tensor files. Here is the transformer module, which perhaps uses some sophisticated dependencies.
from nuitka.
This is a configuration of a Python-compatible model aimed at replicating the error encountered during the execution of the compiled script.
Following each run, the "model_results" folder needs to be cleared.
PretrainTestNuitka2.zip
from nuitka.
Ok, once I get my "untrusted" Azure VM set up, I will give that a shot.
from nuitka.
Related Issues (20)
- segfault on mac arm
- `pygame-ce` still broken on Nuitka 2.4.8 HOT 9
- Cannot create standalone executable using setuptools
- Error loading QtWidgets.pyd HOT 3
- The behavior of included file in onefile and standalone mode is suggested to be the same. HOT 1
- scons: *** [static_src\CompiledFunctionType.obj] Error 2 HOT 1
- pydoc is not built correctly? HOT 2
- Using malloc_trim of ctypes throws segfault at runtime in gstreamer pipeline
- unsloth, bitsandbytes fail with triton installed, giving OSError: could not get source code
- FATAL: tk-inter: Plugin issue while working on 'module 'tkinterdnd2.TkinterDnD''
- Self-updating with tufup.client
- Nuitka-Scons unable to "linking" under a special condition: cannot find -l:libHacl_Hash_SHA2.a HOT 3
- nuitka build of "wxhello.py" file fails when using msys2 "C:\msys64" on Windows 10
- not support python3.12 typing syntax
- ModuleNotFoundError: No module named 'pyexpat'
- Sklearn missing .libs folder after compiling on newest windows-2022 github actions runner
- How to include dll exe files? --include-raw-dir not work, --include-data-dir change its behavior HOT 1
- MetaPathBasedLoader.c handle leak
- FATAL: anti-bloat: Error, failed to evaluate expression HOT 3
- cannot locate symbol "_Py_FalseStruct" referenced
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from nuitka.