Comments (2)
Hi @dannikay, the solution is in the error itself: as the error message says, you can't train a model that has been loaded with device_map='auto' in distributed mode. You can train it either by putting your code in a script and launching it with --num_processes=1:

accelerate launch --num_processes 1 train.py

or by running the script directly with python myscript.py.
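For reference, here is a minimal sketch of what such a script could look like. The device_map={"": 0} placement is an assumption on my part (not from your code): it pins the entire model to GPU 0 instead of letting device_map="auto" shard it across devices, which is what the training error complains about.

# train.py -- minimal single-GPU sketch (hypothetical; adapt to your setup)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# Pin the whole model to a single device instead of device_map="auto";
# automatic sharding across devices is incompatible with distributed training.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map={"": 0},
    torch_dtype=torch.float16,
)
# ... build your dataset and Trainer here, then call trainer.train() ...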
Also, if you still want to use a Jupyter notebook instead of a Python script, you can use Accelerate's notebook_launcher utility, which "allows for starting multi-gpu training based on code inside of a Jupyter Notebook". Just do it like this:
from accelerate import notebook_launcher

def train_accelerate():
    import pandas as pd
    from datasets import load_dataset
    from IPython.display import HTML, display

    dataset_name = "b-mc2/sql-create-context"
    dataset = load_dataset(dataset_name, split="train")
    def display_table(dataset_or_sample):
        # A helper function to nicely display a dataset or a single sample that contains multi-line strings
        pd.set_option("display.max_colwidth", None)
        pd.set_option("display.width", None)
        pd.set_option("display.max_rows", None)

        if isinstance(dataset_or_sample, dict):
            df = pd.DataFrame(dataset_or_sample, index=[0])
        else:
            df = pd.DataFrame(dataset_or_sample)

        html = df.to_html().replace("\n", "<br>")
        styled_html = f"""<style> .dataframe th, .dataframe tbody td {{ text-align: left; padding-right: 30px; }} </style> {html}"""
        display(HTML(styled_html))
    display_table(dataset.select(range(3)))

    split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
    train_dataset = split_dataset["train"]
    test_dataset = split_dataset["test"]

    print(f"Training dataset contains {len(train_dataset)} text-to-SQL pairs")
    print(f"Test dataset contains {len(test_dataset)} text-to-SQL pairs")

    PROMPT_TEMPLATE = """You are a powerful text-to-SQL model. Given the SQL tables and a natural language question, your job is to write a SQL query that answers the question.

### Table:
{context}

### Question:
{question}

### Response:
{output}"""

    def apply_prompt_template(row):
        prompt = PROMPT_TEMPLATE.format(
            question=row["question"],
            context=row["context"],
            output=row["answer"],
        )
        return {"prompt": prompt}

    train_dataset = train_dataset.map(apply_prompt_template)
    display_table(train_dataset.select(range(1)))
    from transformers import AutoTokenizer
    from huggingface_hub import login

    # Replace with your own Hugging Face access token (never share a real token publicly).
    token = "YOUR_HF_TOKEN"
    login(token=token)

    base_model_id = "mistralai/Mistral-7B-v0.1"
    # You can use a different max length if your custom dataset has shorter/longer input sequences.
    MAX_LENGTH = 256
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_id,
        model_max_length=MAX_LENGTH,
        padding_side="left",
        add_eos_token=True,
    )
    tokenizer.pad_token = tokenizer.eos_token
    def tokenize_and_pad_to_fixed_length(sample):
        result = tokenizer(
            sample["prompt"],
            truncation=True,
            max_length=MAX_LENGTH,
            padding="max_length",
        )
        result["labels"] = result["input_ids"].copy()
        return result

    tokenized_train_dataset = train_dataset.map(tokenize_and_pad_to_fixed_length)
    assert all(len(x["input_ids"]) == MAX_LENGTH for x in tokenized_train_dataset)
    display_table(tokenized_train_dataset.select(range(1)))
    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    quantization_config = BitsAndBytesConfig(
        # Load the model with 4-bit quantization
        load_in_4bit=True,
        # Use double quantization
        bnb_4bit_use_double_quant=True,
        # Use 4-bit Normal Float for storing the base model weights in GPU memory
        bnb_4bit_quant_type="nf4",
        # De-quantize the weights to 16-bit (Brain) float before the forward/backward pass
        bnb_4bit_compute_dtype=torch.bfloat16,
        # This allows CPU offload.
        llm_int8_enable_fp32_cpu_offload=True,
    )

    # https://huggingface.co/docs/accelerate/en/usage_guides/big_modeling
    # device_map="auto" offloads part of the model to CPU in case it does not fit on the GPU.
    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=quantization_config,
        low_cpu_mem_usage=True,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

    # Enable gradient checkpointing to make the training more memory-efficient
    model.gradient_checkpointing_enable()
    # Set up the model for quantization-aware training, e.g. casting layers, parameter freezing, etc.
    model = prepare_model_for_kbit_training(model)

    peft_config = LoraConfig(
        task_type="CAUSAL_LM",
        # The rank of the decomposed matrices A and B to be learned during fine-tuning. A smaller number saves more GPU memory but might result in worse performance.
        r=32,
        # The coefficient for the learned ΔW factor; a larger number will typically result in a larger behavior change after fine-tuning.
        lora_alpha=64,
        # Dropout ratio for the layers in the LoRA adapters A and B.
        lora_dropout=0.1,
        # We fine-tune all linear layers in the model. It might sound like a lot, but the trainable adapter is still only **1.16%** of the whole model.
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "lm_head",
        ],
        # Bias parameters to train. "none" is recommended to keep the original model performing equally when the adapter is turned off.
        bias="none",
    )

    peft_model = get_peft_model(model, peft_config)
    peft_model.print_trainable_parameters()
    from datetime import datetime
    import transformers
    from transformers import TrainingArguments
    import mlflow

    # DeepSpeed requires a distributed environment even when only one process is used.
    # This emulates a launcher in the notebook.
    import os

    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
    os.environ["RANK"] = "0"
    os.environ["LOCAL_RANK"] = "0"
    os.environ["WORLD_SIZE"] = "1"
    os.environ["NCCL_DEBUG"] = "INFO"

    training_args = TrainingArguments(
        # Set this to mlflow for logging your training
        report_to="mlflow",
        # Name the MLflow run
        run_name=f"Mistral-7B-SQL-QLoRA-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}",
        # Replace with your output destination
        output_dir="YOUR_OUTPUT_DIR",
        # For the following arguments, refer to https://huggingface.co/docs/transformers/main_classes/trainer
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        optim="paged_adamw_8bit",
        bf16=True,
        learning_rate=2e-5,
        lr_scheduler_type="constant",
        max_steps=500,
        save_steps=100,
        logging_steps=100,
        warmup_steps=5,
        # https://discuss.huggingface.co/t/training-llama-with-lora-on-multiple-gpus-may-exist-bug/47005/3
        ddp_find_unused_parameters=False,
        deepspeed="ds_zero3_config.json",
    )

    trainer = transformers.Trainer(
        model=peft_model,
        train_dataset=tokenized_train_dataset,
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
        args=training_args,
    )
    # use_cache=True is incompatible with gradient checkpointing.
    peft_model.config.use_cache = False
    trainer.train()
notebook_launcher(train_accelerate, args=(), num_processes=1)
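Note that the TrainingArguments above reference a ds_zero3_config.json that is not shown. The exact contents depend on your hardware, but as a rough sketch (the offload and "auto" settings below are my assumptions, not from your setup), a minimal ZeRO stage-3 config could be written out like this before launching:

import json

# Hypothetical minimal ZeRO stage-3 config; "auto" values are filled in by the
# HF Trainer integration. Tune the offload settings for your hardware.
ds_zero3_config = {
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {"device": "cpu"},
        "offload_param": {"device": "cpu"},
    },
    "bf16": {"enabled": "auto"},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
}
with open("ds_zero3_config.json", "w") as f:
    json.dump(ds_zero3_config, f, indent=2)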
In the code above, your training code is wrapped in a function which is passed to notebook_launcher with the num_processes=1 argument (1 for using 1 GPU).
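If more GPUs become available later, the same pattern scales by raising num_processes. A hypothetical two-GPU launch would look like the line below, though note it requires loading the model per process rather than with device_map="auto":

notebook_launcher(train_accelerate, args=(), num_processes=2)  # one process per GPU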
Cheers!