I'm fine-tuning a HuggingFace model on my own data using AzureML. The dataset is only 1.7 MB, and even after splitting it into batches of 5 examples I still get the same out-of-memory errors. The VM I specify has 28 GB of RAM and 16 GB of GPU RAM. I admit I don't know how to calculate the model's memory requirements.
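For what it's worth, the back-of-the-envelope calculation I've seen suggested for full fp32 training with Adam (and I'm not sure it's right) is weights + gradients + optimizer states, i.e. roughly 16 bytes per parameter before activations:

# Rough estimate only - full fp32 training with AdamW, ignoring activations,
# which grow with batch size and sequence length.
n_params = 350e6                # Salesforce/codegen-350M-mono
bytes_per_param = 4 + 4 + 8     # weights + gradients + Adam moments
print(f"~{n_params * bytes_per_param / 1024**3:.1f} GiB before activations")  # ~5.2 GiB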
Here's the script I use to launch the job:
from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Environment
from azureml.core.runconfig import DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import ScriptRunConfig
# create workspace
ws = Workspace.get(name='', subscription_id='', resource_group='')
# create experiment
experiment_name = 'train-with-data'
experiment = Experiment(workspace = ws, name = experiment_name)
# Create environment
myenv = Environment("myenv")
conda = CondaDependencies()
conda.add_pip_package('transformers[torch]')
conda.add_pip_package('datasets')
myenv.python.conda_dependencies = conda
myenv.register(ws)
docker_config = DockerConfiguration(use_docker=True)
# Provision compute target
gpu_cluster_name = "gpu-nc4-cluster"
# Use the existing cluster if it is already provisioned, otherwise create it
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=gpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC4AS_T4_V3',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, compute_config)
    cpu_cluster.wait_for_completion(show_output=True)
src = ScriptRunConfig(source_directory='../src',
                      script='main.py',
                      compute_target=cpu_cluster,
                      environment=myenv,
                      arguments=['dataset_batch_5.pkl'],
                      docker_runtime_config=docker_config)
run = experiment.submit(config=src)
run
And here are the contents of main.py:
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
import pickle
import sys
import os
def main():
    # Load the pickled dataset (a dict with "train" and "test" splits)
    with open(sys.argv[1], "rb") as f:
        input_data = pickle.load(f)
    print(input_data)

    # I tried this but it didn't make a difference
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

    checkpoint = "Salesforce/codegen-350M-mono"
    device = "cuda"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).to(device)

    # Data collator - assembles data into batches for training
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments("/trainer")
    trainer = Trainer(
        model,
        training_args,
        train_dataset=input_data["train"],
        eval_dataset=input_data["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()
    trainer.save_model()
    return "Done"

if __name__ == "__main__":
    main()
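One thing I'm not sure about: as far as I can tell the Trainer builds its own batches from TrainingArguments (per_device_train_batch_size defaults to 8), so my pre-batched pickle may not actually control the batch size on the GPU. If that's the case, I assume the relevant knobs would look something like this (parameter names taken from the transformers docs, untested by me):

# Untested sketch: control GPU batch size / memory through TrainingArguments
# instead of pre-batching the pickled dataset.
training_args = TrainingArguments(
    "/trainer",
    per_device_train_batch_size=1,    # batch size actually used on the GPU
    gradient_accumulation_steps=8,    # keep a larger effective batch size
    fp16=True,                        # half-precision training
    gradient_checkpointing=True,      # trade compute for activation memory
)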
The OOM error appears almost immediately after trainer.train() starts:
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 15.75 GiB total capacity; 14.74 GiB already allocated; 11.62 MiB free; 15.06 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
I did try changing the cuda alloc setting (in main.py), but that didn't make a difference.
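In case the variable has to be in place before the Python process starts, I assume it could also be set on the AzureML environment in the launch script rather than inside main.py (untested):

# Untested idea: put the allocator config on the AzureML environment so it is
# exported before main.py runs, instead of setting os.environ inside the script.
myenv.environment_variables = {"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:64"}
myenv.register(ws)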
Note that I tried running this simple script as a test and it executed successfully:
import torch
foo = torch.tensor([1,2,3])
foo = foo.to('cuda')
I also tried loading main.py into a notebook and running it from within AzureML, but I ran into problems with the outdated transformers library in AzureML's pre-loaded environment.
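(I assume I could have upgraded the packages in the notebook kernel with something like the line below, but I haven't verified that it plays nicely with the pre-loaded environment:)

%pip install --upgrade "transformers[torch]" datasets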
I also thought of running top in a terminal while the job is running, but didn't see that option in the AzureML GUI.
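As a workaround I was considering logging GPU memory from inside main.py right before trainer.train(), something along these lines (the helper name is mine, untested):

import torch

def log_gpu_memory(tag):
    # Print how much GPU memory PyTorch has allocated / reserved so far.
    allocated = torch.cuda.memory_allocated() / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    print(f"[{tag}] allocated={allocated:.2f} GiB, reserved={reserved:.2f} GiB")

# e.g. call right after model.to(device) and again just before trainer.train()
log_gpu_memory("after model load")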