I tried to implement model parallelism myself, and based on the GPU behaviour the performance looks similar to what I get with Hugging Face's device_map="auto". So I'm wondering: what is the difference (apart from the more advanced techniques they use)? Maybe I will have to look at their implementation.
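For comparison, this is roughly how I load the model with the built-in device map (the checkpoint name is just a placeholder). As far as I understand, Accelerate computes a device map and dispatches the modules with hooks that move inputs between GPUs, which is why it looks similar to my manual split:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"  # placeholder, use your own checkpoint

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",  # Accelerate spreads the layers across the available GPUs
)
print(model.hf_device_map)  # shows which module ended up on which device

And here is my own split, below: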
import torch
import torch.nn as nn


class SplitModel(nn.Module):
    """Naive pipeline-style split: contiguous blocks of decoder layers on consecutive GPUs."""

    def __init__(self, original_model):
        super().__init__()
        num_gpus = torch.cuda.device_count()
        total_layers = len(original_model.model.layers)
        layers_per_gpu = total_layers // num_gpus
        self.layer_to_device = {}

        # Embeddings live on the first GPU.
        self.embed_tokens = original_model.model.embed_tokens.to('cuda:0')
        self.layer_to_device[self.embed_tokens] = 'cuda:0'

        # Distribute the decoder layers across GPUs in contiguous blocks.
        self.layers = nn.ModuleList()
        for i, layer in enumerate(original_model.model.layers):
            gpu_id = min(i // layers_per_gpu, num_gpus - 1)
            assigned_gpu = f'cuda:{gpu_id}'
            self.layers.append(layer.to(assigned_gpu))
            self.layer_to_device[layer] = assigned_gpu

        # Assign norm and lm_head to the last GPU.
        last_gpu = f'cuda:{num_gpus - 1}'
        self.norm = original_model.model.norm.to(last_gpu)
        self.lm_head = original_model.lm_head.to(last_gpu)
        self.layer_to_device[self.norm] = last_gpu
        self.layer_to_device[self.lm_head] = last_gpu

    def forward(self, x):
        # Move the token ids to the embedding device, then hand the hidden
        # states from GPU to GPU as we walk through the layers.
        x = x.to(self.layer_to_device[self.embed_tokens])
        x = self.embed_tokens(x)
        for layer in self.layers:
            if isinstance(x, tuple):
                x = x[0]
            x = x.to(self.layer_to_device[layer])
            x = layer(x)
        if isinstance(x, tuple):
            x = x[0]
        x = x.to(self.layer_to_device[self.norm])
        x = self.norm(x)
        x = x.to(self.layer_to_device[self.lm_head])
        x = self.lm_head(x)
        return x
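And this is roughly how I call it (again a placeholder checkpoint name; I'm assuming a LLaMA-style model whose decoder layers accept just the hidden states, as in the forward above, and I only run a plain forward pass since the wrapper doesn't support generate()):

# Rough usage sketch under the assumptions stated above.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"  # placeholder
base = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
split_model = SplitModel(base).eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
with torch.no_grad():
    logits = split_model(inputs["input_ids"])  # (batch, seq_len, vocab_size)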
huggingface, split LLM into multiple GPUs