What's the difference between the Hugging Face implementation of device_map="auto" and this script?


I tried to implement model parallelism, and it performs similarly (judging by GPU behaviour) to Hugging Face's device_map="auto", so I wonder what the difference is (apart from the more advanced techniques they use)? Maybe I will have to look at their implementation.
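
For context, the Hugging Face path I'm comparing against is roughly the standard device_map="auto" load (the checkpoint name here is just a placeholder):

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder; any decoder-only checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" lets accelerate place the weights across the visible GPUs
hf_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")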

import torch
import torch.nn as nn

class SplitModel(nn.Module):
    def __init__(self, original_model):
        super().__init__()
        # Split the decoder layers evenly across all visible GPUs
        num_gpus = torch.cuda.device_count()
        total_layers = len(original_model.model.layers)
        layers_per_gpu = total_layers // num_gpus

        # Map each module to its device so forward() knows where to move activations
        self.layer_to_device = {}
        self.embed_tokens = original_model.model.embed_tokens.to('cuda:0')
        self.layer_to_device[self.embed_tokens] = 'cuda:0'

        self.layers = nn.ModuleList()
        for i, layer in enumerate(original_model.model.layers):
            gpu_id = min(i // layers_per_gpu, num_gpus - 1)  # Distribute layers across GPUs
            assigned_gpu = f'cuda:{gpu_id}'
            self.layers.append(layer.to(assigned_gpu))
            self.layer_to_device[layer] = assigned_gpu

        # Assign norm and lm_head to the last GPU
        last_gpu = f'cuda:{num_gpus - 1}'
        self.norm = original_model.model.norm.to(last_gpu)
        self.lm_head = original_model.lm_head.to(last_gpu)
        self.layer_to_device[self.norm] = last_gpu
        self.layer_to_device[self.lm_head] = last_gpu

    def forward(self, x):
        # Move the input ids to the embedding's GPU before the first lookup
        x = x.to(self.layer_to_device[self.embed_tokens])
        x = self.embed_tokens(x)
        for layer in self.layers:
            # Decoder layers may return a tuple; keep only the hidden states
            if isinstance(x, tuple):
                x = x[0]
            # Move activations to whichever GPU holds this layer
            x = x.to(self.layer_to_device[layer])
            x = layer(x)
        if isinstance(x, tuple):
            x = x[0]
        x = x.to(self.layer_to_device[self.norm])
        x = self.norm(x)
        x = x.to(self.layer_to_device[self.lm_head])
        x = self.lm_head(x)
        return x
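
And this is roughly how I drive the split model (again, the checkpoint name is only a placeholder; note the decoder layers here receive only hidden states, no attention mask or position ids):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_id)
base_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
split_model = SplitModel(base_model)

inputs = tokenizer("Hello, world", return_tensors="pt")
with torch.no_grad():
    logits = split_model(inputs["input_ids"])  # (batch, seq_len, vocab_size)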
