I have 8 NVIDIA GPUs with 80 GB of memory each, and I am training a 70B Llama model. The model and its optimizer states do not fit on the GPUs together, so I moved the optimizer states from GPU to CPU and only move the states needed for the current parameter update back to the GPU, like below:
@torch.no_grad()
def step(self, closure: Callable = None):
    """
    Performs a single optimization step.

    Arguments:
        closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        for p in group["params"]:
            if p.grad is None:
                continue
            grad = p.grad
            if grad.is_sparse:
                raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead")

            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state["step"] = 0
                # Exponential moving average of gradient values
                state["exp_avg"] = torch.zeros_like(p)
                # Exponential moving average of squared gradient values
                state["exp_avg_sq"] = torch.zeros_like(p)

            exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]

            # modified1: only move the states corresponding to the parameter being updated onto the GPU
            if exp_avg.device != p.device:
                exp_avg = exp_avg.to(p.device)
                exp_avg = exp_avg.to(p.dtype)
            if exp_avg_sq.device != p.device:
                exp_avg_sq = exp_avg_sq.to(p.device)
                exp_avg_sq = exp_avg_sq.to(p.dtype)

            beta1, beta2 = group["betas"]
            state["step"] += 1

            # Decay the first and second moment running average coefficient
            # In-place operations to update the averages at the same time
            exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1))
            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
            denom = exp_avg_sq.sqrt().add_(group["eps"])

            step_size = group["lr"]
            if group["correct_bias"]:  # No bias correction for Bert
                bias_correction1 = 1.0 - beta1 ** state["step"]
                bias_correction2 = 1.0 - beta2 ** state["step"]
                step_size = step_size * math.sqrt(bias_correction2) / bias_correction1

            p.addcdiv_(exp_avg, denom, value=-step_size)

            if group["weight_decay"] > 0.0:
                p.add_(p, alpha=(-group["lr"] * group["weight_decay"]))

            # modified2: after the update, move the optimizer state back to the CPU to free GPU memory
            state["exp_avg"] = exp_avg.to('cpu')
            state["exp_avg_sq"] = exp_avg_sq.to('cpu')

    return loss
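For context, this step() overrides the one in transformers.AdamW. Roughly, the wiring looks like the sketch below; the class name CPUOffloadAdamW, the toy module, and the small loop are only illustrative (the real run trains the 70B Llama model), but they show how the modified optimizer is constructed and driven:

import math  # used inside the step() above for the bias correction
from typing import Callable

import torch
from torch import nn
from transformers import AdamW  # base class whose step() is overridden above


class CPUOffloadAdamW(AdamW):
    """AdamW that keeps exp_avg / exp_avg_sq on the CPU between steps."""

    @torch.no_grad()
    def step(self, closure: Callable = None):
        ...  # the modified step() shown above goes here


# Toy usage: parameters live on the GPU, optimizer states live on the CPU except
# while the corresponding parameter is being updated.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Linear(16, 16).to(device)
optimizer = CPUOffloadAdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

for _ in range(3):
    x = torch.randn(4, 16, device=device)
    loss = model(x).pow(2).mean()
    loss.backward()
    optimizer.step()       # moves each parameter's states to the GPU, updates, moves them back
    optimizer.zero_grad()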
I have marked the modified lines above with #modified. The logic is simple, but the training loss is confusing me:
{'loss': 1.4473, 'learning_rate': 1.9993007047883988e-05, 'epoch': 0.05}
{'loss': 1.3078, 'learning_rate': 1.9972037971811802e-05, 'epoch': 0.09}
{'loss': 1.2186, 'learning_rate': 1.9937122098932428e-05, 'epoch': 0.14}
{'loss': 0.9871, 'learning_rate': 1.9888308262251286e-05, 'epoch': 0.19}
{'loss': 0.9528, 'learning_rate': 1.9825664732332886e-05, 'epoch': 0.23}
{'loss': 0.8264, 'learning_rate': 1.9749279121818235e-05, 'epoch': 0.28}
{'loss': 0.8139, 'learning_rate': 1.9659258262890683e-05, 'epoch': 0.33}
{'loss': 0.802, 'learning_rate': 1.955572805786141e-05, 'epoch': 0.38}
{'loss': 0.7674, 'learning_rate': 1.9438833303083677e-05, 'epoch': 0.42}
{'loss': 0.8438, 'learning_rate': 1.9308737486442045e-05, 'epoch': 0.47}
{'loss': 0.8202, 'learning_rate': 1.9165622558699763e-05, 'epoch': 0.52}
{'loss': 0.8353, 'learning_rate': 1.900968867902419e-05, 'epoch': 0.56}
{'loss': 0.8675, 'learning_rate': 1.8841153935046098e-05, 'epoch': 0.61}
{'loss': 0.8664, 'learning_rate': 1.866025403784439e-05, 'epoch': 0.66}
{'loss': 0.9016, 'learning_rate': 1.8467241992282842e-05, 'epoch': 0.7}
{'loss': 0.8926, 'learning_rate': 1.826238774315995e-05, 'epoch': 0.75}
{'loss': 0.7756, 'learning_rate': 1.8045977797666685e-05, 'epoch': 0.8}
{'loss': 0.6959, 'learning_rate': 1.78183148246803e-05, 'epoch': 0.84}
{'loss': 1.2584, 'learning_rate': 1.757971723145453e-05, 'epoch': 0.89}
The loss fluctuates a lot. The optimizer I modified is transformers.AdamW. Can someone tell me why the loss is so unstable? Is my modification wrong somewhere?
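To make "wrong somewhere" concrete: on a small model, the offloaded step should produce essentially the same parameters as the stock transformers.AdamW. A minimal sketch of that comparison (the toy model, seed, and tolerance are just placeholders; CPUOffloadAdamW is the subclass from the sketch above):

import copy

import torch
from torch import nn
from transformers import AdamW

# Two identical toy models: one updated by the stock AdamW, one by the offloaded version.
torch.manual_seed(0)
ref_model = nn.Linear(32, 32).cuda()
off_model = copy.deepcopy(ref_model)

ref_opt = AdamW(ref_model.parameters(), lr=2e-5)
off_opt = CPUOffloadAdamW(off_model.parameters(), lr=2e-5)  # subclass carrying the step() above

for _ in range(10):
    x = torch.randn(8, 32, device="cuda")
    for model, opt in ((ref_model, ref_opt), (off_model, off_opt)):
        opt.zero_grad()
        model(x).pow(2).mean().backward()
        opt.step()

# If the offloading is equivalent to the stock update, parameters should agree to float tolerance.
for p_ref, p_off in zip(ref_model.parameters(), off_model.parameters()):
    print(torch.allclose(p_ref, p_off, atol=1e-6))

Note that this toy comparison uses fp32 parameters, so it would not exercise the p.dtype casts in the modified step.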