Motif-Technologies
/

optimizer

fix(optimizer): resolve bug where weight decay was multiplied by the Muon-adjusted lr instead of the base lr

by dongseokmotif - opened Aug 27

←

Files changed (1) hide show

torch-ext/optimizer/muon.py CHANGED Viewed

@@ -104,7 +104,7 @@ def _compute_u(state, steps, rank, compute_stream):
 @torch.no_grad()
-def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
     with torch.cuda.stream(comm_stream):
@@ -133,7 +133,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
             device_mesh=p.device_mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
-        p.data.add_(u, alpha=-lr)
 def default_is_muon(x, name):
@@ -387,7 +387,7 @@ class Muon(torch.optim.Optimizer):
                 state = param_to_state[id(p)]
                 adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
                 _scatter(
-                    p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
         chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)

 @torch.no_grad()
+def _scatter(p, state, lr, adjusted_lr, weight_decay, rank, comm_stream):
     u = state.computed_u
     with torch.cuda.stream(comm_stream):
             device_mesh=p.device_mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
+        p.data.add_(u, alpha=-adjusted_lr)
 def default_is_muon(x, name):
                 state = param_to_state[id(p)]
                 adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
                 _scatter(
+                    p, state, lr, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
         chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)