test_fused_rms_norm.py
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
from torch.distributed._tensor import (
    distribute_tensor,
    init_device_mesh,
    Replicate,
    Shard,
)
from torch.distributed._tensor.debug import CommDebugMode
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.distributed._tensor.common_dtensor import (
    DTensorTestBase,
    skip_if_lt_x_gpu,
    with_comms,
)

from torchtitan.models.norms import fused_rms_norm_fn
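
# This test exercises the fused RMSNorm op from torchtitan.models.norms on both
# plain Tensors and DTensors: with the activation sharded on dim 1 and the weight
# replicated, the fused op is expected to run without any collective communication
# (checked via CommDebugMode) and to match the plain-Tensor result exactly.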


class TestFusedRMSNorm(DTensorTestBase):
    @property
    def world_size(self):
        return 4

    @skip_if_lt_x_gpu(4)
    @with_comms
    def test_fused_rms_norm(self):
        mesh = init_device_mesh(
            device_type=self.device_type, mesh_shape=(self.world_size,)
        )
        x = torch.randn(4, 4, 4, device=self.device_type)  # Shard(1)
        w = torch.randn(4, device=self.device_type, requires_grad=True)  # Replicate

        dist_x = distribute_tensor(x, mesh, [Shard(1)])
        dist_w = distribute_tensor(w, mesh, [Replicate()])

        # Keep local copies so the DTensor result can be compared against a
        # plain-Tensor reference below.
        x = x.clone().detach()
        w = w.clone().detach().requires_grad_()
        self.assertEqual(dist_x.full_tensor(), x)
        self.assertEqual(dist_w.full_tensor(), w)

        # fused rmsnorm on DTensor: the forward pass should issue no collectives,
        # since the activation is sharded on a dim that is not normalized over
        # and the weight is replicated.
        comm_mode = CommDebugMode()
        with comm_mode:
            dist_out = fused_rms_norm_fn(dist_x, dist_w)
        self.assertEqual(comm_mode.get_total_counts(), 0)

        # The backward pass should likewise trigger no communication.
        with comm_mode:
            dist_grad_out = torch.ones_like(dist_out)
            dist_out.backward(dist_grad_out)
        self.assertEqual(comm_mode.get_total_counts(), 0)

        # fused rmsnorm on plain Tensor, as the reference for the DTensor result
        out = fused_rms_norm_fn(x, w)
        grad_out = torch.ones_like(out)
        out.backward(grad_out)

        self.assertEqual(dist_out.full_tensor(), out)
        self.assertEqual(dist_grad_out.full_tensor(), grad_out)
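
# NOTE: DTensorTestBase spawns `world_size` worker processes per test, so this
# file can be run directly (the test is skipped unless at least 4 GPUs are
# visible), e.g.:
#   python test_fused_rms_norm.py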

if __name__ == "__main__":
    run_tests()