forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbenchmark_utils.py
193 lines (144 loc) · 5.94 KB
/
benchmark_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import contextlib
import time
import os
import json
import torch
from torch.profiler import profile, ProfilerActivity
def synchronize():
    """Default device-synchronization hook: does nothing and returns ``None``.

    ``dump_chrome_trace`` rebinds this module-level name to
    ``torch.cuda.synchronize`` when it decides to run on CUDA.
    """
    return None
def dump_chrome_trace(f, input, trace_filename, optimize_ctx, activities, num_runs=1,
                      devices=None, kwargs_for_f=None, kwargs_for_profiler=None):
    """
    Time ``f(input, **kwargs_for_f)`` and write a chrome trace of it to ``trace_filename``.

    The function is warmed up with 5 calls, then timed for ``num_runs`` calls
    without the profiler, and finally run ``num_runs`` more times under
    ``torch.profiler.profile`` to produce the trace. Each phase is preceded by
    ``torch.manual_seed(1337)`` and runs inside ``optimize_ctx``.

    Args:
        f: callable invoked as ``f(input, **kwargs_for_f)``.
        input: the single positional argument forwarded to ``f``.
        trace_filename: path passed to ``prof.export_chrome_trace``.
        optimize_ctx: context manager wrapping the runs. It is entered twice
            (once for warmup/timing, once under the profiler), so it must be
            re-enterable.
        activities: activities the profiler records, e.g. ``ProfilerActivity.CUDA``.
        num_runs: number of timed calls and of profiled calls.
        devices: list of device names (default ``["cuda"]``); a bare string such
            as ``"cpu"`` is accepted and treated as a one-element list. CUDA
            synchronization is used unless this is exactly ``["cpu"]`` or CUDA
            is unavailable.
        kwargs_for_f: extra keyword arguments for ``f`` (default: none).
        kwargs_for_profiler: extra keyword arguments for ``profile`` (default: none).

    Returns:
        Wall-clock seconds (``time.perf_counter``) taken by the ``num_runs``
        un-profiled calls.
    """
    if devices is None:
        devices = ["cuda"]
    elif isinstance(devices, str):
        # A bare "cpu" would otherwise never compare equal to ["cpu"] below.
        devices = [devices]
    if kwargs_for_f is None:
        kwargs_for_f = {}
    if kwargs_for_profiler is None:
        kwargs_for_profiler = {}

    def _no_sync():
        pass

    # The module-level hook is still updated for backward compatibility, but it
    # is now chosen afresh on every call so that a CPU-only run following a CUDA
    # run does not keep using torch.cuda.synchronize.
    global synchronize
    if devices != ["cpu"] and torch.cuda.is_available():
        synchronize = torch.cuda.synchronize
    else:
        synchronize = _no_sync
    sync = synchronize

    with optimize_ctx:
        torch.manual_seed(1337)
        for _ in range(5):  # warmup runs
            f(input, **kwargs_for_f)
            sync()
        torch.manual_seed(1337)
        t0 = time.perf_counter()
        for _ in range(num_runs):
            f(input, **kwargs_for_f)
            sync()
        t1 = time.perf_counter()
    timing = t1 - t0

    with profile(activities=activities, **kwargs_for_profiler) as prof:
        with optimize_ctx:
            sync()
            torch.manual_seed(1337)
            for _ in range(num_runs):
                f(input, **kwargs_for_f)
                sync()
    prof.export_chrome_trace(trace_filename)

    return timing
def get_chrome_trace_events(filename):
    """
    Load a chrome trace JSON file and return its list of events.

    Args:
        filename: path to a JSON file containing a top-level ``"traceEvents"`` key.

    Returns:
        The value stored under ``"traceEvents"``.

    Raises:
        KeyError: if the file has no ``"traceEvents"`` entry.
    """
    # Use a context manager so the file handle is closed even if parsing fails.
    with open(filename) as f:
        data = json.load(f)
    return data["traceEvents"]
def is_gpu_compute_event(event):
    """
    Tell whether ``event`` is a complete ("X" phase) event owned by a GPU pid.

    The set of GPU pids is read from the module-level ``gpu_pids`` list.
    Events lacking a ``"pid"`` or ``"ph"`` field are never GPU compute events.
    """
    if "pid" not in event or "ph" not in event:
        return False
    return event["pid"] in gpu_pids and event["ph"] == "X"
def get_sorted_gpu_events(events):
    """Return the GPU compute events found in ``events``, ordered by their ``"ts"`` field."""
    gpu_only = [ev for ev in events if is_gpu_compute_event(ev)]
    gpu_only.sort(key=lambda ev: ev["ts"])
    return gpu_only
def get_duration(sorted_gpu_events):
    """
    Sum the time covered by the given events, counting overlapping spans once.

    Args:
        sorted_gpu_events: events ordered by ``"ts"``, each carrying ``"ts"``
            (start) and ``"dur"`` (length) in the same unit.

    Returns:
        Total covered time in that unit; ``0`` for an empty input.
    """
    if not sorted_gpu_events:
        return 0

    first, *rest = sorted_gpu_events
    busy = first["dur"]
    covered_until = first["ts"] + first["dur"]

    for ev in rest:
        begin = ev["ts"]
        finish = begin + ev["dur"]
        # Only the part of this event past the already-covered region is new.
        gain = finish - max(begin, covered_until)
        if gain > 0:
            busy = busy + gain
        if finish > covered_until:
            covered_until = finish
    return busy
def get_sorted_gpu_mm_conv_events(events):
    """
    Return the time-sorted GPU compute events whose name marks them as matmul/conv work.

    An event qualifies when it has a ``"name"`` containing any of the
    substrings "gemm", "conv", "cutlass" or "wgrad".
    """
    kernel_markers = ("gemm", "conv", "cutlass", "wgrad")
    return [
        ev
        for ev in get_sorted_gpu_events(events)
        if "name" in ev and any(marker in ev["name"] for marker in kernel_markers)
    ]
# Module-level list of pids labelled as GPU processes in the trace being
# analysed. It is reset and repopulated by compute_utilization() and read by
# is_gpu_compute_event().
gpu_pids = []
def compute_utilization(filename: str, total_length: float):
    """
    Compute GPU utilization figures from a chrome trace written by the PyTorch profiler.

    The module-level ``gpu_pids`` list is rebuilt from the trace's
    ``process_labels`` events whose labels mention "GPU"; the GPU-busy time is
    then divided by the un-profiled run time.

    Args:
        filename(str): Name of the chrome trace file produced by the PyTorch profiler
        total_length(float): total length of the process without the profiler, in seconds

    Return:
        tuple: (GPU utilization, fraction of time spent on matmul and convolution)
    """
    trace_events = get_chrome_trace_events(filename)

    # Collect the pids of GPU processes; the helpers below read this global.
    global gpu_pids
    gpu_pids = []
    gpu_pids.extend(
        ev["pid"]
        for ev in trace_events
        if "name" in ev
        and ev["name"] == 'process_labels'
        and "GPU" in ev["args"]["labels"]
    )

    # Seconds scaled by 1e6 so the run time matches the unit of the trace durations.
    window = total_length * 1e6

    all_gpu = get_sorted_gpu_events(trace_events)
    utilization = get_duration(all_gpu) / window

    mm_conv = get_sorted_gpu_mm_conv_events(trace_events)
    mm_conv_utilization = get_duration(mm_conv) / window

    return utilization, mm_conv_utilization
def benchmark_utilization(f, input, trace_folder, optimize_ctx=None, trace_file_name="tmp_chrome_trace", num_runs=1):
    """
    Benchmark the GPU utilization and the fraction of time spent on matmul and
    convolution operations when running f(input) with [optimize_ctx] [num_runs] times.

    It will produce a chrome trace file in trace_folder/trace_file_name.json

    Example:

    ```
    def f(a):
        return a.sum()
    a = torch.rand(2**20, device="cuda")
    utilization, mm_conv_utilization = benchmark_utilization(f, a, "tmp", trace_file_name = "tmp_chrome_trace")
    ```

    Args:
        f: function to benchmark
        input: input to :attr:`f`
        trace_folder: name of the folder to store the chrome trace; created if missing
        optimize_ctx: the context in which f will run; defaults to a no-op context
        trace_file_name: name of the dumped chrome trace file, default to "tmp_chrome_trace"
        num_runs: number of times to run f, excluding the warm-up runs, default to 1.

    Return:
        tuple: (GPU utilization, fraction of time spent on matmul and convolution)
    """
    if not os.path.exists(trace_folder):
        # exist_ok avoids a failure if the folder appears between the check and
        # the creation (e.g. several benchmarks started concurrently).
        os.makedirs(trace_folder, exist_ok=True)
        print("create folder " + trace_folder)

    if optimize_ctx is None:
        optimize_ctx = contextlib.nullcontext()

    chrome_trace_file_name = os.path.join(trace_folder, trace_file_name + ".json")
    # devices is passed as a list, matching dump_chrome_trace's own default
    # (["cuda"]) and its comparison against ["cpu"].
    total_length = dump_chrome_trace(f, input, chrome_trace_file_name, optimize_ctx,
                                     [ProfilerActivity.CUDA], num_runs=num_runs, devices=["cuda"])
    utilization, mm_conv_utilization = compute_utilization(chrome_trace_file_name, total_length)

    return utilization, mm_conv_utilization