from torch.profiler import profile, record_function, ProfilerActivity
# Profile a single forward pass, collecting both CPU and CUDA activity.
# NOTE(review): `model` and `inputs` are defined outside this excerpt;
# presumably a CUDA-resident module and a matching input batch - confirm.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
) as prof:
    # Label the region so it is reported as its own "model_inference" row.
    with record_function("model_inference"):
        model(inputs)

# Summarise after the profiler context has closed: the ten entries with the
# largest total CUDA time.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
-------------------------------------------------------  ----------  ---------  -----------  ---------  ------------  ---------  -----------  ----------  -------------  ----------
Name                                                     Self CPU %   Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
-------------------------------------------------------  ----------  ---------  -----------  ---------  ------------  ---------  -----------  ----------  -------------  ----------
model_inference                                              10.84%    6.598ms       55.50%   33.792ms      33.792ms    0.000us        0.00%    57.264ms       57.264ms           1
aten::conv2d                                                  0.63%  381.000us       37.83%   23.032ms     182.794us    0.000us        0.00%    53.513ms      424.706us         126
aten::convolution                                             0.49%  301.000us       37.20%   22.651ms     179.770us    0.000us        0.00%    53.513ms      424.706us         126
aten::_convolution                                            2.24%    1.366ms       36.71%   22.350ms     177.381us    0.000us        0.00%    53.513ms      424.706us         126
aten::cudnn_convolution                                      24.91%   15.165ms       29.84%   18.169ms     144.198us   50.862ms       88.82%    50.862ms      403.667us         126
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148...       0.00%    0.000us        0.00%    0.000us       0.000us   24.387ms       42.59%    24.387ms      554.250us          44
volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_small_nh...       0.00%    0.000us        0.00%    0.000us       0.000us    4.449ms        7.77%     4.449ms        1.112ms           4
volta_scudnn_128x32_relu_medium_nn_v1                         0.00%    0.000us        0.00%    0.000us       0.000us    4.185ms        7.31%     4.185ms      119.571us          35
volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_interior...       0.00%    0.000us        0.00%    0.000us       0.000us    3.614ms        6.31%     3.614ms      190.211us          19
volta_scudnn_128x128_relu_small_nn_v1                         0.00%    0.000us        0.00%    0.000us       0.000us    2.959ms        5.17%     2.959ms        1.480ms           2
-------------------------------------------------------  ----------  ---------  -----------  ---------  ------------  ---------  -----------  ----------  -------------  ----------
Self CPU time total: 60.885ms
Self CUDA time total: 57.264ms