From 6bd20f48e852c0c38233bf4b3a42d02364aad477 Mon Sep 17 00:00:00 2001 From: Alexander Wong Date: Sun, 5 Apr 2020 11:39:26 -0600 Subject: [PATCH 1/2] when displaying events, show occurrences of call --- torchprof/profile.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/torchprof/profile.py b/torchprof/profile.py index c7f6c70..319f8ea 100644 --- a/torchprof/profile.py +++ b/torchprof/profile.py @@ -3,7 +3,7 @@ from collections import namedtuple, defaultdict, OrderedDict Trace = namedtuple("Trace", ["path", "leaf", "module"]) -Measure = namedtuple("Measure", ["self_cpu_total", "cpu_total", "cuda_total"]) +Measure = namedtuple("Measure", ["self_cpu_total", "cpu_total", "cuda_total", "occurrences"]) def walk_modules(module, name="", path=()): @@ -133,9 +133,10 @@ def traces_to_display(traces, trace_events, show_events=False, paths=None): for event in events: current_tree[name][event.name] = { None: Measure( - event.self_cpu_time_total, - event.cpu_time_total, - event.cuda_time_total, + sum([e.self_cpu_time_total for e in events if e.name == event.name]), + sum([e.cpu_time_total for e in events if e.name == event.name]), + sum([e.cuda_time_total for e in events if e.name == event.name]), + len([e for e in events if e.name == event.name]) ) } else: @@ -143,6 +144,7 @@ def traces_to_display(traces, trace_events, show_events=False, paths=None): sum([e.self_cpu_time_total for e in events]), sum([e.cpu_time_total for e in events]), sum([e.cuda_time_total for e in events]), + len(trace_events[path]) ) current_tree = current_tree[name] tree_lines = flatten_tree(tree) @@ -155,10 +157,12 @@ def traces_to_display(traces, trace_events, show_events=False, paths=None): self_cpu_time = "" cpu_time = "" cuda_time = "" + occurrences = "" if measures: self_cpu_time = tprofiler.format_time(measures.self_cpu_total) cpu_time = tprofiler.format_time(measures.cpu_total) cuda_time = tprofiler.format_time(measures.cuda_total) + occurrences = str(measures.occurrences) pre = "" next_depths = [pl[0] for pl in tree_lines[idx + 1 :]] current = True @@ -175,22 +179,24 @@ def traces_to_display(traces, trace_events, show_events=False, paths=None): pre = dt[3] + pre depth -= 1 current = False - format_lines.append([pre + name, self_cpu_time, cpu_time, cuda_time]) + format_lines.append([pre + name, self_cpu_time, cpu_time, cuda_time, occurrences]) # construct the table - heading = ("Module", "Self CPU total", "CPU total", "CUDA total") + heading = ("Module", "Self CPU total", "CPU total", "CUDA total", "Occurrences") max_lens = [max(map(len, col)) for col in zip(*([heading] + format_lines))] # create the heading disp = "{:<{}s}".format(heading[0], max_lens[0]) + " | " disp += "{:>{}s}".format(heading[1], max_lens[1]) + " | " disp += "{:>{}s}".format(heading[2], max_lens[2]) + " | " - disp += "{:>{}s}".format(heading[3], max_lens[3]) + "\n" + disp += "{:>{}s}".format(heading[3], max_lens[3]) + " | " + disp += "{:>{}s}".format(heading[4], max_lens[4]) + "\n" disp += "-|-".join(["-" * mlen for mlen in max_lens]) + "\n" for line in format_lines: - label, self_cpu_time, cpu_time, cuda_time = line + label, self_cpu_time, cpu_time, cuda_time, occurrences = line disp += "{:<{}s}".format(label, max_lens[0]) + " | " disp += "{:>{}s}".format(self_cpu_time, max_lens[1]) + " | " disp += "{:>{}s}".format(cpu_time, max_lens[2]) + " | " - disp += "{:>{}s}".format(cuda_time, max_lens[3]) + "\n" + disp += "{:>{}s}".format(cuda_time, max_lens[3]) + " | " + disp += "{:>{}s}".format(occurrences, max_lens[4]) + "\n" return disp From 64d3d637f724ac219fbd1eafb31f786f65f98073 Mon Sep 17 00:00:00 2001 From: Alexander Wong Date: Sun, 5 Apr 2020 11:58:52 -0600 Subject: [PATCH 2/2] updated documentation to show Occurrences of layer --- README.md | 157 +++++++++++++++++++++++++++--------------------------- 1 file changed, 78 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 57df03a..00528e8 100644 --- a/README.md +++ b/README.md @@ -24,53 +24,52 @@ with torchprof.Profile(model, use_cuda=True) as prof: print(prof.display(show_events=False)) # equivalent to `print(prof)` and `print(prof.display())` ``` ```text -Module | Self CPU total | CPU total | CUDA total ----------------|----------------|-----------|----------- -AlexNet | | | -├── features | | | -│├── 0 | 1.956ms | 7.714ms | 7.787ms -│├── 1 | 68.880us | 68.880us | 69.632us -│├── 2 | 85.639us | 155.948us | 155.648us -│├── 3 | 253.419us | 970.386us | 1.747ms -│├── 4 | 18.919us | 18.919us | 19.584us -│├── 5 | 30.910us | 54.900us | 55.296us -│├── 6 | 132.839us | 492.367us | 652.192us -│├── 7 | 17.990us | 17.990us | 18.432us -│├── 8 | 87.219us | 310.776us | 552.544us -│├── 9 | 17.620us | 17.620us | 17.536us -│├── 10 | 85.690us | 303.120us | 437.248us -│├── 11 | 17.910us | 17.910us | 18.400us -│└── 12 | 29.239us | 51.488us | 52.288us -├── avgpool | 49.230us | 85.740us | 88.960us -└── classifier | | | - ├── 0 | 626.236us | 1.239ms | 1.362ms - ├── 1 | 235.669us | 235.669us | 635.008us - ├── 2 | 17.990us | 17.990us | 18.432us - ├── 3 | 31.890us | 56.770us | 57.344us - ├── 4 | 39.280us | 39.280us | 212.128us - ├── 5 | 16.800us | 16.800us | 17.600us - └── 6 | 38.459us | 38.459us | 79.872us +Module | Self CPU total | CPU total | CUDA total | Occurrences +---------------|----------------|-----------|------------|------------ +AlexNet | | | | +├── features | | | | +│├── 0 | 1.671ms | 6.589ms | 6.701ms | 1 +│├── 1 | 62.430us | 62.430us | 63.264us | 1 +│├── 2 | 62.909us | 109.948us | 112.640us | 1 +│├── 3 | 225.389us | 858.376us | 1.814ms | 1 +│├── 4 | 18.999us | 18.999us | 19.456us | 1 +│├── 5 | 29.560us | 52.720us | 54.272us | 1 +│├── 6 | 136.959us | 511.216us | 707.360us | 1 +│├── 7 | 18.480us | 18.480us | 18.624us | 1 +│├── 8 | 84.380us | 300.700us | 590.688us | 1 +│├── 9 | 18.249us | 18.249us | 17.632us | 1 +│├── 10 | 81.289us | 289.946us | 470.016us | 1 +│├── 11 | 17.850us | 17.850us | 18.432us | 1 +│└── 12 | 29.350us | 52.260us | 52.288us | 1 +├── avgpool | 41.840us | 70.840us | 76.832us | 1 +└── classifier | | | | + ├── 0 | 66.400us | 122.110us | 125.920us | 1 + ├── 1 | 293.658us | 293.658us | 664.704us | 1 + ├── 2 | 17.600us | 17.600us | 18.432us | 1 + ├── 3 | 27.920us | 49.030us | 51.168us | 1 + ├── 4 | 40.590us | 40.590us | 208.672us | 1 + ├── 5 | 17.570us | 17.570us | 18.432us | 1 + └── 6 | 40.489us | 40.489us | 81.920us | 1 ``` To see the low level operations that occur within each layer, print the contents of `prof.display(show_events=True)`. ```text -Module | Self CPU total | CPU total | CUDA total -------------------------------|----------------|-----------|----------- -AlexNet | | | -├── features | | | -│├── 0 | | | -││├── conv2d | 15.740us | 1.956ms | 1.972ms -││├── convolution | 12.000us | 1.940ms | 1.957ms -││├── _convolution | 36.590us | 1.928ms | 1.946ms -││├── contiguous | 6.600us | 6.600us | 6.464us -││└── cudnn_convolution | 1.885ms | 1.885ms | 1.906ms -│├── 1 | | | -││└── relu_ | 68.880us | 68.880us | 69.632us -│├── 2 | | | -││├── max_pool2d | 15.330us | 85.639us | 84.992us -││└── max_pool2d_with_indices | 70.309us | 70.309us | 70.656us -│├── 3 | | | +Module | Self CPU total | CPU total | CUDA total | Occurrences +------------------------------|----------------|-----------|------------|------------ +AlexNet | | | | +├── features | | | | +│├── 0 | | | | +││├── conv2d | 13.370us | 1.671ms | 1.698ms | 1 +││├── convolution | 12.730us | 1.658ms | 1.685ms | 1 +││├── _convolution | 30.660us | 1.645ms | 1.673ms | 1 +││├── contiguous | 6.970us | 6.970us | 7.136us | 1 +││└── cudnn_convolution | 1.608ms | 1.608ms | 1.638ms | 1 +│├── 1 | | | | +││└── relu_ | 62.430us | 62.430us | 63.264us | 1 +│├── 2 | | | | +││├── max_pool2d | 15.870us | 62.909us | 63.488us | 1 +││└── max_pool2d_with_indices | 47.039us | 47.039us | 49.152us | 1 ... ``` @@ -85,17 +84,17 @@ print(trace[2]) print(event_lists_dict[trace[2].path][0]) ``` ```text ---------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg CUDA total % CUDA total CUDA time avg Number of Calls ---------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -conv2d 0.80% 15.740us 100.00% 1.956ms 1.956ms 25.32% 1.972ms 1.972ms 1 -convolution 0.61% 12.000us 99.20% 1.940ms 1.940ms 25.14% 1.957ms 1.957ms 1 -_convolution 1.87% 36.590us 98.58% 1.928ms 1.928ms 24.99% 1.946ms 1.946ms 1 -contiguous 0.34% 6.600us 0.34% 6.600us 6.600us 0.08% 6.464us 6.464us 1 -cudnn_convolution 96.37% 1.885ms 96.37% 1.885ms 1.885ms 24.47% 1.906ms 1.906ms 1 ---------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -Self CPU time total: 1.956ms -CUDA time total: 7.787ms +--------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- ----------------------------------- +Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg CUDA total % CUDA total CUDA time avg Number of Calls Input Shapes +--------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- ----------------------------------- +conv2d 0.80% 13.370us 100.00% 1.671ms 1.671ms 25.34% 1.698ms 1.698ms 1 [] +convolution 0.76% 12.730us 99.20% 1.658ms 1.658ms 25.15% 1.685ms 1.685ms 1 [] +_convolution 1.83% 30.660us 98.44% 1.645ms 1.645ms 24.97% 1.673ms 1.673ms 1 [] +contiguous 0.42% 6.970us 0.42% 6.970us 6.970us 0.11% 7.136us 7.136us 1 [] +cudnn_convolution 96.19% 1.608ms 96.19% 1.608ms 1.608ms 24.44% 1.638ms 1.638ms 1 [] +--------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- ----------------------------------- +Self CPU time total: 1.671ms +CUDA time total: 6.701ms ``` @@ -115,32 +114,32 @@ print(prof) ``` ```text -Module | Self CPU total | CPU total | CUDA total ----------------|----------------|-----------|----------- -AlexNet | | | -├── features | | | -│├── 0 | | | -│├── 1 | | | -│├── 2 | | | -│├── 3 | 2.846ms | 11.368ms | 0.000us -│├── 4 | | | -│├── 5 | | | -│├── 6 | | | -│├── 7 | | | -│├── 8 | | | -│├── 9 | | | -│├── 10 | | | -│├── 11 | | | -│└── 12 | | | -├── avgpool | | | -└── classifier | 12.016ms | 12.206ms | 0.000us - ├── 0 | | | - ├── 1 | | | - ├── 2 | | | - ├── 3 | | | - ├── 4 | | | - ├── 5 | | | - └── 6 | | | +Module | Self CPU total | CPU total | CUDA total | Occurrences +---------------|----------------|-----------|------------|------------ +AlexNet | | | | +├── features | | | | +│├── 0 | | | | +│├── 1 | | | | +│├── 2 | | | | +│├── 3 | 3.189ms | 12.717ms | 0.000us | 1 +│├── 4 | | | | +│├── 5 | | | | +│├── 6 | | | | +│├── 7 | | | | +│├── 8 | | | | +│├── 9 | | | | +│├── 10 | | | | +│├── 11 | | | | +│└── 12 | | | | +├── avgpool | | | | +└── classifier | 13.403ms | 14.011ms | 0.000us | 1 + ├── 0 | | | | + ├── 1 | | | | + ├── 2 | | | | + ├── 3 | | | | + ├── 4 | | | | + ├── 5 | | | | + └── 6 | | | | ```