Skip to content

Commit 3b4373c

Browse files
committed
Add a pod viewer tool for analyzing TPU pod performance.
1 parent 852cd48 commit 3b4373c

File tree

18 files changed

+1701
-1
lines changed

18 files changed

+1701
-1
lines changed

tensorboard/plugins/profile/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ py_binary(
9090
"profile_demo.google_chart_demo.json",
9191
"profile_demo.memory_viewer.json",
9292
"profile_demo.op_profile.json",
93+
"profile_demo.pod_viewer.json",
9394
],
9495
srcs_version = "PY2AND3",
9596
deps = [
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
package(default_visibility = ["//tensorboard:internal"])
2+
3+
load("//tensorboard/defs:web.bzl", "tf_web_library")
4+
5+
licenses(["notice"]) # Apache 2.0
6+
7+
# Web library target for the <details-card> element: bundles its HTML
# template (details-card.html) and TypeScript source (details-card.ts).
# Depends on Polymer and on paper-card, which the HTML file imports.
# NOTE(review): `path` is presumably the URL prefix the sources are served
# under — confirm against the tf_web_library definition in web.bzl.
tf_web_library(
8+
name = "details_card",
9+
srcs = [
10+
"details-card.html",
11+
"details-card.ts",
12+
],
13+
path = "/pod-viewer",
14+
deps = [
15+
"//tensorboard/components/tf_imports:polymer",
16+
"@org_polymer_paper_card",
17+
],
18+
)
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
<!--
2+
@license
3+
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
Unless required by applicable law or agreed to in writing, software
9+
distributed under the License is distributed on an "AS IS" BASIS,
10+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
See the License for the specific language governing permissions and
12+
limitations under the License.
13+
-->
14+
15+
<!--
16+
details-card is a card that highlights detailed information of a selection.
17+
-->
18+
19+
<link rel="import" href="../polymer/polymer.html">
20+
<link rel="import" href="../paper-card/paper-card.html">
21+
22+
<dom-module id="details-card">
  <!--
    Template for the details card. The card heading is "[[name]] [[id]]" and
    the whole card is hidden while `name` is falsy. One content section is
    stamped per entry in `nodes`; the isChannel / isStepBreakdown /
    hasReplicaGroups flags decide which sub-sections of each entry are shown.

    The <style> element lives inside <template>: Polymer has recommended this
    placement since 1.1, and styles placed outside the template are not
    picked up by class-based Polymer 2 elements.
  -->
  <template>
    <style>
      paper-card {
        --paper-card-header-color: white;
        --paper-card-header: {
          background-color: rgb(178,34,34);
        }
        width: 100%;
        max-height: 50vh;
        overflow-y: auto;
      }
      .card-content > div {
        top: -5px;
        margin-bottom: 1em;
      }
      .value {
        font-weight: normal;
        text-align: right;
      }
      .info {
        font-size: 20px;
        font-weight: bold;
      }
      .codeStyle {
        font-size: 14px;
        font-weight: normal;
      }
    </style>
    <paper-card id="card" heading="[[name]] [[id]]"
                hidden="[[!name]]" elevation="2">
      <template is="dom-repeat" items="[[nodes]]" as="node">
        <div class="card-content info">
          <!-- Channel selections only: replica id. -->
          <div hidden="[[!isChannel]]">
            <p>Replica Id: <span class="value">[[node.replicaId]]</span></p>
          </div>
          <!-- Transfer statistics; hidden for step-breakdown selections. -->
          <div hidden="[[isStepBreakdown]]">
            <p>Data Transferred: <span class="value">
              [[sizeMiB_(node.dataSize)]] MiB</span></p>
            <p>Latency: <span class="value">
              [[format_(node.durationUs)]] Us</span></p>
            <p>BW: <span class="value">
              [[bw_(node.dataSize, node.durationUs)]] GiB/s</span></p>
          </div>
          <!-- Channel selections only: endpoints and HLO names. -->
          <div hidden="[[!isChannel]]">
            <p>Send Delay: <span class="value">
              [[format_(node.sendDelayUs)]] Us</span></p>
            <p>From: <span class="value">
              Chip[[chipId_(node.srcCoreId)]], Core[[nodeId_(node.srcCoreId)]]
            </span></p>
            <p>To: <span class="value">
              Chip[[chipId_(node.dstCoreId)]], Core[[nodeId_(node.dstCoreId)]]
            </span></p>
            <p>Hlo Names</p>
            <code class="codeStyle">
              <template is="dom-repeat" items="[[node.hloNames]]">"[[item]]"
              </template>
            </code>
          </div>
          <!-- Shown only when the selection carries replica groups. -->
          <div hidden="[[!hasReplicaGroups]]">
            <h5>Replica Groups</h5>
            <code class="codeStyle">
              <template is="dom-repeat" items="[[node.replicaGroups]]">
                {[[item.replicaIds]]}<br>
              </template>
            </code>
          </div>
          <!-- Step-breakdown selections only: one row per breakdown key. -->
          <div hidden="[[!isStepBreakdown]]">
            <template is="dom-repeat" items="[[stepBreakdownEle]]">
              <h5> [[item.label]]: <span class="value">
                [[getStepBreakdownValue_(node, item.key)]] Us
                ([[getStepBreakdownPct_(node, item.key)]])</span></h5>
            </template>
          </div>
        </div>
      </template>
    </paper-card>
  </template>
  <script src="details-card.js"></script>
</dom-module>
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2+
Licensed under the Apache License, Version 2.0 (the "License");
3+
you may not use this file except in compliance with the License.
4+
You may obtain a copy of the License at
5+
http://www.apache.org/licenses/LICENSE-2.0
6+
Unless required by applicable law or agreed to in writing, software
7+
distributed under the License is distributed on an "AS IS" BASIS,
8+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
See the License for the specific language governing permissions and
10+
limitations under the License.
11+
==============================================================================*/
12+
13+
namespace pod_viewer_details_card {
14+
15+
Polymer({
16+
is: 'details-card',
17+
// Polymer property declarations for <details-card>.
properties: {
18+
// Selected nodes to display; the template stamps one section per entry.
// Any change to this property invokes the updateCard_ observer.
nodes: {
19+
type: Array,
20+
notify: true,
21+
observer: 'updateCard_',
22+
},
23+
// First part of the card heading, assigned by updateCard_. The template
// hides the whole card while this is falsy, hence the null default.
name: {
24+
type: String,
25+
value: null,
26+
},
27+
// Second part of the card heading, assigned by updateCard_ (set to null for
// selections that carry replica groups).
id: {
28+
type: Number,
29+
},
30+
// NOTE(review): not read by the visible template or by any visible method.
utilization: {
31+
type: Number,
32+
},
33+
// Set by updateCard_ when the first node has a truthy channelId; gates the
// channel-only sections of the template.
isChannel: {
34+
type: Boolean,
35+
value: false,
36+
},
37+
// Set by updateCard_ when the first node carries replicaGroups.
// NOTE(review): not referenced by the visible template.
isAllReduce: {
38+
type: Boolean,
39+
value: false,
40+
},
41+
// Gates the "Replica Groups" section of the template; set by updateCard_.
hasReplicaGroups: {
42+
type: Boolean,
43+
value: false,
44+
},
45+
// Set by updateCard_ when the first node has a truthy hostName; shows the
// step-breakdown section and hides the transfer statistics.
isStepBreakdown: {
46+
type: Boolean,
47+
value: false,
48+
},
49+
// List of {key, label} rows for the step-breakdown section; filled in
// ready().
stepBreakdownEle: {
50+
type: Array,
51+
},
52+
},
53+
/**
54+
* Update the details card.
55+
*/
56+
updateCard_: function(nodes) {
  // Observer for `nodes`. Derives the card heading (`name`, `id`) and the
  // section-visibility flags from the FIRST selected node only.
  // A missing or empty selection leaves the previous card state untouched.
  if (!nodes || nodes.length == 0) return;
  const first = nodes[0];

  // Clear every flag up front so state from the previous selection cannot
  // leak into the new one.
  this.isChannel = false;
  this.isAllReduce = false;
  this.isStepBreakdown = false;
  this.hasReplicaGroups = false;

  // The kind of selection is inferred from which field of the first node is
  // truthy, tested in this order: channelId, hostName, replicaGroups.
  if (first.channelId) {
    this.name = 'Channel #';
    this.id = first.channelId;
    this.isChannel = true;
  } else if (first.hostName) {
    this.name = 'Step breakdown of chip';
    this.id = first.chipId;
    this.isStepBreakdown = true;
  } else if (first.replicaGroups) {
    this.name = first.name;
    this.id = null;
    this.isAllReduce = true;
    // `hasReplicaGroups` is declared as a Boolean property, so store a real
    // boolean rather than the raw array length.
    this.hasReplicaGroups = first.replicaGroups.length > 0;
  }
},
77+
/**
78+
* Converts from number of bytes to MiB.
79+
*/
80+
bytesToMiB_: function(numBytes: number): number {
  // One MiB is 1024 * 1024 = 1048576 bytes.
  const bytesPerMiB = 1024 * 1024;
  return numBytes / bytesPerMiB;
},
83+
/**
84+
* Return the formatted data size in MiB.
85+
*/
86+
sizeMiB_: function(dataSize: undefined|number): string {
  // A missing or zero size renders as an empty string; anything else is
  // converted from bytes to MiB and formatted by format_.
  return dataSize ? this.format_(this.bytesToMiB_(dataSize)) : '';
},
92+
/**
93+
* Return the formatted link bandwidth in GiB/s.
94+
* The link bandwidth here is defined by the data size transferred over the
95+
* duration between the start of the send operation to the end of the
96+
* recv-done operation.
97+
*/
98+
bw_: function(dataSize: undefined|number, duration: undefined|number):
    string {
  // Both operands must be present and non-zero; otherwise nothing is shown
  // (this also avoids a division by zero).
  const haveBoth = Boolean(dataSize) && Boolean(duration);
  if (!haveBoth) {
    return '';
  }
  // 1073.74 ~= 2^30 / 10^6. Presumably dataSize is in bytes and duration in
  // microseconds, which makes the quotient GiB/s — confirm against callers.
  const sizePerDuration = dataSize / duration;
  return this.format_(sizePerDuration / 1073.74);
},
105+
/**
106+
* Return the chip id given the global core id.
107+
*/
108+
chipId_: function(coreId: undefined|number): number {
  // A missing (or zero) core id maps to chip 0; otherwise each consecutive
  // pair of core ids shares one chip id.
  return coreId ? Math.floor(coreId / 2) : 0;
},
114+
/**
115+
* Return the node ordinal given the global core id.
116+
*/
117+
nodeId_: function(coreId: undefined|number): number {
  // A missing (or zero) core id maps to ordinal 0; otherwise the ordinal is
  // the lowest bit of the core id.
  return coreId ? (coreId & 1) : 0;
},
123+
/**
124+
* Format a number with two digits after the decimal point.
125+
*/
126+
format_: function(number: undefined|number): string {
  // The loose `!= null` comparison deliberately matches both null and
  // undefined; either one renders as an empty string.
  if (number != null) {
    return number.toFixed(2);
  }
  return '';
},
129+
/**
130+
* Return a formatted value associated with a specific breakdown.
131+
*/
132+
getStepBreakdownValue_: function(node, key): string {
  // Without both a node and a key there is nothing to look up.
  const canLookUp = node && key;
  return canLookUp ? this.format_(node[key]) : '';
},
138+
/**
139+
* Return the percentage of a specific breakdown.
140+
*/
141+
getStepBreakdownPct_: function(node, key): string {
  // Returns node[key] as a percentage of node.totalDurationUs, formatted
  // with two decimals and a trailing '%'. Returns '' when the node, the key
  // or the total duration is missing (a zero total would divide by zero).
  if (!key || !node || !node.totalDurationUs) {
    return '';
  }
  const value = node[key];
  // A breakdown field that is absent from the node would otherwise render
  // as "NaN%"; show nothing instead, matching how getStepBreakdownValue_
  // renders a missing field.
  if (value == null) {
    return '';
  }
  return (value / node.totalDurationUs * 100).toFixed(2) + '%';
},
147+
ready() {
  // [node field name, display label] pairs, listed in display order; the
  // template renders one row per entry.
  const rows: Array<[string, string]> = [
    ['highFlopsComputeUs', 'High flops compute'],
    ['lowFlopsComputeUs', 'Low flops compute'],
    ['hostInfeedDurationUs', 'Infeed'],
    ['hostOutfeedDurationUs', 'Outfeed'],
    ['crsDurationUs', 'All reduce'],
    ['sendDurationUs', 'Send'],
    ['recvDurationUs', 'Recv'],
  ];
  this.stepBreakdownEle = rows.map(([key, label]) => ({key, label}));
},
158+
});
159+
160+
} // namespace pod_viewer_details_card
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package(default_visibility = ["//tensorboard:internal"])
2+
3+
load("//tensorboard/defs:web.bzl", "tf_web_library")
4+
5+
licenses(["notice"]) # Apache 2.0
6+
7+
# Web library target for the pod viewer dashboard: bundles
# pod-viewer-dashboard.html and pod-viewer-dashboard.ts. Depends on Polymer,
# paper-slider, and the three pod_viewer sub-components (details_card,
# stack_bar_chart, topology_graph).
# NOTE(review): `path` is presumably the URL prefix the sources are served
# under — confirm against the tf_web_library definition in web.bzl.
tf_web_library(
8+
name = "pod_viewer_dashboard",
9+
srcs = [
10+
"pod-viewer-dashboard.html",
11+
"pod-viewer-dashboard.ts",
12+
],
13+
path = "/pod-viewer",
14+
deps = [
15+
"//tensorboard/components/tf_imports:polymer",
16+
"//tensorboard/plugins/profile/pod_viewer/details_card",
17+
"//tensorboard/plugins/profile/pod_viewer/stack_bar_chart",
18+
"//tensorboard/plugins/profile/pod_viewer/topology_graph",
19+
"@org_polymer_paper_slider",
20+
],
21+
)

0 commit comments

Comments
 (0)