Skip to content

Commit e35b903

Browse files
committed
Make sure to skip fine-grained missing data
This skips missing pod or node data at the pod (if any container data is missing) or node level, so that things like autoscalers can make proper decisions.
1 parent 4b80d4d commit e35b903

File tree

2 files changed

+42
-18
lines changed

2 files changed

+42
-18
lines changed

pkg/sources/summary/summary.go

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,29 @@ func (src *summaryMetricsSource) Collect(ctx context.Context) (*sources.MetricsB
109109

110110
var errs []error
111111
errs = append(errs, src.decodeNodeStats(&summary.Node, &res.Nodes[0])...)
112-
for i, pod := range summary.Pods {
113-
errs = append(errs, src.decodePodStats(&pod, &res.Pods[i])...)
112+
if len(errs) != 0 {
113+
// if we had errors providing node metrics, discard the data point
114+
// so that we don't incorrectly report metric values as zero.
115+
res.Nodes = res.Nodes[:1]
114116
}
115117

118+
num := 0
119+
for _, pod := range summary.Pods {
120+
podErrs := src.decodePodStats(&pod, &res.Pods[num])
121+
errs = append(errs, podErrs...)
122+
if len(podErrs) != 0 {
123+
// NB: we explicitly want to discard pods with partial results, since
124+
// the horizontal pod autoscaler takes special action when a pod is missing
125+
// metrics (and zero CPU or memory does not count as "missing metrics")
126+
127+
// we don't care if we reuse slots in the result array,
128+
// because they get completely overwritten in decodePodStats
129+
continue
130+
}
131+
num++
132+
}
133+
res.Pods = res.Pods[:num]
134+
116135
return res, utilerrors.NewAggregate(errs)
117136
}
118137

@@ -130,15 +149,16 @@ func (src *summaryMetricsSource) decodeNodeStats(nodeStats *stats.NodeStats, tar
130149
}
131150
var errs []error
132151
if err := decodeCPU(&target.CpuUsage, nodeStats.CPU); err != nil {
133-
errs = append(errs, fmt.Errorf("unable to get CPU for node %q: %v", src.node.ConnectAddress, err))
152+
errs = append(errs, fmt.Errorf("unable to get CPU for node %q, discarding data: %v", src.node.ConnectAddress, err))
134153
}
135154
if err := decodeMemory(&target.MemoryUsage, nodeStats.Memory); err != nil {
136-
errs = append(errs, fmt.Errorf("unable to get memory for node %q: %v", src.node.ConnectAddress, err))
155+
errs = append(errs, fmt.Errorf("unable to get memory for node %q, discarding data: %v", src.node.ConnectAddress, err))
137156
}
138157
return errs
139158
}
140159

141160
func (src *summaryMetricsSource) decodePodStats(podStats *stats.PodStats, target *sources.PodMetricsPoint) []error {
161+
// completely overwrite data in the target
142162
*target = sources.PodMetricsPoint{
143163
Name: podStats.PodRef.Name,
144164
Namespace: podStats.PodRef.Namespace,
@@ -160,10 +180,10 @@ func (src *summaryMetricsSource) decodePodStats(podStats *stats.PodStats, target
160180
},
161181
}
162182
if err := decodeCPU(&point.CpuUsage, container.CPU); err != nil {
163-
errs = append(errs, fmt.Errorf("unable to get CPU for container %q in pod %s/%s on node %q: %v", container.Name, target.Namespace, target.Name, src.node.ConnectAddress, err))
183+
errs = append(errs, fmt.Errorf("unable to get CPU for container %q in pod %s/%s on node %q, discarding data: %v", container.Name, target.Namespace, target.Name, src.node.ConnectAddress, err))
164184
}
165185
if err := decodeMemory(&point.MemoryUsage, container.Memory); err != nil {
166-
errs = append(errs, fmt.Errorf("unable to get memory for container %q in pod %s/%s on node %q: %v", container.Name, target.Namespace, target.Name, src.node.ConnectAddress, err))
186+
errs = append(errs, fmt.Errorf("unable to get memory for container %q in pod %s/%s on node %q: %v, discarding data", container.Name, target.Namespace, target.Name, src.node.ConnectAddress, err))
167187
}
168188

169189
target.Containers[i] = point

pkg/sources/summary/summary_test.go

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -124,22 +124,23 @@ func verifyPods(summary *stats.Summary, batch *sources.MetricsBatch) {
124124
var expectedPods []interface{}
125125
for _, pod := range summary.Pods {
126126
containers := make([]sources.ContainerMetricsPoint, len(pod.Containers))
127+
missingData := false
127128
for i, container := range pod.Containers {
128129
var cpuUsage, memoryUsage resource.Quantity
129130
var timestamp time.Time
130-
if container.CPU != nil {
131-
if container.CPU.UsageNanoCores != nil {
132-
cpuUsage = *resource.NewScaledQuantity(int64(*container.CPU.UsageNanoCores), -9)
133-
}
134-
timestamp = container.CPU.Time.Time
131+
if container.CPU == nil || container.CPU.UsageNanoCores == nil {
132+
missingData = true
133+
break
135134
}
136-
if container.Memory != nil {
137-
if container.Memory.WorkingSetBytes != nil {
138-
memoryUsage = *resource.NewQuantity(int64(*container.Memory.WorkingSetBytes), resource.BinarySI)
139-
}
140-
if timestamp.IsZero() {
141-
timestamp = container.Memory.Time.Time
142-
}
135+
cpuUsage = *resource.NewScaledQuantity(int64(*container.CPU.UsageNanoCores), -9)
136+
timestamp = container.CPU.Time.Time
137+
if container.Memory == nil || container.Memory.WorkingSetBytes == nil {
138+
missingData = true
139+
break
140+
}
141+
memoryUsage = *resource.NewQuantity(int64(*container.Memory.WorkingSetBytes), resource.BinarySI)
142+
if timestamp.IsZero() {
143+
timestamp = container.Memory.Time.Time
143144
}
144145

145146
containers[i] = sources.ContainerMetricsPoint{
@@ -151,6 +152,9 @@ func verifyPods(summary *stats.Summary, batch *sources.MetricsBatch) {
151152
},
152153
}
153154
}
155+
if missingData {
156+
continue
157+
}
154158
expectedPods = append(expectedPods, sources.PodMetricsPoint{
155159
Name: pod.PodRef.Name,
156160
Namespace: pod.PodRef.Namespace,

0 commit comments

Comments
 (0)