
Commit 5eceaf5

Merge pull request cri-o#6923 from haircommander/resource-metrics
metrics: add metric for resource stalled at stage
2 parents: 6cfd5c1 + 259e798

6 files changed (+64, -5 lines)

docs/crio.8.md (1 addition, 1 deletion)

@@ -309,7 +309,7 @@ crio [GLOBAL OPTIONS] command [COMMAND OPTIONS] [ARGUMENTS...]

 **--metrics-cert**="": Certificate for the secure metrics endpoint.

-**--metrics-collectors**="": Enabled metrics collectors. (default: "operations", "operations_latency_microseconds_total", "operations_latency_microseconds", "operations_errors", "image_pulls_by_digest", "image_pulls_by_name", "image_pulls_by_name_skipped", "image_pulls_failures", "image_pulls_successes", "image_pulls_layer_size", "image_layer_reuse", "containers_oom_total", "containers_oom", "processes_defunct", "operations_total", "operations_latency_seconds", "operations_latency_seconds_total", "operations_errors_total", "image_pulls_bytes_total", "image_pulls_skipped_bytes_total", "image_pulls_failure_total", "image_pulls_success_total", "image_layer_reuse_total", "containers_oom_count_total", "containers_seccomp_notifier_count_total")
+**--metrics-collectors**="": Enabled metrics collectors. (default: "operations", "operations_latency_microseconds_total", "operations_latency_microseconds", "operations_errors", "image_pulls_by_digest", "image_pulls_by_name", "image_pulls_by_name_skipped", "image_pulls_failures", "image_pulls_successes", "image_pulls_layer_size", "image_layer_reuse", "containers_oom_total", "containers_oom", "processes_defunct", "operations_total", "operations_latency_seconds", "operations_latency_seconds_total", "operations_errors_total", "image_pulls_bytes_total", "image_pulls_skipped_bytes_total", "image_pulls_failure_total", "image_pulls_success_total", "image_layer_reuse_total", "containers_oom_count_total", "containers_seccomp_notifier_count_total", "resources_stalled_at_stage")

 **--metrics-key**="": Certificate key for the secure metrics endpoint.

server/metrics/metrics.go (19 additions, 0 deletions)

@@ -89,6 +89,7 @@ type Metrics struct {
     metricImageLayerReuseTotal                *prometheus.CounterVec
     metricContainersOOMCountTotal             *prometheus.CounterVec
     metricContainersSeccompNotifierCountTotal *prometheus.CounterVec
+    metricResourcesStalledAtStage             *prometheus.CounterVec
 }

 var instance *Metrics

@@ -319,6 +320,14 @@ func New(config *libconfig.MetricsConfig) *Metrics {
             },
             []string{"name", "syscall"},
         ),
+        metricResourcesStalledAtStage: prometheus.NewCounterVec(
+            prometheus.CounterOpts{
+                Subsystem: collectors.Subsystem,
+                Name:      collectors.ResourcesStalledAtStage.String(),
+                Help:      "Resource creation stage pod or container is stalled at.",
+            },
+            []string{"stage"},
+        ),
     }
     return Instance()
 }

@@ -550,6 +559,15 @@ func (m *Metrics) MetricImagePullsByNameAdd(add float64, values ...string) {
     c.Add(add)
 }

+func (m *Metrics) MetricResourcesStalledAtStage(stage string) {
+    c, err := m.metricResourcesStalledAtStage.GetMetricWithLabelValues(stage)
+    if err != nil {
+        logrus.Warnf("Unable to write resource stalled at stage metric: %v", err)
+        return
+    }
+    c.Inc()
+}
+
 // createEndpoint creates a /metrics endpoint for prometheus monitoring.
 func (m *Metrics) createEndpoint() (*http.ServeMux, error) {
     for collector, metric := range map[collectors.Collector]prometheus.Collector{

@@ -579,6 +597,7 @@ func (m *Metrics) createEndpoint() (*http.ServeMux, error) {
         collectors.ImageLayerReuseTotal:                m.metricImageLayerReuseTotal,
         collectors.ContainersOOMCountTotal:             m.metricContainersOOMCountTotal,
         collectors.ContainersSeccompNotifierCountTotal: m.metricContainersSeccompNotifierCountTotal,
+        collectors.ResourcesStalledAtStage:             m.metricResourcesStalledAtStage,
     } {
         if m.config.MetricsCollectors.Contains(collector) {
             logrus.Debugf("Enabling metric: %s", collector.Stripped())
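
Editor's note: the new metric follows the same labelled-counter pattern as the other CRI-O collectors: a prometheus.CounterVec with a single "stage" label, a child counter fetched per label value, and exposure over a /metrics endpoint. The standalone sketch below is not CRI-O code; it only illustrates that client_golang pattern, and the literal subsystem and metric names stand in for the collectors constants used in the hunk above.

package main

import (
    "log"
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
    // A counter vector with one label, mirroring the "stage" label added above.
    stalled := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Subsystem: "crio", // illustrative literal, not collectors.Subsystem
            Name:      "resources_stalled_at_stage",
            Help:      "Resource creation stage pod or container is stalled at.",
        },
        []string{"stage"},
    )

    reg := prometheus.NewRegistry()
    reg.MustRegister(stalled)

    // Incrementing with a label value creates that child series on first use.
    if c, err := stalled.GetMetricWithLabelValues("container runtime creation"); err == nil {
        c.Inc()
    }

    // Expose the registry the same way a /metrics endpoint would.
    http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
    log.Fatal(http.ListenAndServe(":9090", nil))
}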

server/otel-collector/collectors/collectors.go (4 additions, 0 deletions)

@@ -101,6 +101,9 @@ const (

     // ContainersSeccompNotifierCountTotal is the key for the CRI-O container seccomp notifier metrics per container name and syscalls.
     ContainersSeccompNotifierCountTotal Collector = crioPrefix + "containers_seccomp_notifier_count_total"
+
+    // ResourcesStalledAtStage is the key for the resources stalled at different stages in container and pod creation.
+    ResourcesStalledAtStage Collector = crioPrefix + "resources_stalled_at_stage"
 )

 // FromSlice converts a string slice to a Collectors type.

@@ -148,6 +151,7 @@ func All() Collectors {
         ImageLayerReuseTotal.Stripped(),
         ContainersOOMCountTotal.Stripped(),
         ContainersSeccompNotifierCountTotal.Stripped(),
+        ResourcesStalledAtStage.Stripped(),
     }
 }
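
Editor's note: these Collector keys are what the --metrics-collectors option and the createEndpoint loop in metrics.go match against; a metric is only registered when the configured list contains its key, and Stripped() yields the short form that appears in the docs default list. The sketch below is a simplified, hypothetical model of that gating: the prefix value and the Stripped/Contains implementations are assumptions for illustration, not the real collectors.go code.

package main

import (
    "fmt"
    "strings"
)

// Collector is a metric key; Collectors is a list of enabled keys.
type Collector string

type Collectors []Collector

// hypotheticalPrefix is illustrative only; the real crioPrefix lives in collectors.go.
const hypotheticalPrefix = "container_runtime_crio_"

// Stripped returns the short name used in config and docs,
// e.g. "resources_stalled_at_stage".
func (c Collector) Stripped() Collector {
    return Collector(strings.TrimPrefix(string(c), hypotheticalPrefix))
}

// Contains reports whether a collector is enabled, ignoring the prefix.
func (c Collectors) Contains(in Collector) bool {
    for _, collector := range c {
        if collector.Stripped() == in.Stripped() {
            return true
        }
    }
    return false
}

func main() {
    enabled := Collectors{"resources_stalled_at_stage", "operations_total"}
    key := Collector(hypotheticalPrefix + "resources_stalled_at_stage")
    // Mirrors the gate in createEndpoint: register the metric only if enabled.
    fmt.Println("register resources_stalled_at_stage:", enabled.Contains(key))
}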

server/otel-collector/collectors/collectors_test.go (2 additions, 1 deletion)

@@ -61,11 +61,12 @@ var _ = t.Describe("Collectors", func() {
             collectors.ImageLayerReuseTotal,
             collectors.ContainersOOMCountTotal,
             collectors.ContainersSeccompNotifierCountTotal,
+            collectors.ResourcesStalledAtStage,
         } {
             Expect(all.Contains(collector)).To(BeTrue())
         }

-        Expect(all).To(HaveLen(25))
+        Expect(all).To(HaveLen(26))
     })
 })

server/utils.go (5 additions, 3 deletions)

@@ -14,6 +14,7 @@ import (
     cryptUtils "github.com/containers/ocicrypt/utils"
     "github.com/containers/storage/pkg/mount"
     "github.com/cri-o/cri-o/internal/log"
+    "github.com/cri-o/cri-o/server/metrics"
     v1 "github.com/opencontainers/image-spec/specs-go/v1"
     "github.com/sirupsen/logrus"
     types "k8s.io/cri-api/pkg/apis/runtime/v1"

@@ -151,13 +152,13 @@ func isContextError(err error) bool {
 }

 func (s *Server) getResourceOrWait(ctx context.Context, name, resourceType string) (string, error) {
+    ctx, span := log.StartSpan(ctx)
+    defer span.End()
+
     // In 99% of cases, we shouldn't hit this timeout. Instead, the context should be cancelled.
     // This is really to catch an unlikely case where the kubelet doesn't cancel the context.
     // Adding on top of the specified deadline ensures this deadline will be respected, regardless of
     // how Kubelet's runtime-request-timeout changes.
-    ctx, span := log.StartSpan(ctx)
-    defer span.End()
-
     resourceCreationWaitTime := time.Minute * 4
     if initialDeadline, ok := ctx.Deadline(); ok {
         resourceCreationWaitTime += time.Until(initialDeadline)

@@ -172,6 +173,7 @@ func (s *Server) getResourceOrWait(ctx context.Context, name, resourceType string) (string, error) {
             return "", fmt.Errorf("error attempting to watch for %s %s: no longer found", resourceType, name)
         }
         log.Infof(ctx, "Creation of %s %s not yet finished. Currently at stage %v. Waiting up to %v for it to finish", resourceType, name, stage, resourceCreationWaitTime)
+        metrics.Instance().MetricResourcesStalledAtStage(stage)
         var err error
         select {
         // We should wait as long as we can (within reason), thus stalling the kubelet's sync loop.
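
Editor's note: getResourceOrWait runs when the kubelet re-requests a pod or container whose earlier creation is still in flight; the added metrics call records the stage the resource is stalled at each time that happens, just before CRI-O settles in to wait. The sketch below is a simplified version of that flow under stated assumptions: the stage string, the done channel, and the recordStalled callback are stand-ins for CRI-O's internal resource cache and the metrics singleton.

package main

import (
    "context"
    "fmt"
    "time"
)

// getResourceOrWaitSketch mirrors the flow above: extend the caller's deadline
// by a fixed budget, record which stage the resource is stalled at, then wait
// for creation to finish, the context to be cancelled, or the budget to expire.
func getResourceOrWaitSketch(ctx context.Context, stage string, done <-chan struct{},
    recordStalled func(stage string)) error {
    // Base budget, extended by whatever remains of the caller's own deadline,
    // so the kubelet's runtime-request-timeout is always respected.
    wait := 4 * time.Minute
    if deadline, ok := ctx.Deadline(); ok {
        wait += time.Until(deadline)
    }

    // Emit the counter once per re-request, labelled by the stalled stage.
    recordStalled(stage)

    select {
    case <-done:
        return nil
    case <-ctx.Done():
        return ctx.Err()
    case <-time.After(wait):
        return fmt.Errorf("creation did not finish within %v", wait)
    }
}

func main() {
    done := make(chan struct{})
    go func() { time.Sleep(10 * time.Millisecond); close(done) }()

    ctx, cancel := context.WithTimeout(context.Background(), time.Second)
    defer cancel()

    err := getResourceOrWaitSketch(ctx, "container runtime creation", done,
        func(stage string) { fmt.Println("stalled at stage:", stage) })
    fmt.Println("result:", err)
}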

test/timeout.bats (33 additions, 0 deletions)

@@ -86,6 +86,23 @@ function wait_clean() {
     [[ "$output" == "$created_ctr_id" ]]
 }

+@test "emit metric when sandbox is re-requested" {
+    create_pinns "$CANCEL_TIMEOUT"
+
+    # need infra container so runp can timeout in conmon
+    PORT=$(free_port)
+    CONTAINER_ENABLE_METRICS=true CONTAINER_METRICS_PORT="$PORT" CONTAINER_DROP_INFRA_CTR=false start_crio
+    run crictl runp -T "$CANCEL_TIMEOUT" "$TESTDATA"/sandbox_config.json
+    echo "$output"
+    [[ "$output" == *"context deadline exceeded"* ]]
+    [ "$status" -ne 0 ]
+
+    run ! crictl runp -T "$CANCEL_TIMEOUT" "$TESTDATA"/sandbox_config.json
+
+    METRIC=$(curl -sf "http://localhost:$PORT/metrics" | grep 'crio_resources_stalled_at_stage{')
+    [[ "$METRIC" == 'container_runtime_crio_resources_stalled_at_stage{stage="sandbox container runtime creation"} 1' ]]
+}
+
 @test "should not clean up container after timeout" {
     start_crio
     pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json)

@@ -203,6 +220,22 @@ function wait_clean() {
     crictl create "$pod_id" "$TESTDATA"/container_config.json "$TESTDATA"/sandbox_config.json
 }

+@test "emit metric when container is re-requested" {
+    PORT=$(free_port)
+    CONTAINER_ENABLE_METRICS=true CONTAINER_METRICS_PORT="$PORT" start_crio
+    pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json)
+
+    run crictl create -T "$CANCEL_TIMEOUT" "$pod_id" "$TESTDATA"/container_config.json "$TESTDATA"/sandbox_config.json
+    echo "$output"
+    [[ "$output" == *"context deadline exceeded"* ]]
+    [ "$status" -ne 0 ]
+
+    run ! crictl create -T "$CANCEL_TIMEOUT" "$pod_id" "$TESTDATA"/container_config.json "$TESTDATA"/sandbox_config.json
+
+    METRIC=$(curl -sf "http://localhost:$PORT/metrics" | grep 'crio_resources_stalled_at_stage{')
+    [[ "$METRIC" == 'container_runtime_crio_resources_stalled_at_stage{stage="container runtime creation"} 1' ]]
+}
+
 # this test case is paranoid, but mostly checks that we can't
 # operate on a pod that's not created, and that we don't mark
 # a timed out pod as created before it's re-requested
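
Editor's note: outside the test suite, the metric can be read the same way these tests do: scrape the metrics endpoint and look for crio_resources_stalled_at_stage samples. The sketch below is a small Go equivalent of the curl-and-grep check; the port is an assumption standing in for whatever CONTAINER_METRICS_PORT (or the crio metrics port configuration) is set to.

package main

import (
    "bufio"
    "fmt"
    "log"
    "net/http"
    "strings"
)

// Scrape the metrics endpoint and print any resources_stalled_at_stage samples,
// mirroring the check the bats tests perform with curl and grep.
func main() {
    resp, err := http.Get("http://localhost:9090/metrics") // port is an assumption
    if err != nil {
        log.Fatalf("scrape failed: %v", err)
    }
    defer resp.Body.Close()

    scanner := bufio.NewScanner(resp.Body)
    for scanner.Scan() {
        line := scanner.Text()
        if strings.Contains(line, "crio_resources_stalled_at_stage{") {
            fmt.Println(line)
        }
    }
    if err := scanner.Err(); err != nil {
        log.Fatalf("reading response: %v", err)
    }
}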
