
Commit 5eceaf5

Merge pull request cri-o#6923 from haircommander/resource-metrics
metrics: add metric for resource stalled at stage
2 parents: 6cfd5c1 + 259e798

6 files changed (+64, -5 lines)

docs/crio.8.md (1 addition, 1 deletion)

@@ -309,7 +309,7 @@ crio [GLOBAL OPTIONS] command [COMMAND OPTIONS] [ARGUMENTS...]

 **--metrics-cert**="": Certificate for the secure metrics endpoint.

-**--metrics-collectors**="": Enabled metrics collectors. (default: "operations", "operations_latency_microseconds_total", "operations_latency_microseconds", "operations_errors", "image_pulls_by_digest", "image_pulls_by_name", "image_pulls_by_name_skipped", "image_pulls_failures", "image_pulls_successes", "image_pulls_layer_size", "image_layer_reuse", "containers_oom_total", "containers_oom", "processes_defunct", "operations_total", "operations_latency_seconds", "operations_latency_seconds_total", "operations_errors_total", "image_pulls_bytes_total", "image_pulls_skipped_bytes_total", "image_pulls_failure_total", "image_pulls_success_total", "image_layer_reuse_total", "containers_oom_count_total", "containers_seccomp_notifier_count_total")
+**--metrics-collectors**="": Enabled metrics collectors. (default: "operations", "operations_latency_microseconds_total", "operations_latency_microseconds", "operations_errors", "image_pulls_by_digest", "image_pulls_by_name", "image_pulls_by_name_skipped", "image_pulls_failures", "image_pulls_successes", "image_pulls_layer_size", "image_layer_reuse", "containers_oom_total", "containers_oom", "processes_defunct", "operations_total", "operations_latency_seconds", "operations_latency_seconds_total", "operations_errors_total", "image_pulls_bytes_total", "image_pulls_skipped_bytes_total", "image_pulls_failure_total", "image_pulls_success_total", "image_layer_reuse_total", "containers_oom_count_total", "containers_seccomp_notifier_count_total", "resources_stalled_at_stage")

 **--metrics-key**="": Certificate key for the secure metrics endpoint.

server/metrics/metrics.go (19 additions, 0 deletions)

@@ -89,6 +89,7 @@ type Metrics struct {
     metricImageLayerReuseTotal                *prometheus.CounterVec
     metricContainersOOMCountTotal             *prometheus.CounterVec
     metricContainersSeccompNotifierCountTotal *prometheus.CounterVec
+    metricResourcesStalledAtStage             *prometheus.CounterVec
 }

 var instance *Metrics

@@ -319,6 +320,14 @@ func New(config *libconfig.MetricsConfig) *Metrics {
             },
             []string{"name", "syscall"},
         ),
+        metricResourcesStalledAtStage: prometheus.NewCounterVec(
+            prometheus.CounterOpts{
+                Subsystem: collectors.Subsystem,
+                Name:      collectors.ResourcesStalledAtStage.String(),
+                Help:      "Resource creation stage pod or container is stalled at.",
+            },
+            []string{"stage"},
+        ),
     }
     return Instance()
 }

@@ -550,6 +559,15 @@ func (m *Metrics) MetricImagePullsByNameAdd(add float64, values ...string) {
     c.Add(add)
 }

+func (m *Metrics) MetricResourcesStalledAtStage(stage string) {
+    c, err := m.metricResourcesStalledAtStage.GetMetricWithLabelValues(stage)
+    if err != nil {
+        logrus.Warnf("Unable to write resource stalled at stage metric: %v", err)
+        return
+    }
+    c.Inc()
+}
+
 // createEndpoint creates a /metrics endpoint for prometheus monitoring.
 func (m *Metrics) createEndpoint() (*http.ServeMux, error) {
     for collector, metric := range map[collectors.Collector]prometheus.Collector{

@@ -579,6 +597,7 @@ func (m *Metrics) createEndpoint() (*http.ServeMux, error) {
         collectors.ImageLayerReuseTotal:                m.metricImageLayerReuseTotal,
         collectors.ContainersOOMCountTotal:             m.metricContainersOOMCountTotal,
         collectors.ContainersSeccompNotifierCountTotal: m.metricContainersSeccompNotifierCountTotal,
+        collectors.ResourcesStalledAtStage:             m.metricResourcesStalledAtStage,
     } {
         if m.config.MetricsCollectors.Contains(collector) {
             logrus.Debugf("Enabling metric: %s", collector.Stripped())
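
Editor's note: the new metric follows the same labelled-counter pattern as the other CRI-O collectors: a prometheus.CounterVec with a single "stage" label, a child counter fetched per label value, and exposure over a /metrics endpoint. The standalone sketch below is not CRI-O code; it only illustrates that client_golang pattern, and the literal subsystem and metric names stand in for the collectors constants used in the hunk above.

package main

import (
    "log"
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
    // A counter vector with one label, mirroring the "stage" label added above.
    stalled := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Subsystem: "crio", // illustrative literal, not collectors.Subsystem
            Name:      "resources_stalled_at_stage",
            Help:      "Resource creation stage pod or container is stalled at.",
        },
        []string{"stage"},
    )

    reg := prometheus.NewRegistry()
    reg.MustRegister(stalled)

    // Incrementing with a label value creates that child series on first use.
    if c, err := stalled.GetMetricWithLabelValues("container runtime creation"); err == nil {
        c.Inc()
    }

    // Expose the registry the same way a /metrics endpoint would.
    http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
    log.Fatal(http.ListenAndServe(":9090", nil))
}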

server/otel-collector/collectors/collectors.go (4 additions, 0 deletions)

@@ -101,6 +101,9 @@ const (

     // ContainersSeccompNotifierCountTotal is the key for the CRI-O container seccomp notifier metrics per container name and syscalls.
     ContainersSeccompNotifierCountTotal Collector = crioPrefix + "containers_seccomp_notifier_count_total"
+
+    // ResourcesStalledAtStage is the key for the resources stalled at different stages in container and pod creation.
+    ResourcesStalledAtStage Collector = crioPrefix + "resources_stalled_at_stage"
 )

 // FromSlice converts a string slice to a Collectors type.

@@ -148,6 +151,7 @@ func All() Collectors {
         ImageLayerReuseTotal.Stripped(),
         ContainersOOMCountTotal.Stripped(),
         ContainersSeccompNotifierCountTotal.Stripped(),
+        ResourcesStalledAtStage.Stripped(),
     }
 }
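
Editor's note: these Collector keys are what the --metrics-collectors option and the createEndpoint loop in metrics.go match against; a metric is only registered when the configured list contains its key, and Stripped() yields the short form that appears in the docs default list. The sketch below is a simplified, hypothetical model of that gating: the prefix value and the Stripped/Contains implementations are assumptions for illustration, not the real collectors.go code.

package main

import (
    "fmt"
    "strings"
)

// Collector is a metric key; Collectors is a list of enabled keys.
type Collector string

type Collectors []Collector

// hypotheticalPrefix is illustrative only; the real crioPrefix lives in collectors.go.
const hypotheticalPrefix = "container_runtime_crio_"

// Stripped returns the short name used in config and docs,
// e.g. "resources_stalled_at_stage".
func (c Collector) Stripped() Collector {
    return Collector(strings.TrimPrefix(string(c), hypotheticalPrefix))
}

// Contains reports whether a collector is enabled, ignoring the prefix.
func (c Collectors) Contains(in Collector) bool {
    for _, collector := range c {
        if collector.Stripped() == in.Stripped() {
            return true
        }
    }
    return false
}

func main() {
    enabled := Collectors{"resources_stalled_at_stage", "operations_total"}
    key := Collector(hypotheticalPrefix + "resources_stalled_at_stage")
    // Mirrors the gate in createEndpoint: register the metric only if enabled.
    fmt.Println("register resources_stalled_at_stage:", enabled.Contains(key))
}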

server/otel-collector/collectors/collectors_test.go (2 additions, 1 deletion)

@@ -61,11 +61,12 @@ var _ = t.Describe("Collectors", func() {
             collectors.ImageLayerReuseTotal,
             collectors.ContainersOOMCountTotal,
             collectors.ContainersSeccompNotifierCountTotal,
+            collectors.ResourcesStalledAtStage,
         } {
             Expect(all.Contains(collector)).To(BeTrue())
         }

-        Expect(all).To(HaveLen(25))
+        Expect(all).To(HaveLen(26))
     })
 })

server/utils.go (5 additions, 3 deletions)

@@ -14,6 +14,7 @@ import (
     cryptUtils "github.com/containers/ocicrypt/utils"
     "github.com/containers/storage/pkg/mount"
     "github.com/cri-o/cri-o/internal/log"
+    "github.com/cri-o/cri-o/server/metrics"
     v1 "github.com/opencontainers/image-spec/specs-go/v1"
     "github.com/sirupsen/logrus"
     types "k8s.io/cri-api/pkg/apis/runtime/v1"

@@ -151,13 +152,13 @@ func isContextError(err error) bool {
 }

 func (s *Server) getResourceOrWait(ctx context.Context, name, resourceType string) (string, error) {
+    ctx, span := log.StartSpan(ctx)
+    defer span.End()
+
     // In 99% of cases, we shouldn't hit this timeout. Instead, the context should be cancelled.
     // This is really to catch an unlikely case where the kubelet doesn't cancel the context.
     // Adding on top of the specified deadline ensures this deadline will be respected, regardless of
     // how Kubelet's runtime-request-timeout changes.
-    ctx, span := log.StartSpan(ctx)
-    defer span.End()
-
     resourceCreationWaitTime := time.Minute * 4
     if initialDeadline, ok := ctx.Deadline(); ok {
         resourceCreationWaitTime += time.Until(initialDeadline)

@@ -172,6 +173,7 @@ func (s *Server) getResourceOrWait(ctx context.Context, name, resourceType string) (string, error) {
             return "", fmt.Errorf("error attempting to watch for %s %s: no longer found", resourceType, name)
         }
         log.Infof(ctx, "Creation of %s %s not yet finished. Currently at stage %v. Waiting up to %v for it to finish", resourceType, name, stage, resourceCreationWaitTime)
+        metrics.Instance().MetricResourcesStalledAtStage(stage)
         var err error
         select {
         // We should wait as long as we can (within reason), thus stalling the kubelet's sync loop.
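
Editor's note: getResourceOrWait runs when the kubelet re-requests a pod or container whose earlier creation is still in flight; the added metrics call records the stage the resource is stalled at each time that happens, just before CRI-O settles in to wait. The sketch below is a simplified version of that flow under stated assumptions: the stage string, the done channel, and the recordStalled callback are stand-ins for CRI-O's internal resource cache and the metrics singleton.

package main

import (
    "context"
    "fmt"
    "time"
)

// getResourceOrWaitSketch mirrors the flow above: extend the caller's deadline
// by a fixed budget, record which stage the resource is stalled at, then wait
// for creation to finish, the context to be cancelled, or the budget to expire.
func getResourceOrWaitSketch(ctx context.Context, stage string, done <-chan struct{},
    recordStalled func(stage string)) error {
    // Base budget, extended by whatever remains of the caller's own deadline,
    // so the kubelet's runtime-request-timeout is always respected.
    wait := 4 * time.Minute
    if deadline, ok := ctx.Deadline(); ok {
        wait += time.Until(deadline)
    }

    // Emit the counter once per re-request, labelled by the stalled stage.
    recordStalled(stage)

    select {
    case <-done:
        return nil
    case <-ctx.Done():
        return ctx.Err()
    case <-time.After(wait):
        return fmt.Errorf("creation did not finish within %v", wait)
    }
}

func main() {
    done := make(chan struct{})
    go func() { time.Sleep(10 * time.Millisecond); close(done) }()

    ctx, cancel := context.WithTimeout(context.Background(), time.Second)
    defer cancel()

    err := getResourceOrWaitSketch(ctx, "container runtime creation", done,
        func(stage string) { fmt.Println("stalled at stage:", stage) })
    fmt.Println("result:", err)
}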

test/timeout.bats (33 additions, 0 deletions)

@@ -86,6 +86,23 @@ function wait_clean() {
     [[ "$output" == "$created_ctr_id" ]]
 }

+@test "emit metric when sandbox is re-requested" {
+    create_pinns "$CANCEL_TIMEOUT"
+
+    # need infra container so runp can timeout in conmon
+    PORT=$(free_port)
+    CONTAINER_ENABLE_METRICS=true CONTAINER_METRICS_PORT="$PORT" CONTAINER_DROP_INFRA_CTR=false start_crio
+    run crictl runp -T "$CANCEL_TIMEOUT" "$TESTDATA"/sandbox_config.json
+    echo "$output"
+    [[ "$output" == *"context deadline exceeded"* ]]
+    [ "$status" -ne 0 ]
+
+    run ! crictl runp -T "$CANCEL_TIMEOUT" "$TESTDATA"/sandbox_config.json
+
+    METRIC=$(curl -sf "http://localhost:$PORT/metrics" | grep 'crio_resources_stalled_at_stage{')
+    [[ "$METRIC" == 'container_runtime_crio_resources_stalled_at_stage{stage="sandbox container runtime creation"} 1' ]]
+}
+
 @test "should not clean up container after timeout" {
     start_crio
     pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json)

@@ -203,6 +220,22 @@ function wait_clean() {
     crictl create "$pod_id" "$TESTDATA"/container_config.json "$TESTDATA"/sandbox_config.json
 }

+@test "emit metric when container is re-requested" {
+    PORT=$(free_port)
+    CONTAINER_ENABLE_METRICS=true CONTAINER_METRICS_PORT="$PORT" start_crio
+    pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json)
+
+    run crictl create -T "$CANCEL_TIMEOUT" "$pod_id" "$TESTDATA"/container_config.json "$TESTDATA"/sandbox_config.json
+    echo "$output"
+    [[ "$output" == *"context deadline exceeded"* ]]
+    [ "$status" -ne 0 ]
+
+    run ! crictl create -T "$CANCEL_TIMEOUT" "$pod_id" "$TESTDATA"/container_config.json "$TESTDATA"/sandbox_config.json
+
+    METRIC=$(curl -sf "http://localhost:$PORT/metrics" | grep 'crio_resources_stalled_at_stage{')
+    [[ "$METRIC" == 'container_runtime_crio_resources_stalled_at_stage{stage="container runtime creation"} 1' ]]
+}
+
 # this test case is paranoid, but mostly checks that we can't
 # operate on a pod that's not created, and that we don't mark
 # a timed out pod as created before it's re-requested
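
Editor's note: outside the test suite, the metric can be read the same way these tests do: scrape the metrics endpoint and look for crio_resources_stalled_at_stage samples. The sketch below is a small Go equivalent of the curl-and-grep check; the port is an assumption standing in for whatever CONTAINER_METRICS_PORT (or the crio metrics port configuration) is set to.

package main

import (
    "bufio"
    "fmt"
    "log"
    "net/http"
    "strings"
)

// Scrape the metrics endpoint and print any resources_stalled_at_stage samples,
// mirroring the check the bats tests perform with curl and grep.
func main() {
    resp, err := http.Get("http://localhost:9090/metrics") // port is an assumption
    if err != nil {
        log.Fatalf("scrape failed: %v", err)
    }
    defer resp.Body.Close()

    scanner := bufio.NewScanner(resp.Body)
    for scanner.Scan() {
        line := scanner.Text()
        if strings.Contains(line, "crio_resources_stalled_at_stage{") {
            fmt.Println(line)
        }
    }
    if err := scanner.Err(); err != nil {
        log.Fatalf("reading response: %v", err)
    }
}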
