Skip to content

Commit 5b79f0e

Browse files
saschagrunert authored and Paul Lou committed
Add container out of memory metrics
This patch adds two new metrics, `container_runtime_crio_containers_oom_total` and `container_runtime_crio_containers_oom`, which collect out of memory (OOM) metrics globally and by container name. This also includes sandboxes, since we reuse the CRI-O internal name for every container. Signed-off-by: Sascha Grunert <[email protected]>
1 parent 191f731 commit 5b79f0e

File tree

7 files changed

+124
-68
lines changed

7 files changed

+124
-68
lines changed

internal/oci/runtime_oci.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/containers/storage/pkg/pools"
1919
"github.com/cri-o/cri-o/internal/log"
2020
"github.com/cri-o/cri-o/pkg/config"
21+
"github.com/cri-o/cri-o/server/metrics"
2122
"github.com/cri-o/cri-o/utils"
2223
"github.com/fsnotify/fsnotify"
2324
json "github.com/json-iterator/go"
@@ -892,6 +893,17 @@ func (r *runtimeOCI) UpdateContainerStatus(c *Container) error {
892893
oomFilePath := filepath.Join(c.bundlePath, "oom")
893894
if _, err = os.Stat(oomFilePath); err == nil {
894895
c.state.OOMKilled = true
896+
897+
// Collect total metric
898+
metrics.CRIOContainersOOMTotal.Inc()
899+
900+
// Collect metric by container name
901+
counter, err := metrics.CRIOContainersOOM.GetMetricWithLabelValues(c.Name())
902+
if err != nil {
903+
logrus.Warnf("Unable to write OOM metric by container: %v", err)
904+
} else {
905+
counter.Inc()
906+
}
895907
}
896908

897909
return nil

internal/oci/runtime_vm.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"github.com/containerd/containerd/runtime/v2/task"
1818
"github.com/containerd/ttrpc"
1919
"github.com/containers/libpod/pkg/cgroups"
20+
"github.com/cri-o/cri-o/server/metrics"
2021
"github.com/cri-o/cri-o/utils"
2122
"github.com/cri-o/cri-o/utils/errdefs"
2223
"github.com/cri-o/cri-o/utils/fifo"
@@ -664,6 +665,17 @@ func (r *runtimeVM) updateContainerStatus(c *Container) error {
664665
oomFilePath := filepath.Join(c.bundlePath, "oom")
665666
if _, err = os.Stat(oomFilePath); err == nil {
666667
c.state.OOMKilled = true
668+
669+
// Collect total metric
670+
metrics.CRIOContainersOOMTotal.Inc()
671+
672+
// Collect metric by container name
673+
counter, err := metrics.CRIOContainersOOM.GetMetricWithLabelValues(c.Name())
674+
if err != nil {
675+
logrus.Warnf("Unable to write OOM metric by container: %v", err)
676+
} else {
677+
counter.Inc()
678+
}
667679
}
668680
}
669681
return nil
@@ -729,12 +741,12 @@ func (r *runtimeVM) ContainerStats(c *Container, _ string) (*ContainerStats, err
729741
return nil, errors.Wrap(err, "failed to extract container metrics")
730742
}
731743

732-
metrics, ok := stats.(*cgroups.Metrics)
744+
m, ok := stats.(*cgroups.Metrics)
733745
if !ok {
734746
return nil, errors.Errorf("Unknown stats type %T", stats)
735747
}
736748

737-
return metricsToCtrStats(c, metrics), nil
749+
return metricsToCtrStats(c, m), nil
738750
}
739751

740752
// SignalContainer sends a signal to a container process.

server/metrics/metrics.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@ const (
3535
// CRIOImageLayerReuseKey is the key for the CRI-O image layer reuse metrics.
3636
CRIOImageLayerReuseKey = "crio_image_layer_reuse"
3737

38+
// CRIOContainersOOMTotalKey is the key for the total CRI-O container out of memory metrics.
39+
CRIOContainersOOMTotalKey = "crio_containers_oom_total"
40+
41+
// CRIOContainersOOMKey is the key for the CRI-O container out of memory metrics per container name.
42+
CRIOContainersOOMKey = "crio_containers_oom"
43+
3844
subsystem = "container_runtime"
3945
)
4046

@@ -131,6 +137,25 @@ var (
131137
},
132138
[]string{"name"},
133139
)
140+
141+
// CRIOContainersOOMTotal collects container out of memory (oom) metrics for all containers and sandboxes.
142+
CRIOContainersOOMTotal = prometheus.NewCounter(
143+
prometheus.CounterOpts{
144+
Subsystem: subsystem,
145+
Name: CRIOContainersOOMTotalKey,
146+
Help: "Amount of containers killed because they ran out of memory (OOM)",
147+
},
148+
)
149+
150+
// CRIOContainersOOM collects container out of memory (oom) metrics per container and sandbox name.
151+
CRIOContainersOOM = prometheus.NewCounterVec(
152+
prometheus.CounterOpts{
153+
Subsystem: subsystem,
154+
Name: CRIOContainersOOMKey,
155+
Help: "Amount of containers killed because they ran out of memory (OOM) by their name",
156+
},
157+
[]string{"name"},
158+
)
134159
)
135160

136161
var registerMetrics sync.Once
@@ -147,6 +172,8 @@ func Register() {
147172
prometheus.MustRegister(CRIOImagePullsFailures)
148173
prometheus.MustRegister(CRIOImagePullsSuccesses)
149174
prometheus.MustRegister(CRIOImageLayerReuse)
175+
prometheus.MustRegister(CRIOContainersOOMTotal)
176+
prometheus.MustRegister(CRIOContainersOOM)
150177
})
151178
}
152179

test/ctr.bats

Lines changed: 0 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1494,72 +1494,6 @@ function wait_until_exit() {
14941494
[ "$status" -eq 0 ]
14951495
}
14961496

1497-
@test "ctr expose metrics with default port" {
1498-
# start crio with default port 9090
1499-
port="9090"
1500-
CONTAINER_ENABLE_METRICS=true start_crio
1501-
# ensure metrics port is listening
1502-
listened=$(check_metrics_port $port)
1503-
if [[ "$listened" -ne 0 ]]; then
1504-
skip "$CONTAINER_METRICS_PORT is not listening"
1505-
fi
1506-
1507-
run crictl runp "$TESTDATA"/sandbox_config.json
1508-
echo "$output"
1509-
[ "$status" -eq 0 ]
1510-
pod_id="$output"
1511-
run crictl create "$pod_id" "$TESTDATA"/container_redis.json "$TESTDATA"/sandbox_config.json
1512-
echo "$output"
1513-
[ "$status" -eq 0 ]
1514-
ctr_id="$output"
1515-
run crictl start "$ctr_id"
1516-
[ "$status" -eq 0 ]
1517-
1518-
# get metrics
1519-
run curl http://localhost:$port/metrics -k
1520-
[ "$status" -eq 0 ]
1521-
1522-
run crictl stopp "$pod_id"
1523-
echo "$output"
1524-
[ "$status" -eq 0 ]
1525-
run crictl rmp "$pod_id"
1526-
echo "$output"
1527-
[ "$status" -eq 0 ]
1528-
}
1529-
1530-
@test "ctr expose metrics with custom port" {
1531-
# start crio with custom port
1532-
port="4321"
1533-
CONTAINER_ENABLE_METRICS=true CONTAINER_METRICS_PORT=$port start_crio
1534-
# ensure metrics port is listening
1535-
listened=$(check_metrics_port $port)
1536-
if [[ "$listened" -ne 0 ]]; then
1537-
skip "$CONTAINER_METRICS_PORT is not listening"
1538-
fi
1539-
1540-
run crictl runp "$TESTDATA"/sandbox_config.json
1541-
echo "$output"
1542-
[ "$status" -eq 0 ]
1543-
pod_id="$output"
1544-
run crictl create "$pod_id" "$TESTDATA"/container_redis.json "$TESTDATA"/sandbox_config.json
1545-
echo "$output"
1546-
[ "$status" -eq 0 ]
1547-
ctr_id="$output"
1548-
run crictl start "$ctr_id"
1549-
[ "$status" -eq 0 ]
1550-
1551-
# get metrics
1552-
run curl http://localhost:$port/metrics -k
1553-
[ "$status" -eq 0 ]
1554-
1555-
run crictl stopp "$pod_id"
1556-
echo "$output"
1557-
[ "$status" -eq 0 ]
1558-
run crictl rmp "$pod_id"
1559-
echo "$output"
1560-
[ "$status" -eq 0 ]
1561-
}
1562-
15631497

15641498
@test "privileged ctr -- check for rw mounts" {
15651499
start_crio

test/helpers.bash

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,11 @@ function check_journald() {
369369
echo "0"
370370
}
371371

372+
# get a random available port
373+
function free_port() {
374+
python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()'
375+
}
376+
372377
# Check whether metrics port is listening
373378
function check_metrics_port() {
374379
if ! netstat -lanp | grep "$1" >/dev/null; then

test/metrics.bats

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/usr/bin/env bats
2+
# vim: set syntax=sh:
3+
4+
load helpers
5+
6+
function setup() {
7+
setup_test
8+
}
9+
10+
function teardown() {
11+
cleanup_test
12+
}
13+
14+
@test "metrics with default port" {
15+
# start crio with default port 9090
16+
PORT="9090"
17+
CONTAINER_ENABLE_METRICS=true start_crio
18+
if ! port_listens "$PORT"; then
19+
skip "Metrics port $PORT not listening"
20+
fi
21+
22+
# get metrics
23+
curl -sf "http://localhost:$PORT/metrics"
24+
}
25+
26+
@test "metrics with random port" {
27+
# start crio with custom port
28+
PORT=$(free_port)
29+
CONTAINER_ENABLE_METRICS=true CONTAINER_METRICS_PORT=$PORT start_crio
30+
31+
crictl run "$TESTDATA"/container_redis.json "$TESTDATA"/sandbox_config.json
32+
33+
# get metrics
34+
curl -sf "http://localhost:$PORT/metrics" | grep crio_operations
35+
}
36+
37+
@test "metrics container oom" {
38+
PORT=$(free_port)
39+
CONTAINER_ENABLE_METRICS=true CONTAINER_METRICS_PORT=$PORT start_crio
40+
41+
jq '.image.image = "quay.io/crio/oom"
42+
| .linux.resources.memory_limit_in_bytes = 25165824
43+
| .command = ["/oom"]' \
44+
"$TESTDATA/container_config.json" > "$TESTDIR/config.json"
45+
CTR_ID=$(crictl run "$TESTDIR/config.json" "$TESTDATA/sandbox_config.json")
46+
47+
# Wait for container to OOM
48+
CNT=0
49+
while [ $CNT -le 100 ]; do
50+
CNT=$((CNT + 1))
51+
OUTPUT=$(crictl inspect --output yaml "$CTR_ID")
52+
if [[ "$OUTPUT" == *"OOMKilled"* ]]; then
53+
break
54+
fi
55+
sleep 10
56+
done
57+
[[ "$OUTPUT" == *"OOMKilled"* ]]
58+
59+
METRIC=$(curl -sf "http://localhost:$PORT/metrics" | grep '^container_runtime_crio_containers_oom_total')
60+
[[ "$METRIC" == 'container_runtime_crio_containers_oom_total 1' ]]
61+
62+
METRIC=$(curl -sf "http://localhost:$PORT/metrics" | grep 'crio_containers_oom{')
63+
[[ "$METRIC" == 'container_runtime_crio_containers_oom{name="k8s_container1_podsandbox1_redhat.test.crio_redhat-test-crio_1"} 1' ]]
64+
}

tutorials/metrics.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ Beside the [default golang based metrics][2], CRI-O provides the following addit
3838
| `crio_image_pulls_by_name_skipped` | `name` | Counter | Bytes skipped by CRI-O image pulls by name. |
3939
| `crio_image_pulls_successes` | `name` | Counter | Successful image pulls by image name |
4040
| `crio_image_pulls_failures` | `name`, `error` | Counter | Failed image pulls by image name and their error category. |
41+
| `crio_containers_oom_total` | | Counter | Total number of containers killed because they ran out of memory (OOM) |
42+
| `crio_containers_oom` | `name` | Counter | Containers killed because they ran out of memory (OOM) by their name |
4143

4244
- Available CRI-O RPCs from the [gRPC API][3]: `Attach`, `ContainerStats`, `ContainerStatus`,
4345
`CreateContainer`, `Exec`, `ExecSync`, `ImageFsInfo`, `ImageStatus`,

0 commit comments

Comments
 (0)