Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,8 @@ mockgen: \
mock-image-types \
mock-ocicni-types \
mock-seccompociartifact-types \
mock-ociartifact-types
mock-ociartifact-types \
mock-systemd

.PHONY: mock-containereventserver
mock-containereventserver: ${MOCKGEN}
Expand Down Expand Up @@ -526,6 +527,13 @@ mock-ociartifact-types: ${MOCKGEN}
-destination ${MOCK_PATH}/ociartifact/ociartifact.go \
github.com/cri-o/cri-o/internal/config/ociartifact Impl

# mock-systemd runs mockgen for the Systemd interface of the
# internal/watchdog package and writes the generated mock (package
# systemdmock) to ${MOCK_PATH}/systemd/systemd.go.
.PHONY: mock-systemd
mock-systemd: ${MOCKGEN}
${MOCKGEN} \
-package systemdmock \
-destination ${MOCK_PATH}/systemd/systemd.go \
github.com/cri-o/cri-o/internal/watchdog Systemd

MANPAGES_MD := $(wildcard docs/*.md)
MANPAGES := $(MANPAGES_MD:%.md=%)

Expand Down
6 changes: 4 additions & 2 deletions cmd/crio/daemon_linux.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
package main

import (
systemdDaemon "github.com/coreos/go-systemd/v22/daemon"
"github.com/coreos/go-systemd/v22/daemon"
"github.com/sirupsen/logrus"

"github.com/cri-o/cri-o/internal/watchdog"
)

// sdNotify sends the readiness notification to systemd. A failed
// notification is logged as a warning and otherwise ignored.
func sdNotify() {
if _, err := systemdDaemon.SdNotify(false, "READY=1"); err != nil {
if _, err := watchdog.DefaultSystemd().Notify(daemon.SdNotifyReady); err != nil {
logrus.Warnf("Failed to sd_notify systemd: %v", err)
}
}
Expand Down
1 change: 1 addition & 0 deletions contrib/systemd/crio.service
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ OOMScoreAdjust=-999
TimeoutStartSec=0
Restart=on-failure
RestartSec=10
WatchdogSec=60s

[Install]
WantedBy=multi-user.target
Expand Down
15 changes: 9 additions & 6 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,13 @@ require (
golang.org/x/sys v0.24.0
google.golang.org/grpc v1.66.0
google.golang.org/protobuf v1.34.2
k8s.io/api v0.31.0
k8s.io/apimachinery v0.31.0
k8s.io/client-go v0.31.0
k8s.io/cri-api v0.31.0
k8s.io/api v0.31.3
k8s.io/apimachinery v0.31.3
k8s.io/client-go v0.31.3
k8s.io/cri-api v0.31.3
k8s.io/cri-client v0.31.3
k8s.io/klog/v2 v2.130.1
k8s.io/kubelet v0.31.0
k8s.io/kubelet v0.31.3
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8
sigs.k8s.io/release-sdk v0.12.1
sigs.k8s.io/release-utils v0.8.4
Expand Down Expand Up @@ -190,6 +191,7 @@ require (
github.com/sigstore/rekor v1.3.6 // indirect
github.com/sigstore/sigstore v1.8.4 // indirect
github.com/skeema/knownhosts v1.2.2 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/stefanberger/go-pkcs11uri v0.0.0-20230803200340-78284954bff6 // indirect
github.com/sylabs/sif/v2 v2.18.0 // indirect
github.com/tchap/go-patricia/v2 v2.3.1 // indirect
Expand Down Expand Up @@ -227,7 +229,8 @@ require (
gopkg.in/warnings.v0 v0.1.2 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/apiserver v0.31.0 // indirect
k8s.io/apiserver v0.31.3 // indirect
k8s.io/component-base v0.31.3 // indirect
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
Expand Down
28 changes: 16 additions & 12 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2046,22 +2046,26 @@ honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt
honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
honnef.co/go/tools v0.1.3/go.mod h1:NgwopIslSNH47DimFoV78dnkksY2EFtX0ajyb3K/las=
k8s.io/api v0.31.0 h1:b9LiSjR2ym/SzTOlfMHm1tr7/21aD7fSkqgD/CVJBCo=
k8s.io/api v0.31.0/go.mod h1:0YiFF+JfFxMM6+1hQei8FY8M7s1Mth+z/q7eF1aJkTE=
k8s.io/apimachinery v0.31.0 h1:m9jOiSr3FoSSL5WO9bjm1n6B9KROYYgNZOb4tyZ1lBc=
k8s.io/apimachinery v0.31.0/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo=
k8s.io/apiserver v0.31.0 h1:p+2dgJjy+bk+B1Csz+mc2wl5gHwvNkC9QJV+w55LVrY=
k8s.io/apiserver v0.31.0/go.mod h1:KI9ox5Yu902iBnnyMmy7ajonhKnkeZYJhTZ/YI+WEMk=
k8s.io/client-go v0.31.0 h1:QqEJzNjbN2Yv1H79SsS+SWnXkBgVu4Pj3CJQgbx0gI8=
k8s.io/client-go v0.31.0/go.mod h1:Y9wvC76g4fLjmU0BA+rV+h2cncoadjvjjkkIGoTLcGU=
k8s.io/cri-api v0.31.0 h1:6o0XrhWlc1/zseGCh+aMScdXCg5nT6KCGdyx7HQkSKo=
k8s.io/cri-api v0.31.0/go.mod h1:Po3TMAYH/+KrZabi7QiwQI4a692oZcUOUThd/rqwxrI=
k8s.io/api v0.31.3 h1:umzm5o8lFbdN/hIXbrK9oRpOproJO62CV1zqxXrLgk8=
k8s.io/api v0.31.3/go.mod h1:UJrkIp9pnMOI9K2nlL6vwpxRzzEX5sWgn8kGQe92kCE=
k8s.io/apimachinery v0.31.3 h1:6l0WhcYgasZ/wk9ktLq5vLaoXJJr5ts6lkaQzgeYPq4=
k8s.io/apimachinery v0.31.3/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo=
k8s.io/apiserver v0.31.3 h1:+1oHTtCB+OheqFEz375D0IlzHZ5VeQKX1KGXnx+TTuY=
k8s.io/apiserver v0.31.3/go.mod h1:PrxVbebxrxQPFhJk4powDISIROkNMKHibTg9lTRQ0Qg=
k8s.io/client-go v0.31.3 h1:CAlZuM+PH2cm+86LOBemaJI/lQ5linJ6UFxKX/SoG+4=
k8s.io/client-go v0.31.3/go.mod h1:2CgjPUTpv3fE5dNygAr2NcM8nhHzXvxB8KL5gYc3kJs=
k8s.io/component-base v0.31.3 h1:DMCXXVx546Rfvhj+3cOm2EUxhS+EyztH423j+8sOwhQ=
k8s.io/component-base v0.31.3/go.mod h1:xME6BHfUOafRgT0rGVBGl7TuSg8Z9/deT7qq6w7qjIU=
k8s.io/cri-api v0.31.3 h1:dsZXzrGrCEwHjsTDlAV7rutEplpMLY8bfNRMIqrtXjo=
k8s.io/cri-api v0.31.3/go.mod h1:Po3TMAYH/+KrZabi7QiwQI4a692oZcUOUThd/rqwxrI=
k8s.io/cri-client v0.31.3 h1:9ZwddaNJomqkTBYQqSmB+Ccns3beY4HyYDwmRtWTCJM=
k8s.io/cri-client v0.31.3/go.mod h1:klbWiYkOatOQOkXOYZMZMGSTM8q9eC/efsYGuXcgPes=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag=
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98=
k8s.io/kubelet v0.31.0 h1:IlfkBy7QTojGEm97GuVGhtli0HL/Pgu4AdayiF76yWo=
k8s.io/kubelet v0.31.0/go.mod h1:s+OnqnfdIh14PFpUb7NgzM53WSYXcczA3w/1qSzsRc8=
k8s.io/kubelet v0.31.3 h1:DIXRAmvVGp42mV2vpA1GCLU6oO8who0/vp3Oq6kSpbI=
k8s.io/kubelet v0.31.3/go.mod h1:KSdbEfNy5VzqUlAHlytA/fH12s+sE1u8fb/8JY9sL/8=
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A=
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
lukechampine.com/uint128 v1.1.1/go.mod h1:c4eWIwlEGaxC/+H1VguhU4PHXNWDCDMUlWdIWl2j1gk=
Expand Down
27 changes: 27 additions & 0 deletions internal/watchdog/suite_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package watchdog_test

import (
"testing"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

. "github.com/cri-o/cri-o/test/framework"
)

// TestWatchdog is the `go test` entry point of the suite: it registers Fail
// as the failure handler and runs the created specs under the suite name
// "Watchdog".
func TestWatchdog(t *testing.T) {
RegisterFailHandler(Fail)
RunFrameworkSpecs(t, "Watchdog")
}

// t is the test framework instance shared by the specs of this suite. It is
// assigned in BeforeSuite and torn down in AfterSuite.
var t *TestFramework

// Create the framework and run its setup before the specs.
// NOTE(review): NilFunc is presumably a no-op setup/teardown hook — confirm
// against test/framework.
var _ = BeforeSuite(func() {
t = NewTestFramework(NilFunc, NilFunc)
t.Setup()
})

// Tear the shared framework down after the specs.
var _ = AfterSuite(func() {
t.Teardown()
})
43 changes: 43 additions & 0 deletions internal/watchdog/systemd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package watchdog

import (
"time"

"github.com/coreos/go-systemd/v22/daemon"
)

// Systemd is the main interface for supported systemd functionality.
// The Watchdog talks to systemd exclusively through this interface.
type Systemd interface {
// WatchdogEnabled returns the watchdog timeout configured for the
// service; a zero duration with a nil error means that no watchdog
// is enabled.
WatchdogEnabled() (time.Duration, error)
// Notify sends the given state string to the init daemon. The boolean
// reports whether the notification was actually sent.
Notify(string) (bool, error)
}

// defaultSystemd is the stateless Systemd implementation handed out by
// DefaultSystemd.
type defaultSystemd struct{}

// DefaultSystemd returns the default systemd implementation.
func DefaultSystemd() Systemd {
	return new(defaultSystemd)
}

// WatchdogEnabled returns watchdog information for a service.
// Processes should call Notify(daemon.SdNotifyWatchdog) every
// time / 2.
//
// It returns one of the following:
// (0, nil) - watchdog isn't enabled or we aren't the watched PID.
// (0, err) - an error happened (e.g. error converting time).
// (time, nil) - watchdog is enabled and we can send ping. time is delay
// before inactive service will be killed.
func (*defaultSystemd) WatchdogEnabled() (time.Duration, error) {
return daemon.SdWatchdogEnabled(false)

Check warning on line 32 in internal/watchdog/systemd.go

View check run for this annotation

Codecov / codecov/patch

internal/watchdog/systemd.go#L31-L32

Added lines #L31 - L32 were not covered by tests
}

// Notify sends a message to the init daemon. It is common to ignore the error.
//
// It returns one of the following:
// (false, nil) - notification not supported (i.e. NOTIFY_SOCKET is unset).
// (false, err) - notification supported, but failure happened (e.g. error connecting to NOTIFY_SOCKET or while sending data).
// (true, nil) - notification supported, data has been sent.
func (d *defaultSystemd) Notify(state string) (bool, error) {
return daemon.SdNotify(false, state)

Check warning on line 42 in internal/watchdog/systemd.go

View check run for this annotation

Codecov / codecov/patch

internal/watchdog/systemd.go#L41-L42

Added lines #L41 - L42 were not covered by tests
}
101 changes: 101 additions & 0 deletions internal/watchdog/watchdog.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
package watchdog

import (
"context"
"errors"
"fmt"
"sync/atomic"
"time"

"github.com/coreos/go-systemd/v22/daemon"
"k8s.io/apimachinery/pkg/util/wait"

"github.com/cri-o/cri-o/internal/log"
)

// Watchdog is the main structure for this package.
type Watchdog struct {
// systemd is the backend used to query the watchdog timeout and to send
// notifications; New sets it to DefaultSystemd().
systemd Systemd
// backoff configures the retries of a single watchdog notification.
backoff wait.Backoff
// healthCheckers run in order before each notification; the first
// failing checker suppresses the notification for that round.
healthCheckers []HealthCheckFn
// notifications counts the Notify calls made by the watchdog loop,
// regardless of whether they succeeded.
notifications atomic.Uint64
}

// minInterval is the lower bound for the systemd watchdog timeout: Start
// rejects every timeout that is not strictly greater than it.
const minInterval = time.Second

// HealthCheckFn is the health checker function type definition.
// The watchdog passes its context and the notification interval (half of the
// systemd watchdog timeout) as the duration argument. A non-nil error marks
// the process as unhealthy and suppresses the watchdog notification.
type HealthCheckFn func(context.Context, time.Duration) error

// New creates a new systemd Watchdog instance which consults the provided
// health checkers before every notification. It talks to systemd through
// DefaultSystemd() and retries a failing notification with a short
// exponential backoff.
func New(healthCheckers ...HealthCheckFn) *Watchdog {
	retry := wait.Backoff{
		Duration: time.Second,
		Factor:   2.0,
		Jitter:   0.1,
		Steps:    2,
	}

	wd := new(Watchdog)
	wd.systemd = DefaultSystemd()
	wd.backoff = retry
	wd.healthCheckers = healthCheckers

	return wd
}

// Start runs the watchdog.
//
// It queries systemd for the configured watchdog timeout and, if one is set,
// spawns a background goroutine which runs the health checkers and notifies
// systemd every timeout / 2. The goroutine stops once ctx is cancelled.
//
// It returns:
//   - nil if no watchdog is enabled (nothing is started) or if the
//     notification loop has been started.
//   - an error if the watchdog configuration cannot be retrieved or if the
//     timeout is not greater than minInterval.
func (w *Watchdog) Start(ctx context.Context) error {
	interval, err := w.systemd.WatchdogEnabled()
	if err != nil {
		return fmt.Errorf("configure watchdog: %w", err)
	}

	if interval == 0 {
		log.Infof(ctx, "No systemd watchdog enabled")
		return nil
	}

	// A timeout equal to minInterval is rejected as well, so the message
	// has to say "greater than" rather than "at least".
	if interval <= minInterval {
		return fmt.Errorf("watchdog timeout of %v should be greater than %v", interval, minInterval)
	}
	// Notify twice per watchdog timeout, as recommended by systemd.
	interval /= 2

	log.Infof(ctx, "Starting systemd watchdog using interval: %v", interval)

	// Tie the lifetime of the loop to ctx so that the goroutine does not
	// outlive its caller.
	go wait.UntilWithContext(ctx, func(ctx context.Context) {
		// An unhealthy process must not ping the watchdog, so that systemd
		// can act on the missing notification.
		if err := w.runHealthCheckers(ctx, interval); err != nil {
			log.Errorf(ctx, "Will not notify watchdog because CRI-O is unhealthy: %v", err)
			return
		}

		if err := wait.ExponentialBackoff(w.backoff, func() (bool, error) {
			gotAck, err := w.systemd.Notify(daemon.SdNotifyWatchdog)
			// Count every attempt, successful or not.
			w.notifications.Add(1)
			if err != nil {
				// Transient failure: let the backoff retry.
				log.Warnf(ctx, "Failed to notify systemd watchdog, retrying: %v", err)
				return false, nil
			}
			if !gotAck {
				// Retrying cannot help without a notify socket, abort the backoff.
				return false, errors.New("notification not supported (NOTIFY_SOCKET is unset)")
			}

			log.Debugf(ctx, "Systemd watchdog successfully notified")
			return true, nil
		}); err != nil {
			log.Errorf(ctx, "Failed to notify watchdog: %v", err)
		}
	}, interval)

	return nil
}

// Notifications returns the number of systemd notifications done so far.
// The counter is incremented after every Notify call of the watchdog loop,
// so failed attempts are included.
func (w *Watchdog) Notifications() uint64 {
	count := w.notifications.Load()
	return count
}

// runHealthCheckers invokes the registered health checkers in order, passing
// ctx and timeout to each of them. It stops at the first failing checker and
// returns its error wrapped; nil means that all checkers passed.
func (w *Watchdog) runHealthCheckers(ctx context.Context, timeout time.Duration) error {
	for i := range w.healthCheckers {
		check := w.healthCheckers[i]
		checkErr := check(ctx, timeout)
		if checkErr == nil {
			continue
		}
		return fmt.Errorf("health checker failed: %w", checkErr)
	}
	return nil
}
Loading
Loading