From 925d69a18da8e9199bb7754b3559607de33e1782 Mon Sep 17 00:00:00 2001 From: shlo Date: Mon, 24 Feb 2020 11:07:57 +0800 Subject: [PATCH 1/7] fix wording --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index be7f57b28..e71c7baa1 100644 --- a/README.md +++ b/README.md @@ -267,7 +267,7 @@ See [NPD e2e test documentation](https://github.com/kubernetes/node-problem-dete ## Problem Maker -[Problem maker](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/problemmaker/README.md) is a program used in NPD e2e tests to generate/simulate node problems. It is ONLY indented to be used by NPD e2e tests. Please do NOT run it on your workstation, as it could cause real node problems. +[Problem maker](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/problemmaker/README.md) is a program used in NPD e2e tests to generate/simulate node problems. It is ONLY intended to be used by NPD e2e tests. Please do NOT run it on your workstation, as it could cause real node problems. # Docs From 7fd465e195fc2a9e203775c9f24c9c445cf3d513 Mon Sep 17 00:00:00 2001 From: Andrew DeMaria Date: Thu, 5 Mar 2020 19:04:31 -0700 Subject: [PATCH 2/7] Add namespace option for events --- cmd/options/options.go | 3 +++ .../problemclient/problem_client.go | 24 ++++++++++--------- .../problemclient/problem_client_test.go | 2 +- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/cmd/options/options.go b/cmd/options/options.go index e9fbc5dac..3c252a9ae 100644 --- a/cmd/options/options.go +++ b/cmd/options/options.go @@ -49,6 +49,8 @@ type NodeProblemDetectorOptions struct { // k8sExporter options // EnableK8sExporter is the flag determining whether to report to Kubernetes. EnableK8sExporter bool + // EventNamespace is the namespace events are written to + EventNamespace string // ApiServerOverride is the custom URI used to connect to Kubernetes ApiServer. 
ApiServerOverride string // APIServerWaitTimeout is the timeout on waiting for kube-apiserver to be @@ -105,6 +107,7 @@ func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) { []string{}, "List of paths to custom plugin monitor config files, comma separated.") fs.MarkDeprecated("custom-plugin-monitors", "replaced by --config.custom-plugin-monitor. NPD will panic if both --custom-plugin-monitors and --config.custom-plugin-monitor are set.") fs.BoolVar(&npdo.EnableK8sExporter, "enable-k8s-exporter", true, "Enables reporting to Kubernetes API server.") + fs.StringVar(&npdo.EventNamespace, "event-namespace", "", "Namespace for recorded Kubernetes events.") fs.StringVar(&npdo.ApiServerOverride, "apiserver-override", "", "Custom URI used to connect to Kubernetes ApiServer. This is ignored if --enable-k8s-exporter is false.") fs.DurationVar(&npdo.APIServerWaitTimeout, "apiserver-wait-timeout", time.Duration(5)*time.Minute, "The timeout on waiting for kube-apiserver to be ready. This is ignored if --enable-k8s-exporter is false.") diff --git a/pkg/exporters/k8sexporter/problemclient/problem_client.go b/pkg/exporters/k8sexporter/problemclient/problem_client.go index 5fec1dbc1..1694ec82c 100644 --- a/pkg/exporters/k8sexporter/problemclient/problem_client.go +++ b/pkg/exporters/k8sexporter/problemclient/problem_client.go @@ -53,11 +53,12 @@ type Client interface { } type nodeProblemClient struct { - nodeName string - client typedcorev1.CoreV1Interface - clock clock.Clock - recorders map[string]record.EventRecorder - nodeRef *v1.ObjectReference + nodeName string + client typedcorev1.CoreV1Interface + clock clock.Clock + recorders map[string]record.EventRecorder + nodeRef *v1.ObjectReference + eventNamespace string } // NewClientOrDie creates a new problem client, panics if error occurs. 
@@ -76,7 +77,8 @@ func NewClientOrDie(npdo *options.NodeProblemDetectorOptions) Client { // TODO(random-liu): Set QPS Limit c.client = clientset.NewForConfigOrDie(cfg).CoreV1() c.nodeName = npdo.NodeName - c.nodeRef = getNodeRef(c.nodeName) + c.eventNamespace = npdo.EventNamespace + c.nodeRef = getNodeRef(c.eventNamespace, c.nodeName) c.recorders = make(map[string]record.EventRecorder) return c } @@ -113,7 +115,7 @@ func (c *nodeProblemClient) Eventf(eventType, source, reason, messageFmt string, recorder, found := c.recorders[source] if !found { // TODO(random-liu): If needed use separate client and QPS limit for event. - recorder = getEventRecorder(c.client, c.nodeName, source) + recorder = getEventRecorder(c.client, c.eventNamespace, c.nodeName, source) c.recorders[source] = recorder } recorder.Eventf(c.nodeRef, eventType, reason, messageFmt, args...) @@ -133,20 +135,20 @@ func generatePatch(conditions []v1.NodeCondition) ([]byte, error) { } // getEventRecorder generates a recorder for specific node name and source. 
-func getEventRecorder(c typedcorev1.CoreV1Interface, nodeName, source string) record.EventRecorder { +func getEventRecorder(c typedcorev1.CoreV1Interface, namespace, nodeName, source string) record.EventRecorder { eventBroadcaster := record.NewBroadcaster() eventBroadcaster.StartLogging(glog.V(4).Infof) recorder := eventBroadcaster.NewRecorder(legacyscheme.Scheme, v1.EventSource{Component: source, Host: nodeName}) - eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: c.Events("")}) + eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: c.Events(namespace)}) return recorder } -func getNodeRef(nodeName string) *v1.ObjectReference { +func getNodeRef(namespace, nodeName string) *v1.ObjectReference { // TODO(random-liu): Get node to initialize the node reference return &v1.ObjectReference{ Kind: "Node", Name: nodeName, UID: types.UID(nodeName), - Namespace: "", + Namespace: namespace, } } diff --git a/pkg/exporters/k8sexporter/problemclient/problem_client_test.go b/pkg/exporters/k8sexporter/problemclient/problem_client_test.go index 915748c32..200e3a809 100644 --- a/pkg/exporters/k8sexporter/problemclient/problem_client_test.go +++ b/pkg/exporters/k8sexporter/problemclient/problem_client_test.go @@ -42,7 +42,7 @@ func newFakeProblemClient() *nodeProblemClient { // TODO(random-liu): Add test for SetConditions when we have good fake for *client.Client clock: &clock.FakeClock{}, recorders: make(map[string]record.EventRecorder), - nodeRef: getNodeRef(testNode), + nodeRef: getNodeRef("", testNode), } } From f603f26afabf54935daaa740d6cbc4169f7a8239 Mon Sep 17 00:00:00 2001 From: Sean Malloy Date: Sun, 8 Mar 2020 22:30:51 -0500 Subject: [PATCH 3/7] Document Using Descheduler As a Remedy System In addition to using draino as a remedy system the k8s descheduler can also be used as a remedy system. 
--- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index e71c7baa1..ad6602b38 100644 --- a/README.md +++ b/README.md @@ -248,6 +248,10 @@ Kubernetes cluster to a healthy state. The following remedy systems exist: to automatically terminate drained nodes. Refer to [this issue](https://github.com/kubernetes/node-problem-detector/issues/199) for an example production use case for Draino. +* [**Descheduler**](https://github.com/kubernetes-sigs/descheduler) strategy RemovePodsViolatingNodeTaints + evicts pods violating NoSchedule taints on nodes. The k8s scheduler's TaintNodesByCondition feature must + be enabled. The [Cluster Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) + can be used to automatically terminate drained nodes. # Testing From 70c457e5df71f853d1e543fc20be3931ac7ca0b8 Mon Sep 17 00:00:00 2001 From: Stefan Majer Date: Wed, 25 Mar 2020 10:36:47 +0100 Subject: [PATCH 4/7] Install util-linux to have lsblk binary --- Dockerfile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.in b/Dockerfile.in index ee0609c54..7aff07c3e 100644 --- a/Dockerfile.in +++ b/Dockerfile.in @@ -15,7 +15,7 @@ FROM @BASEIMAGE@ MAINTAINER Random Liu -RUN clean-install libsystemd0 bash +RUN clean-install util-linux libsystemd0 bash # Avoid symlink of /etc/localtime. 
RUN test -h /etc/localtime && rm -f /etc/localtime && cp /usr/share/zoneinfo/UTC /etc/localtime || true From 74554c4b261a150b7ceb400aaf9335d1555dbc31 Mon Sep 17 00:00:00 2001 From: Mathieu Collin Date: Wed, 8 Apr 2020 11:24:56 +0200 Subject: [PATCH 5/7] update system-log-monitor and image version --- deployment/node-problem-detector.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deployment/node-problem-detector.yaml b/deployment/node-problem-detector.yaml index 0d75e6830..5f1d7a85d 100644 --- a/deployment/node-problem-detector.yaml +++ b/deployment/node-problem-detector.yaml @@ -19,8 +19,8 @@ spec: command: - /node-problem-detector - --logtostderr - - --system-log-monitors=/config/kernel-monitor.json,/config/docker-monitor.json - image: k8s.gcr.io/node-problem-detector:v0.6.3 + - --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json + image: k8s.gcr.io/node-problem-detector:v0.8.1 resources: limits: cpu: 10m From 5342a5087402c2a96490d98f7c7d2d506876d516 Mon Sep 17 00:00:00 2001 From: Abhilash Pallerlamudi Date: Wed, 15 Apr 2020 12:14:29 -0700 Subject: [PATCH 6/7] Add rhel support for osversion --- pkg/util/helpers.go | 2 ++ pkg/util/helpers_test.go | 6 ++++++ pkg/util/testdata/os-release-rhel | 15 +++++++++++++++ 3 files changed, 23 insertions(+) create mode 100644 pkg/util/testdata/os-release-rhel diff --git a/pkg/util/helpers.go b/pkg/util/helpers.go index 84cba0a92..25f9d6b7b 100644 --- a/pkg/util/helpers.go +++ b/pkg/util/helpers.go @@ -91,6 +91,8 @@ func GetOSVersion() (string, error) { return getDebianVersion(osReleaseMap), nil case "centos": return getDebianVersion(osReleaseMap), nil + case "rhel": + return getDebianVersion(osReleaseMap), nil default: return "", fmt.Errorf("Unsupported ID in /etc/os-release: %q", osReleaseMap["ID"]) } diff --git a/pkg/util/helpers_test.go b/pkg/util/helpers_test.go index 7e7d84d1f..e3642d332 100644 --- a/pkg/util/helpers_test.go +++ b/pkg/util/helpers_test.go @@ 
-167,6 +167,12 @@ func TestGetOSVersion(t *testing.T) { expectedOSVersion: "centos 7 (Core)", expectErr: false, }, + { + name: "rhel", + fakeOSReleasePath: "testdata/os-release-rhel", + expectedOSVersion: "rhel 7.7 (Maipo)", + expectErr: false, + }, { name: "Unknown", fakeOSReleasePath: "testdata/os-release-unknown", diff --git a/pkg/util/testdata/os-release-rhel b/pkg/util/testdata/os-release-rhel new file mode 100644 index 000000000..a8c873a33 --- /dev/null +++ b/pkg/util/testdata/os-release-rhel @@ -0,0 +1,15 @@ +NAME="Red Hat Enterprise Linux Server" +VERSION="7.7 (Maipo)" +ID="rhel" +ID_LIKE="fedora" +VERSION_ID="7.7" +PRETTY_NAME="Red Hat Enterprise Linux Server 7.7 (Maipo)" +ANSI_COLOR="0;31" +CPE_NAME="cpe:/o:redhat:enterprise_linux:7.7:GA:server" +HOME_URL="https://www.redhat.com/" +BUG_REPORT_URL="https://bugzilla.redhat.com/" + +REDHAT_BUGZILLA_PRODUCT="Red Hat Enterprise Linux 7" +REDHAT_BUGZILLA_PRODUCT_VERSION=7.7 +REDHAT_SUPPORT_PRODUCT="Red Hat Enterprise Linux" +REDHAT_SUPPORT_PRODUCT_VERSION="7.7" From 44dc4aa6c1447fff309529a832eabffd65310488 Mon Sep 17 00:00:00 2001 From: Archit Bansal Date: Mon, 11 May 2020 14:19:56 -0700 Subject: [PATCH 7/7] Add health-check-monitor --- Makefile | 13 +- cmd/healthchecker/health_checker.go | 57 ++++++++ cmd/healthchecker/options/options.go | 102 ++++++++++++++ cmd/healthchecker/options/options_test.go | 76 ++++++++++ config/health-checker-docker.json | 33 +++++ config/health-checker-kubelet.json | 33 +++++ pkg/healthchecker/health_checker.go | 163 ++++++++++++++++++++++ pkg/healthchecker/health_checker_test.go | 118 ++++++++++++++++ pkg/healthchecker/types/types.go | 37 +++++ 9 files changed, 630 insertions(+), 2 deletions(-) create mode 100644 cmd/healthchecker/health_checker.go create mode 100644 cmd/healthchecker/options/options.go create mode 100644 cmd/healthchecker/options/options_test.go create mode 100644 config/health-checker-docker.json create mode 100644 config/health-checker-kubelet.json create mode 
100644 pkg/healthchecker/health_checker.go create mode 100644 pkg/healthchecker/health_checker_test.go create mode 100644 pkg/healthchecker/types/types.go diff --git a/Makefile b/Makefile index c6e82852b..207925eba 100644 --- a/Makefile +++ b/Makefile @@ -113,6 +113,14 @@ endif -tags "$(BUILD_TAGS)" \ ./test/e2e/problemmaker/problem_maker.go +./bin/health-checker: $(PKG_SOURCES) + CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \ + -mod vendor \ + -o bin/health-checker \ + -ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \ + -tags "$(BUILD_TAGS)" \ + cmd/healthchecker/health_checker.go + Dockerfile: Dockerfile.in sed -e 's|@BASEIMAGE@|$(BASEIMAGE)|g' $< >$@ ifneq ($(ENABLE_JOURNALD), 1) @@ -134,12 +142,12 @@ e2e-test: vet fmt build-tar -boskos-project-type=$(BOSKOS_PROJECT_TYPE) -job-name=$(JOB_NAME) \ -artifacts-dir=$(ARTIFACTS) -build-binaries: ./bin/node-problem-detector ./bin/log-counter +build-binaries: ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker build-container: build-binaries Dockerfile docker build -t $(IMAGE) . -build-tar: ./bin/node-problem-detector ./bin/log-counter ./test/bin/problem-maker +build-tar: ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker ./test/bin/problem-maker tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh test/bin/problem-maker sha1sum $(TARBALL) md5sum $(TARBALL) @@ -164,6 +172,7 @@ push-tar: build-tar push: push-container push-tar clean: + rm -f bin/health-checker rm -f bin/log-counter rm -f bin/node-problem-detector rm -f test/bin/problem-maker diff --git a/cmd/healthchecker/health_checker.go b/cmd/healthchecker/health_checker.go new file mode 100644 index 000000000..7917fc7b5 --- /dev/null +++ b/cmd/healthchecker/health_checker.go @@ -0,0 +1,57 @@ +/* +Copyright 2020 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "flag" + "fmt" + "os" + + "github.com/spf13/pflag" + + "k8s.io/node-problem-detector/cmd/healthchecker/options" + "k8s.io/node-problem-detector/pkg/custompluginmonitor/types" + "k8s.io/node-problem-detector/pkg/healthchecker" +) + +func main() { + // Set glog flag so that it does not log to files. + if err := flag.Set("logtostderr", "true"); err != nil { + fmt.Printf("Failed to set logtostderr=true: %v", err) + os.Exit(int(types.Unknown)) + } + + hco := options.NewHealthCheckerOptions() + hco.AddFlags(pflag.CommandLine) + pflag.Parse() + hco.SetDefaults() + if err := hco.IsValid(); err != nil { + fmt.Println(err) + os.Exit(int(types.Unknown)) + } + + hc, err := healthchecker.NewHealthChecker(hco) + if err != nil { + fmt.Println(err) + os.Exit(int(types.Unknown)) + } + if !hc.CheckHealth() { + fmt.Printf("%v:%v was found unhealthy; repair flag : %v\n", hco.Component, hco.SystemdService, hco.EnableRepair) + os.Exit(int(types.NonOK)) + } + os.Exit(int(types.OK)) +} diff --git a/cmd/healthchecker/options/options.go b/cmd/healthchecker/options/options.go new file mode 100644 index 000000000..81395099d --- /dev/null +++ b/cmd/healthchecker/options/options.go @@ -0,0 +1,102 @@ +/* +Copyright 2020 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + "fmt" + "time" + + "github.com/spf13/pflag" + + "k8s.io/node-problem-detector/pkg/healthchecker/types" +) + +// NewHealthCheckerOptions returns an empty health check options struct. +func NewHealthCheckerOptions() *HealthCheckerOptions { + return &HealthCheckerOptions{} +} + +// HealthCheckerOptions are the options used to configure the health checker. +type HealthCheckerOptions struct { + Component string + SystemdService string + EnableRepair bool + CriCtlPath string + CriSocketPath string + CoolDownTime time.Duration + HealthCheckTimeout time.Duration +} + +// AddFlags adds health checker command line options to pflag. +func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) { + fs.StringVar(&hco.Component, "component", types.KubeletComponent, + "The component to check health for. Supports kubelet, docker and cri") + fs.StringVar(&hco.SystemdService, "systemd-service", "", + "The underlying systemd service responsible for the component. Set to the corresponding component for docker and kubelet, containerd for cri.") + fs.BoolVar(&hco.EnableRepair, "enable-repair", true, "Flag to enable/disable repair attempt for the component.") + fs.StringVar(&hco.CriCtlPath, "crictl-path", types.DefaultCriCtl, + "The path to the crictl binary. This is used to check health of cri component.") + fs.StringVar(&hco.CriSocketPath, "cri-socket-path", types.DefaultCriSocketPath, + "The path to the cri socket. 
Used with crictl to specify the socket path.") + fs.DurationVar(&hco.CoolDownTime, "cooldown-time", types.DefaultCoolDownTime, + "The duration to wait for the service to be up before attempting repair.") + fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout, + "The time to wait before marking the component as unhealthy.") +} + +// IsValid validates health checker command line options. +// Returns error if invalid, nil otherwise. +func (hco *HealthCheckerOptions) IsValid() error { + // Make sure the component specified is valid. + if hco.Component != types.KubeletComponent && hco.Component != types.DockerComponent && hco.Component != types.CRIComponent { + return fmt.Errorf("the component specified is not supported. Supported components are : ") + } + // Make sure the systemd service is specified if repair is enabled. + if hco.EnableRepair && hco.SystemdService == "" { + return fmt.Errorf("systemd-service cannot be empty when repair is enabled") + } + // Skip checking further if the component is not cri. + if hco.Component != types.CRIComponent { + return nil + } + // Make sure the crictl path is not empty for cri component. + if hco.Component == types.CRIComponent && hco.CriCtlPath == "" { + return fmt.Errorf("the crictl-path cannot be empty for cri component") + } + // Make sure the cri socket path is not empty for cri component. + if hco.Component == types.CRIComponent && hco.CriSocketPath == "" { + return fmt.Errorf("the cri-socket-path cannot be empty for cri component") + } + return nil +} + +// SetDefaults sets the default values for the dependent flags. 
+func (hco *HealthCheckerOptions) SetDefaults() { + if hco.SystemdService != "" { + return + } + if hco.Component != types.CRIComponent { + hco.SystemdService = hco.Component + return + } + hco.SystemdService = types.ContainerdService +} + +func init() { + pflag.CommandLine.AddGoFlagSet(flag.CommandLine) +} diff --git a/cmd/healthchecker/options/options_test.go b/cmd/healthchecker/options/options_test.go new file mode 100644 index 000000000..737f2ae96 --- /dev/null +++ b/cmd/healthchecker/options/options_test.go @@ -0,0 +1,76 @@ +/* +Copyright 2020 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package options + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "k8s.io/node-problem-detector/pkg/healthchecker/types" +) + +func TestIsValid(t *testing.T) { + testCases := []struct { + name string + hco HealthCheckerOptions + expectError bool + }{ + { + name: "valid component", + hco: HealthCheckerOptions{ + Component: types.KubeletComponent, + }, + expectError: false, + }, + { + name: "invalid component", + hco: HealthCheckerOptions{ + Component: "wrongComponent", + }, + expectError: true, + }, + { + name: "empty crictl-path with cri", + hco: HealthCheckerOptions{ + Component: types.CRIComponent, + CriCtlPath: "", + EnableRepair: false, + }, + expectError: true, + }, + { + name: "empty systemd-service and repair enabled", + hco: HealthCheckerOptions{ + Component: types.KubeletComponent, + EnableRepair: true, + SystemdService: "", + }, + expectError: true, + }, + } + + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + if test.expectError { + assert.Error(t, test.hco.IsValid(), "HealthChecker option %+v is invalid. Expected IsValid to return error.", test.hco) + } else { + assert.NoError(t, test.hco.IsValid(), "HealthChecker option %+v is valid. 
Expected IsValid to return nil.", test.hco) + } + }) + } +} diff --git a/config/health-checker-docker.json b/config/health-checker-docker.json new file mode 100644 index 000000000..66ff3ab3e --- /dev/null +++ b/config/health-checker-docker.json @@ -0,0 +1,33 @@ +{ + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "10s", + "timeout": "3m", + "max_output_length": 80, + "concurrency": 1 + }, + "source": "health-checker", + "metricsReporting": true, + "conditions": [ + { + "type": "ContainerRuntimeUnhealthy", + "reason": "ContainerRuntimeIsHealthy", + "message": "Container runtime on the node is functioning properly" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "ContainerRuntimeUnhealthy", + "reason": "DockerUnhealthy", + "path": "/home/kubernetes/bin/health-checker", + "args": [ + "--component=docker", + "--enable-repair=false", + "--cooldown-time=2m", + "--health-check-timeout=60s" + ], + "timeout": "3m" + } + ] +} diff --git a/config/health-checker-kubelet.json b/config/health-checker-kubelet.json new file mode 100644 index 000000000..929bf1689 --- /dev/null +++ b/config/health-checker-kubelet.json @@ -0,0 +1,33 @@ +{ + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "10s", + "timeout": "3m", + "max_output_length": 80, + "concurrency": 1 + }, + "source": "health-checker", + "metricsReporting": true, + "conditions": [ + { + "type": "KubeletUnhealthy", + "reason": "KubeletIsHealthy", + "message": "kubelet on the node is functioning properly" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "KubeletUnhealthy", + "reason": "KubeletUnhealthy", + "path": "/home/kubernetes/bin/health-checker", + "args": [ + "--component=kubelet", + "--enable-repair=false", + "--cooldown-time=1m", + "--health-check-timeout=10s" + ], + "timeout": "3m" + } + ] +} diff --git a/pkg/healthchecker/health_checker.go b/pkg/healthchecker/health_checker.go new file mode 100644 index 000000000..2e814c3bf --- /dev/null +++ 
b/pkg/healthchecker/health_checker.go @@ -0,0 +1,163 @@ +/* +Copyright 2020 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package healthchecker + +import ( + "context" + "errors" + "net/http" + "os/exec" + "strings" + "time" + + "github.com/golang/glog" + + "k8s.io/node-problem-detector/cmd/healthchecker/options" + "k8s.io/node-problem-detector/pkg/healthchecker/types" +) + +type healthChecker struct { + enableRepair bool + healthCheckFunc func() bool + // The repair is "best-effort" and ignores the error from the underlying actions. + // The bash commands to kill the process will fail if the service is down and hence ignore. + repairFunc func() + uptimeFunc func() (time.Duration, error) + crictlPath string + healthCheckTimeout time.Duration + coolDownTime time.Duration +} + +// NewHealthChecker returns a new health checker configured with the given options. +func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, error) { + hc := &healthChecker{ + enableRepair: hco.EnableRepair, + crictlPath: hco.CriCtlPath, + healthCheckTimeout: hco.HealthCheckTimeout, + coolDownTime: hco.CoolDownTime, + } + hc.healthCheckFunc = getHealthCheckFunc(hco) + hc.repairFunc = getRepairFunc(hco) + hc.uptimeFunc = getUptimeFunc(hco.SystemdService) + return hc, nil +} + +// getUptimeFunc returns the time for which the given service has been running. 
+func getUptimeFunc(service string) func() (time.Duration, error) { + return func() (time.Duration, error) { + out, err := execCommand(types.CmdTimeout, "systemctl", "show", service, "--property=ActiveEnterTimestamp") + if err != nil { + return time.Duration(0), err + } + val := strings.Split(out, "=") + if len(val) < 2 { + return time.Duration(0), errors.New("could not parse the service uptime time correctly") + } + t, err := time.Parse(types.UptimeTimeLayout, val[1]) + if err != nil { + return time.Duration(0), err + } + return time.Since(t), nil + } +} + +// getRepairFunc returns the repair function based on the component. +func getRepairFunc(hco *options.HealthCheckerOptions) func() { + switch hco.Component { + case types.DockerComponent: + // Use "docker ps" for docker health check. Not using crictl for docker to remove + // dependency on the kubelet. + return func() { + execCommand(types.CmdTimeout, "pkill", "-SIGUSR1", "dockerd") + execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.SystemdService) + } + default: + // Just kill the service for all other components + return func() { + execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.SystemdService) + } + } +} + +// getHealthCheckFunc returns the health check function based on the component. 
+func getHealthCheckFunc(hco *options.HealthCheckerOptions) func() bool { + switch hco.Component { + case types.KubeletComponent: + return func() bool { + httpClient := http.Client{Timeout: hco.HealthCheckTimeout} + response, err := httpClient.Get(types.KubeletHealthCheckEndpoint) + if err != nil || response.StatusCode != http.StatusOK { + return false + } + return true + } + case types.DockerComponent: + return func() bool { + if _, err := execCommand(hco.HealthCheckTimeout, "docker", "ps"); err != nil { + return false + } + return true + } + case types.CRIComponent: + return func() bool { + if _, err := execCommand(hco.HealthCheckTimeout, hco.CriCtlPath, "--runtime-endpoint="+hco.CriSocketPath, "--image-endpoint="+hco.CriSocketPath, "pods"); err != nil { + return false + } + return true + } + } + return nil +} + +// CheckHealth checks for the health of the component and tries to repair if enabled. +// Returns true if healthy, false otherwise. +func (hc *healthChecker) CheckHealth() bool { + healthy := hc.healthCheckFunc() + if healthy { + return true + } + // The service is unhealthy. + // Attempt repair based on flag. + if hc.enableRepair { + glog.Infof("health-checker: component is unhealthy, proceeding to repair") + // repair if the service has been up for the cool down period. + uptime, err := hc.uptimeFunc() + if err != nil { + glog.Infof("health-checker: %v\n", err.Error()) + } + glog.Infof("health-checker: component uptime: %v\n", uptime) + if uptime > hc.coolDownTime { + hc.repairFunc() + } + } + return false +} + +// execCommand executes the bash command and returns the (output, error) from command, error if timeout occurs. +func execCommand(timeout time.Duration, command string, args ...string) (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + cmd := exec.CommandContext(ctx, command, args...) 
+ glog.Infof("health-checker: executing command : %v\n", cmd) + out, err := cmd.Output() + if err != nil { + glog.Infof("health-checker: command failed : %v, %v\n", err.Error(), out) + return "", err + } + return strings.TrimSuffix(string(out), "\n"), nil +} diff --git a/pkg/healthchecker/health_checker_test.go b/pkg/healthchecker/health_checker_test.go new file mode 100644 index 000000000..aa21097e3 --- /dev/null +++ b/pkg/healthchecker/health_checker_test.go @@ -0,0 +1,118 @@ +/* +Copyright 2020 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package healthchecker + +import ( + "testing" + "time" + + "k8s.io/node-problem-detector/pkg/healthchecker/types" +) + +var repairCalled bool + +func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() bool, uptimeFunc func() (time.Duration, error), enableRepair bool) types.HealthChecker { + repairCalled = false + return &healthChecker{ + enableRepair: enableRepair, + healthCheckFunc: healthCheckFunc, + repairFunc: repairFunc, + uptimeFunc: uptimeFunc, + healthCheckTimeout: time.Second, + coolDownTime: 2 * time.Second, + } +} + +func healthyFunc() bool { + return true +} + +func unhealthyFunc() bool { + return false +} + +func repairFunc() { + repairCalled = true +} + +func longServiceUptimeFunc() (time.Duration, error) { + return 1 * time.Hour, nil +} + +func shortServiceUptimeFunc() (time.Duration, error) { + return 1 * time.Second, nil +} + +func TestHealthCheck(t *testing.T) { + for _, tc := range []struct { + description string + enableRepair bool + healthy bool + healthCheckFunc func() bool + uptimeFunc func() (time.Duration, error) + repairFunc func() + repairCalled bool + }{ + { + description: "healthy component", + enableRepair: true, + healthy: true, + healthCheckFunc: healthyFunc, + repairFunc: repairFunc, + uptimeFunc: shortServiceUptimeFunc, + repairCalled: false, + }, + { + description: "unhealthy component and disabled repair", + enableRepair: false, + healthy: false, + healthCheckFunc: unhealthyFunc, + repairFunc: repairFunc, + uptimeFunc: shortServiceUptimeFunc, + repairCalled: false, + }, + { + description: "unhealthy component, enabled repair and component in cool dowm", + enableRepair: true, + healthy: false, + healthCheckFunc: unhealthyFunc, + repairFunc: repairFunc, + uptimeFunc: shortServiceUptimeFunc, + repairCalled: false, + }, + { + description: "unhealthy component, enabled repair and component out of cool dowm", + enableRepair: true, + healthy: false, + healthCheckFunc: unhealthyFunc, + repairFunc: repairFunc, + 
uptimeFunc: longServiceUptimeFunc, + repairCalled: true, + }, + } { + t.Run(tc.description, func(t *testing.T) { + hc := NewTestHealthChecker(tc.repairFunc, tc.healthCheckFunc, tc.uptimeFunc, tc.enableRepair) + healthy := hc.CheckHealth() + if healthy != tc.healthy { + t.Errorf("incorrect health returned got %t; expected %t", healthy, tc.healthy) + } + if repairCalled != tc.repairCalled { + t.Errorf("incorrect repairCalled got %t; expected %t", repairCalled, tc.repairCalled) + } + }) + } +} diff --git a/pkg/healthchecker/types/types.go b/pkg/healthchecker/types/types.go new file mode 100644 index 000000000..c4334c66b --- /dev/null +++ b/pkg/healthchecker/types/types.go @@ -0,0 +1,37 @@ +/* +Copyright 2020 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package types + +import "time" + +const ( + DefaultCoolDownTime = 2 * time.Minute + DefaultHealthCheckTimeout = 10 * time.Second + CmdTimeout = 10 * time.Second + DefaultCriCtl = "/usr/bin/crictl" + DefaultCriSocketPath = "unix:///var/run/containerd/containerd.sock" + KubeletComponent = "kubelet" + CRIComponent = "cri" + DockerComponent = "docker" + ContainerdService = "containerd" + KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz" + UptimeTimeLayout = "Mon 2006-01-02 15:04:05 UTC" +) + +type HealthChecker interface { + CheckHealth() bool +}