From a3336a12722854c9d89e3cb685e1e16c6b420b60 Mon Sep 17 00:00:00 2001 From: Sohan Kunkerkar Date: Thu, 31 Jul 2025 08:09:53 -0400 Subject: [PATCH 1/7] server: ensure CNI teardown prevents IP leaks with missing netns Signed-off-by: Sohan Kunkerkar --- server/sandbox_network.go | 58 ++++++++++++++++++++++++++++++++------- test/network.bats | 29 ++++++++++++++++++++ 2 files changed, 77 insertions(+), 10 deletions(-) diff --git a/server/sandbox_network.go b/server/sandbox_network.go index 4c86f501c94..c11c46f9176 100644 --- a/server/sandbox_network.go +++ b/server/sandbox_network.go @@ -5,6 +5,8 @@ import ( "fmt" "math" "os" + "path/filepath" + "strings" "time" cnitypes "github.com/containernetworking/cni/pkg/types" @@ -18,6 +20,10 @@ import ( "github.com/cri-o/cri-o/server/metrics" ) +const ( + cacheDir = "/var/lib/cni/results" +) + // networkStart sets up the sandbox's network and returns the pod IP on success // or an error. func (s *Server) networkStart(ctx context.Context, sb *sandbox.Sandbox) (podIPs []string, result cnitypes.Result, retErr error) { @@ -185,43 +191,75 @@ func (s *Server) networkStop(ctx context.Context, sb *sandbox.Sandbox) error { } // Check if the network namespace file exists and is valid before attempting CNI teardown. - // If the file doesn't exist or is invalid, skip CNI teardown and mark network as stopped. + // If the file doesn't exist or is invalid, we should still attempt CNI teardown using cached information + // to prevent IP leaks, but we'll mark the network as stopped regardless of the outcome. + netnsValid := true + if podNetwork.NetNS != "" { if _, statErr := os.Stat(podNetwork.NetNS); statErr != nil { - // Network namespace file doesn't exist, mark network as stopped and return success - log.Debugf(ctx, "Network namespace file %s does not exist for pod sandbox %s(%s), skipping CNI teardown", + // Network namespace file doesn't exist, but we should still attempt CNI teardown + log.Debugf(ctx, "Network namespace file %s does not exist for pod sandbox %s(%s), attempting CNI teardown with cached info", podNetwork.NetNS, sb.Name(), sb.ID()) - return sb.SetNetworkStopped(ctx, true) - } - - if validateErr := s.validateNetworkNamespace(podNetwork.NetNS); validateErr != nil { + netnsValid = false + } else if validateErr := s.validateNetworkNamespace(podNetwork.NetNS); validateErr != nil { // Network namespace file exists but is invalid (e.g., corrupted or fake file) - log.Warnf(ctx, "Network namespace file %s is invalid for pod sandbox %s(%s): %v, removing and skipping CNI teardown", + log.Warnf(ctx, "Network namespace file %s is invalid for pod sandbox %s(%s): %v, removing and attempting CNI teardown with cached info", podNetwork.NetNS, sb.Name(), sb.ID(), validateErr) s.cleanupNetns(ctx, podNetwork.NetNS, sb) - return sb.SetNetworkStopped(ctx, true) + netnsValid = false } } + // Always attempt CNI teardown to prevent IP leaks, even if netns is invalid. if err := s.config.CNIPlugin().TearDownPodWithContext(stopCtx, podNetwork); err != nil { log.Warnf(ctx, "Failed to destroy network for pod sandbox %s(%s): %v", sb.Name(), sb.ID(), err) // If the network namespace exists but CNI teardown failed, try to clean it up. - if podNetwork.NetNS != "" { + if podNetwork.NetNS != "" && netnsValid { if _, statErr := os.Stat(podNetwork.NetNS); statErr == nil { // Clean up the netns file since CNI teardown failed. s.cleanupNetns(ctx, podNetwork.NetNS, sb) } } + // Clean up CNI result files if CNI teardown failed. + s.cleanupCNIResultFiles(ctx, sb.ID()) + + // Even if CNI teardown failed, mark network as stopped to prevent retry loops. + if setErr := sb.SetNetworkStopped(ctx, true); setErr != nil { + log.Warnf(ctx, "Failed to set network stopped for pod sandbox %s(%s): %v", sb.Name(), sb.ID(), setErr) + } + return fmt.Errorf("network teardown failed for pod sandbox %s(%s): %w", sb.Name(), sb.ID(), err) } return sb.SetNetworkStopped(ctx, true) } +// cleanupCNIResultFiles removes CNI result files for a given container ID. +// This is called when CNI teardown fails to prevent stale result files from accumulating. +func (s *Server) cleanupCNIResultFiles(ctx context.Context, containerID string) { + entries, err := os.ReadDir(cacheDir) + if err != nil { + log.Warnf(ctx, "Failed to read CNI cache directory %s: %v", cacheDir, err) + + return + } + + for _, entry := range entries { + if !entry.IsDir() && strings.Contains(entry.Name(), containerID) { + filePath := filepath.Join(cacheDir, entry.Name()) + if err := os.Remove(filePath); err != nil { + log.Warnf(ctx, "Failed to remove CNI result file %s: %v", filePath, err) + } else { + log.Infof(ctx, "Cleaned up CNI result file %s for container %s", entry.Name(), containerID) + } + } + } +} + func (s *Server) newPodNetwork(ctx context.Context, sb *sandbox.Sandbox) (ocicni.PodNetwork, error) { _, span := log.StartSpan(ctx) defer span.End() diff --git a/test/network.bats b/test/network.bats index 09a63e1f818..6c4348e0f47 100644 --- a/test/network.bats +++ b/test/network.bats @@ -224,3 +224,32 @@ function check_networking() { crictl stopp "$new_pod_id" crictl rmp "$new_pod_id" } + +@test "CNI teardown called even with missing or invalid netns to prevent IP leaks" { + start_crio + + pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json) + + # Get the network namespace path + NETNS_PATH=/var/run/netns/ + NS=$(crictl inspectp "$pod_id" | + jq -er '.info.runtimeSpec.linux.namespaces[] | select(.type == "network").path | sub("'$NETNS_PATH'"; "")') + + output=$(crictl inspectp "$pod_id" | jq -r '.status.state') + [[ "$output" == "SANDBOX_READY" ]] + + # Stop the pod first to release the network namespace. + crictl stopp "$pod_id" + + # Now remove the network namespace file to simulate the issue. + rm -f "$NETNS_PATH$NS" + + # Remove the pod - this should still call CNI teardown. + crictl rmp "$pod_id" + + # Verify CNI teardown was called by checking logs. + grep -q "Deleting pod.*from CNI network" "$CRIO_LOG" + + # Verify the pod is gone. + run ! crictl inspectp "$pod_id" +} From f7566ebfdd013de7f67f5b27a4fe9875432fa52b Mon Sep 17 00:00:00 2001 From: Sascha Grunert Date: Tue, 29 Jul 2025 09:51:07 +0200 Subject: [PATCH 2/7] OCI artifact: pull image before artifact We turn the logic for pulling artifacts vs images around to try pulling the image first. Reason is that there are a bunch of error cases (unauthorized registry access, disconnected environments or ImageTagMirrorSet configurations) which are not covered by OCI artifacts. To avoid regressing that behavior we now focus on pulling images first before giving artifacts a try. Signed-off-by: Sascha Grunert --- internal/storage/image.go | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/internal/storage/image.go b/internal/storage/image.go index c0e384de03d..70c3e897c1d 100644 --- a/internal/storage/image.go +++ b/internal/storage/image.go @@ -873,26 +873,24 @@ func pullImageImplementation(ctx context.Context, lookup *imageLookupService, st return RegistryImageReference{}, err } - manifestBytes, err := ociartifact.NewStore(store.GraphRoot(), &srcSystemContext).PullManifest(ctx, srcRef, &ociartifact.PullOptions{CopyOptions: &libimage.CopyOptions{ + manifestBytes, err := copy.Image(ctx, policyContext, destRef, srcRef, ©.Options{ + SourceCtx: &srcSystemContext, + DestinationCtx: options.DestinationCtx, OciDecryptConfig: options.OciDecryptConfig, + ProgressInterval: options.ProgressInterval, Progress: options.Progress, - RemoveSignatures: true, // signature is not supported for OCI layout dest - }}) + }) if err != nil { - if !errors.Is(err, ociartifact.ErrIsAnImage) { - return RegistryImageReference{}, fmt.Errorf("unable to try pulling possible OCI artifact: %w", err) - } - - manifestBytes, err = copy.Image(ctx, policyContext, destRef, srcRef, ©.Options{ - SourceCtx: &srcSystemContext, - DestinationCtx: options.DestinationCtx, + artifactManifestBytes, artifactErr := ociartifact.NewStore(store.GraphRoot(), &srcSystemContext).PullManifest(ctx, srcRef, &ociartifact.PullOptions{CopyOptions: &libimage.CopyOptions{ OciDecryptConfig: options.OciDecryptConfig, - ProgressInterval: options.ProgressInterval, Progress: options.Progress, - }) - if err != nil { - return RegistryImageReference{}, err + RemoveSignatures: true, // signature is not supported for OCI layout dest + }}) + if artifactErr != nil { + return RegistryImageReference{}, fmt.Errorf("unable to pull image or OCI artifact: pull image err: %w; artifact err: %w", err, artifactErr) } + + manifestBytes = artifactManifestBytes } manifestDigest, err := manifest.Digest(manifestBytes) From 8e0dc356a647386ce0ec920addf29de62b893569 Mon Sep 17 00:00:00 2001 From: Sohan Kunkerkar Date: Wed, 6 Aug 2025 00:03:00 -0400 Subject: [PATCH 3/7] server: validate memory usage before applying limit decreases This change validates memory limits against current usage from cgroup files instead of stale cAdvisor cache. Signed-off-by: Sohan Kunkerkar --- internal/config/cgmgr/cgmgr_linux.go | 8 +-- internal/config/cgmgr/systemd_linux.go | 4 +- server/container_update_resources.go | 5 ++ server/container_update_resources_linux.go | 59 +++++++++++++++++++ .../container_update_resources_unsupported.go | 21 +++++++ test/ctr.bats | 50 ++++++++++++++++ 6 files changed, 141 insertions(+), 6 deletions(-) create mode 100644 server/container_update_resources_linux.go create mode 100644 server/container_update_resources_unsupported.go diff --git a/internal/config/cgmgr/cgmgr_linux.go b/internal/config/cgmgr/cgmgr_linux.go index a614eee4783..dba264ea746 100644 --- a/internal/config/cgmgr/cgmgr_linux.go +++ b/internal/config/cgmgr/cgmgr_linux.go @@ -30,9 +30,9 @@ const ( // these constants define the path and name of the memory max file // for v1 and v2 respectively. - cgroupMemoryPathV1 = "/sys/fs/cgroup/memory" + CgroupMemoryPathV1 = "/sys/fs/cgroup/memory" cgroupMemoryMaxFileV1 = "memory.limit_in_bytes" - cgroupMemoryPathV2 = "/sys/fs/cgroup" + CgroupMemoryPathV2 = "/sys/fs/cgroup" cgroupMemoryMaxFileV2 = "memory.max" ) @@ -106,13 +106,13 @@ func SetCgroupManager(cgroupManager string) (CgroupManager, error) { case cgroupfsCgroupManager: if node.CgroupIsV2() { return &CgroupfsManager{ - memoryPath: cgroupMemoryPathV2, + memoryPath: CgroupMemoryPathV2, memoryMaxFile: cgroupMemoryMaxFileV2, }, nil } return &CgroupfsManager{ - memoryPath: cgroupMemoryPathV1, + memoryPath: CgroupMemoryPathV1, memoryMaxFile: cgroupMemoryMaxFileV1, v1CtrCgMgr: make(map[string]libctr.Manager), v1SbCgMgr: make(map[string]libctr.Manager), diff --git a/internal/config/cgmgr/systemd_linux.go b/internal/config/cgmgr/systemd_linux.go index 5497a40ddd7..e0ae7b8c5a4 100644 --- a/internal/config/cgmgr/systemd_linux.go +++ b/internal/config/cgmgr/systemd_linux.go @@ -42,10 +42,10 @@ type SystemdManager struct { func NewSystemdManager() *SystemdManager { systemdMgr := SystemdManager{} if node.CgroupIsV2() { - systemdMgr.memoryPath = cgroupMemoryPathV2 + systemdMgr.memoryPath = CgroupMemoryPathV2 systemdMgr.memoryMaxFile = cgroupMemoryMaxFileV2 } else { - systemdMgr.memoryPath = cgroupMemoryPathV1 + systemdMgr.memoryPath = CgroupMemoryPathV1 systemdMgr.memoryMaxFile = cgroupMemoryMaxFileV1 systemdMgr.v1CtrCgMgr = make(map[string]cgroups.Manager) systemdMgr.v1SbCgMgr = make(map[string]cgroups.Manager) diff --git a/server/container_update_resources.go b/server/container_update_resources.go index 5977726060d..1777e8fc779 100644 --- a/server/container_update_resources.go +++ b/server/container_update_resources.go @@ -44,6 +44,11 @@ func (s *Server) UpdateContainerResources(ctx context.Context, req *types.Update updated = req.Linux } + // Validate memory update before applying it. + if err := s.validateMemoryUpdate(ctx, c, updated.GetMemoryLimitInBytes()); err != nil { + return nil, fmt.Errorf("memory validation failed: %w", err) + } + resources := toOCIResources(updated) if err := s.ContainerServer.Runtime().UpdateContainer(ctx, c, resources); err != nil { return nil, err diff --git a/server/container_update_resources_linux.go b/server/container_update_resources_linux.go new file mode 100644 index 00000000000..02ee9959853 --- /dev/null +++ b/server/container_update_resources_linux.go @@ -0,0 +1,59 @@ +//go:build linux + +package server + +import ( + "context" + "fmt" + + "github.com/cri-o/cri-o/internal/log" + "github.com/cri-o/cri-o/internal/oci" +) + +// validateMemoryUpdate checks if the new memory limit is safe to apply by getting current usage from CRI-O's stats server. +// This ensures real-time accuracy and prevents decreasing memory limits below current usage. +func (s *Server) validateMemoryUpdate(ctx context.Context, c *oci.Container, newMemoryLimit int64) error { + // Negative memory limits are invalid + if newMemoryLimit < 0 { + return fmt.Errorf("invalid memory limit: %d bytes (cannot be negative)", newMemoryLimit) + } + + // Zero means unlimited memory, no validation needed. + if newMemoryLimit == 0 { + return nil + } + + sb := s.GetSandbox(c.Sandbox()) + if sb == nil { + log.Warnf(ctx, "Could not get sandbox %s for container %s", c.Sandbox(), c.ID()) + + return nil + } + + containerStats := s.StatsForContainer(c, sb) + if containerStats == nil { + log.Warnf(ctx, "No memory stats available for container %s", c.ID()) + + return nil + } + + usageBytes := containerStats.GetMemory().GetUsageBytes() + if usageBytes == nil { + log.Warnf(ctx, "Memory usage not available for container %s", c.ID()) + + return nil + } + + currentUsage := int64(usageBytes.GetValue()) + + // Check if new limit is below current usage. + if newMemoryLimit < currentUsage { + return fmt.Errorf("cannot decrease memory limit to %d bytes: current usage is %d bytes", + newMemoryLimit, currentUsage) + } + + log.Debugf(ctx, "Memory validation passed: new limit %d >= current usage %d", + newMemoryLimit, currentUsage) + + return nil +} diff --git a/server/container_update_resources_unsupported.go b/server/container_update_resources_unsupported.go new file mode 100644 index 00000000000..7f14285e0f4 --- /dev/null +++ b/server/container_update_resources_unsupported.go @@ -0,0 +1,21 @@ +//go:build !linux + +package server + +import ( + "context" + "fmt" + + "github.com/cri-o/cri-o/internal/oci" +) + +// validateMemoryUpdate is a no-op on non-Linux platforms since cgroups don't exist. +// However, we still validate basic input constraints. +func (s *Server) validateMemoryUpdate(ctx context.Context, c *oci.Container, newMemoryLimit int64) error { + // Negative memory limits are invalid + if newMemoryLimit < 0 { + return fmt.Errorf("invalid memory limit: %d bytes (cannot be negative)", newMemoryLimit) + } + + return nil +} diff --git a/test/ctr.bats b/test/ctr.bats index 3990464a90f..e1fd1abd51f 100644 --- a/test/ctr.bats +++ b/test/ctr.bats @@ -1619,3 +1619,53 @@ EOF output=$(run crictl inspect "$ctr_id" | jq -r '.status.exitCode') [[ "$output" == "0" ]] } + +@test "memory limit decrease below usage should be blocked" { + start_crio + + # Create a container config with initial memory limit of 128MB + jq '.command = ["/bin/sh", "-c", "dd if=/dev/zero of=/dev/shm/memtest bs=1M count=64 && echo '\''Memory allocated: 64MB'\'' && sleep 300"] | .linux.resources.memory_limit_in_bytes = 134217728' \ + "$TESTDATA"/container_config.json > "$TESTDIR"/container_memory.json + + # Run the container + ctr_id=$(crictl run "$TESTDIR"/container_memory.json "$TESTDATA"/sandbox_config.json) + + # Wait a moment for memory allocation + sleep 3 + + # Check current memory usage to ensure it's above our target limit (32MB = 33554432 bytes). + run crictl stats --output json "$ctr_id" + [[ "$status" -eq 0 ]] + current_usage=$(echo "$output" | jq -r '.stats[0].memory.usageBytes.value') + + [[ "$current_usage" -gt 33554432 ]] + + # Attempt to update memory limit to 32MB (below current usage) - should fail + run crictl update --memory 33554432 "$ctr_id" + echo "Update attempt output: $output" + [[ "$status" -ne 0 ]] + [[ "$output" =~ "cannot decrease memory limit" ]] + + # Verify the container is still running with original memory limit. + run crictl inspect "$ctr_id" + [[ "$status" -eq 0 ]] + [[ "$output" == *"CONTAINER_RUNNING"* ]] + + # Check that memory limit is still the original 128MB (134217728 bytes) + memory_limit=$(echo "$output" | jq -r '.info.runtimeSpec.linux.resources.memory.limit') + [[ "$memory_limit" == "134217728" ]] + + # Test that memory limit increase still works (256MB = 268435456 bytes) + run crictl update --memory 268435456 "$ctr_id" + echo "Increase attempt output: $output" + [[ "$status" -eq 0 ]] + + # Verify the memory limit was actually updated to 256MB + run crictl inspect "$ctr_id" + [[ "$status" -eq 0 ]] + memory_limit_after_increase=$(echo "$output" | jq -r '.info.runtimeSpec.linux.resources.memory.limit') + [[ "$memory_limit_after_increase" == "268435456" ]] + + crictl stop "$ctr_id" + crictl rm "$ctr_id" +} From 0f9ff66e915487be5c5507ad7ceee9dad494cc71 Mon Sep 17 00:00:00 2001 From: Ayato Tokubi Date: Fri, 15 Aug 2025 14:46:46 +0000 Subject: [PATCH 4/7] Fix potential panic when it fails to parse ref name. Signed-off-by: Ayato Tokubi --- internal/ociartifact/store.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/ociartifact/store.go b/internal/ociartifact/store.go index a4db21f9a92..80f05d2b2c8 100644 --- a/internal/ociartifact/store.go +++ b/internal/ociartifact/store.go @@ -340,6 +340,8 @@ func (s *Store) buildArtifact(ctx context.Context, item *layout.ListResult) (*Ar namedRef, err := reference.ParseNormalizedNamed(val) if err != nil { log.Warnf(ctx, "Failed to parse annotation ref %s with the error %s", val, err) + + namedRef = unknownRef{} } artifact.namedRef = namedRef From a27c3594e055fac4ea98beb27e5f4db8350889d2 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Tue, 19 Aug 2025 14:21:18 -0400 Subject: [PATCH 5/7] inspect: add hostnetwork information which could allow cadvisor to choose not to report network metrics Signed-off-by: Peter Hunt --- pkg/types/types.go | 1 + server/inspect.go | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pkg/types/types.go b/pkg/types/types.go index 75ad96d16a9..3daf9b7f9dd 100644 --- a/pkg/types/types.go +++ b/pkg/types/types.go @@ -18,6 +18,7 @@ type ContainerInfo struct { Root string `json:"root"` Sandbox string `json:"sandbox"` IPs []string `json:"ip_addresses"` + HostNetwork *bool `json:"host_network"` } // IDMappings specifies the ID mappings used for containers. diff --git a/server/inspect.go b/server/inspect.go index 86eb72f909f..933858e8833 100644 --- a/server/inspect.go +++ b/server/inspect.go @@ -15,6 +15,7 @@ import ( "github.com/go-chi/chi/v5" json "github.com/json-iterator/go" "github.com/sirupsen/logrus" + "k8s.io/utils/ptr" "github.com/cri-o/cri-o/internal/lib/sandbox" "github.com/cri-o/cri-o/internal/log" @@ -130,6 +131,7 @@ func (s *Server) getContainerInfo(ctx context.Context, id string, getContainerFu LogPath: ctr.LogPath(), Sandbox: ctr.Sandbox(), IPs: sb.IPs(), + HostNetwork: ptr.To(sb.HostNetwork()), }, nil } From f0532ccf45943f3b8cf23f98a667bee2ab493ae2 Mon Sep 17 00:00:00 2001 From: Sunnatillo Date: Mon, 1 Sep 2025 09:46:03 +0300 Subject: [PATCH 6/7] Update nixpkgs Signed-off-by: Sunnatillo --- nix/nixpkgs.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nix/nixpkgs.json b/nix/nixpkgs.json index 316daf651d6..280665baf6b 100644 --- a/nix/nixpkgs.json +++ b/nix/nixpkgs.json @@ -1,10 +1,10 @@ { "url": "https://github.com/nixos/nixpkgs", - "rev": "11c738bae485c60a87e952a589cd769783ce3748", - "date": "2025-06-22T20:02:23-07:00", - "path": "/nix/store/wcjn2craafchk3kigd8nm9vjpic1ld1k-nixpkgs", - "sha256": "135m5zl2ivc46x8ya936r5i174ki69j8nrnqnzvswbadnpi4g93k", - "hash": "sha256-c6RH4rVNLa73t9hmi2QycZITYslmJOVRN4TtKOgvtYw=", + "rev": "c73522789a3c7552b1122773d6eaa34e1491cc1c", + "date": "2025-08-28T22:42:44-05:00", + "path": "/nix/store/j0bjdb6bigfx6b2q85r6dkyr1w6a4021-nixpkgs", + "sha256": "1s79wnz8as9g2mas3rm72iapizri79d1724p0nk288b48kfkp3na", + "hash": "sha256-yo473URkISSmBZeIE1o6Mf94VRSn5qFVFS9phb7l6eg=", "fetchLFS": false, "fetchSubmodules": false, "deepClone": false, From 6d41f6cbccba8a1e72d334ef9eb6ad7bbaefd592 Mon Sep 17 00:00:00 2001 From: Kubernetes Release Robot Date: Mon, 1 Sep 2025 00:32:28 +0000 Subject: [PATCH 7/7] version: bump to 1.33.4 Signed-off-by: Kubernetes Release Robot Signed-off-by: Sascha Grunert --- dependencies.yaml | 2 +- internal/version/version.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 25c478fea10..b2f0ec8cba2 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -13,7 +13,7 @@ dependencies: # tag the .0 release if it does not already exist. If the .0 release is done, # increase the development version to the next minor (1.x.0). - name: development version - version: 1.33.2 + version: 1.33.4 refPaths: - path: internal/version/version.go match: Version diff --git a/internal/version/version.go b/internal/version/version.go index 48918590bba..5624bb3dcf2 100644 --- a/internal/version/version.go +++ b/internal/version/version.go @@ -21,7 +21,7 @@ import ( ) // Version is the version of the build. -const Version = "1.33.3" +const Version = "1.33.4" // ReleaseMinorVersions are the currently supported minor versions. var ReleaseMinorVersions = []string{"1.33", "1.32", "1.31", "1.30"}