Skip to content

Commit 5ee6df4

Browse files
Merge pull request #9643 from saschagrunert/fix-pod-stop-timeout-splitting
Fix pod stop timeout allocation for containers and infra
2 parents f46bf73 + a7b2228 commit 5ee6df4

File tree

2 files changed

+54
-7
lines changed

2 files changed

+54
-7
lines changed

server/sandbox_stop_freebsd.go

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@ import (
66
"fmt"
77

88
"go.podman.io/storage"
9+
"golang.org/x/sync/errgroup"
10+
types "k8s.io/cri-api/pkg/apis/runtime/v1"
11+
912
"github.com/cri-o/cri-o/internal/lib/sandbox"
1013
"github.com/cri-o/cri-o/internal/log"
1114
oci "github.com/cri-o/cri-o/internal/oci"
12-
"golang.org/x/sync/errgroup"
13-
types "k8s.io/cri-api/pkg/apis/runtime/v1"
1415
)
1516

1617
func (s *Server) stopPodSandbox(ctx context.Context, sb *sandbox.Sandbox) error {
@@ -30,24 +31,47 @@ func (s *Server) stopPodSandbox(ctx context.Context, sb *sandbox.Sandbox) error
3031
return nil
3132
}
3233

34+
// Calculate the timeout once. Regular containers get most of the timeout,
35+
// reserving a small amount for infra container shutdown. The infra container
36+
// will use the full totalTimeout, allowing it to use any time saved from
37+
// containers stopping earlier than their allocated timeout.
38+
totalTimeout := stopTimeoutFromContext(ctx)
39+
40+
const infraReservedTimeout int64 = 1 // 1 second reserved for infra container
41+
42+
var containerTimeout int64
43+
44+
if totalTimeout < infraReservedTimeout*2 {
45+
// If total timeout is too small, split evenly
46+
containerTimeout = totalTimeout / 2
47+
} else {
48+
// Reserve fixed time for infra, give rest to containers
49+
containerTimeout = totalTimeout - infraReservedTimeout
50+
}
51+
3352
errorGroup := &errgroup.Group{}
3453

3554
for _, ctr := range sb.Containers().List() {
3655
if ctr.State().Status == oci.ContainerStateStopped {
3756
continue
3857
}
3958

59+
// Because ctr is reused across iterations, all goroutines can end up
60+
// calling stopContainer on the last container in the list instead of
61+
// their respective one. We fix that by:
62+
stopCtr := ctr
63+
4064
errorGroup.Go(func() error {
41-
return s.stopContainer(ctx, ctr, stopTimeoutFromContext(ctx))
65+
return s.stopContainer(ctx, stopCtr, containerTimeout)
4266
})
4367
}
4468
if err := errorGroup.Wait(); err != nil {
4569
return fmt.Errorf("stop containers in parallel: %w", err)
4670
}
4771

4872
podInfraContainer := sb.InfraContainer()
49-
if err := s.stopContainer(ctx, podInfraContainer, stopTimeoutFromContext(ctx)); err != nil && !errors.Is(err, storage.ErrContainerUnknown) && !errors.Is(err, oci.ErrContainerStopped) {
50-
return fmt.Errorf("failed to stop infra container for pod sandbox %s: %v", sb.ID(), err)
73+
if err := s.stopContainer(ctx, podInfraContainer, totalTimeout); err != nil && !errors.Is(err, storage.ErrContainerUnknown) && !errors.Is(err, oci.ErrContainerStopped) {
74+
return fmt.Errorf("failed to stop infra container for pod sandbox %s: %w", sb.ID(), err)
5175
}
5276

5377
if err := sb.RemoveManagedNamespaces(); err != nil {

server/sandbox_stop_linux.go

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,15 +45,38 @@ func (s *Server) stopPodSandbox(ctx context.Context, sb *sandbox.Sandbox) error
4545
return nil
4646
}
4747

48+
// Calculate the timeout once. Regular containers get most of the timeout,
49+
// reserving a small amount for infra container shutdown. The infra container
50+
// will use the full totalTimeout, allowing it to use any time saved from
51+
// containers stopping earlier than their allocated timeout.
52+
totalTimeout := stopTimeoutFromContext(ctx)
53+
54+
const infraReservedTimeout int64 = 1 // 1 second reserved for infra container
55+
56+
var containerTimeout int64
57+
58+
if totalTimeout < infraReservedTimeout*2 {
59+
// If total timeout is too small, split evenly
60+
containerTimeout = totalTimeout / 2
61+
} else {
62+
// Reserve fixed time for infra, give rest to containers
63+
containerTimeout = totalTimeout - infraReservedTimeout
64+
}
65+
4866
errorGroup := &errgroup.Group{}
4967

5068
for _, ctr := range sb.Containers().List() {
5169
if ctr.State().Status == oci.ContainerStateStopped {
5270
continue
5371
}
5472

73+
// Because ctr is reused across iterations, all goroutines can end up
74+
// calling stopContainer on the last container in the list instead of
75+
// their respective one. We fix that by:
76+
stopCtr := ctr
77+
5578
errorGroup.Go(func() error {
56-
return s.stopContainer(ctx, ctr, stopTimeoutFromContext(ctx))
79+
return s.stopContainer(ctx, stopCtr, containerTimeout)
5780
})
5881
}
5982

@@ -62,7 +85,7 @@ func (s *Server) stopPodSandbox(ctx context.Context, sb *sandbox.Sandbox) error
6285
}
6386

6487
podInfraContainer := sb.InfraContainer()
65-
if err := s.stopContainer(ctx, podInfraContainer, stopTimeoutFromContext(ctx)); err != nil && !errors.Is(err, storage.ErrContainerUnknown) {
88+
if err := s.stopContainer(ctx, podInfraContainer, totalTimeout); err != nil && !errors.Is(err, storage.ErrContainerUnknown) && !errors.Is(err, oci.ErrContainerStopped) {
6689
return fmt.Errorf("failed to stop infra container for pod sandbox %s: %w", sb.ID(), err)
6790
}
6891

0 commit comments

Comments
 (0)