Skip to content

Commit cd5e6b2

Browse files
committed
sandbox: use created/stopped instead of infra container for readiness
As we hit cases of deadlock when a sandbox is stuck stopping Signed-off-by: Peter Hunt <[email protected]>
1 parent 9607a04 commit cd5e6b2

File tree

7 files changed

+57
-33
lines changed

7 files changed

+57
-33
lines changed

internal/lib/container_server.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,11 @@ func (c *ContainerServer) LoadSandbox(ctx context.Context, id string) (sb *sandb
375375
}
376376

377377
sb.SetCreated()
378+
379+
if scontainer.State().Status == oci.ContainerStateStopped {
380+
sb.SetStopped(ctx, true)
381+
}
382+
378383
if err := label.ReserveLabel(processLabel); err != nil {
379384
return sb, err
380385
}

internal/lib/sandbox/sandbox.go

Lines changed: 40 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -59,15 +59,18 @@ type Sandbox struct {
5959
nsOpts *types.NamespaceOption
6060
dnsConfig *types.DNSConfig
6161
stopMutex sync.RWMutex
62-
created bool
63-
stopped bool
64-
networkStopped bool
65-
privileged bool
66-
hostNetwork bool
67-
usernsMode string
68-
containerEnvPath string
69-
podLinuxOverhead *types.LinuxContainerResources
70-
podLinuxResources *types.LinuxContainerResources
62+
// stateMutex protects the use of created, stopped and networkStopped bools
63+
// which are all fields that can change at runtime
64+
stateMutex sync.RWMutex
65+
created bool
66+
stopped bool
67+
networkStopped bool
68+
privileged bool
69+
hostNetwork bool
70+
usernsMode string
71+
containerEnvPath string
72+
podLinuxOverhead *types.LinuxContainerResources
73+
podLinuxResources *types.LinuxContainerResources
7174
}
7275

7376
// DefaultShmSize is the default shm size.
@@ -326,6 +329,10 @@ func (s *Sandbox) RemoveInfraContainer() {
326329
func (s *Sandbox) SetStopped(ctx context.Context, createFile bool) {
327330
ctx, span := log.StartSpan(ctx)
328331
defer span.End()
332+
333+
s.stateMutex.Lock()
334+
defer s.stateMutex.Unlock()
335+
329336
if s.stopped {
330337
return
331338
}
@@ -340,16 +347,24 @@ func (s *Sandbox) SetStopped(ctx context.Context, createFile bool) {
340347
// Stopped returns whether the sandbox state has been
341348
// set to stopped.
342349
func (s *Sandbox) Stopped() bool {
350+
s.stateMutex.RLock()
351+
defer s.stateMutex.RUnlock()
352+
343353
return s.stopped
344354
}
345355

346356
// SetCreated sets the created status of sandbox to true.
347357
func (s *Sandbox) SetCreated() {
358+
s.stateMutex.Lock()
359+
defer s.stateMutex.Unlock()
348360
s.created = true
349361
}
350362

351363
// NetworkStopped returns whether the network has been stopped.
352364
func (s *Sandbox) NetworkStopped() bool {
365+
s.stateMutex.RLock()
366+
defer s.stateMutex.RUnlock()
367+
353368
return s.networkStopped
354369
}
355370

@@ -363,6 +378,10 @@ func (s *Sandbox) NetworkStopped() bool {
363378
func (s *Sandbox) SetNetworkStopped(ctx context.Context, createFile bool) error {
364379
ctx, span := log.StartSpan(ctx)
365380
defer span.End()
381+
382+
s.stateMutex.Lock()
383+
defer s.stateMutex.Unlock()
384+
366385
if s.networkStopped {
367386
return nil
368387
}
@@ -394,6 +413,7 @@ func (s *Sandbox) SetContainerEnvFile(ctx context.Context) error {
394413
return nil
395414
}
396415

416+
// This function assumes the state lock has been taken for this sandbox.
397417
func (s *Sandbox) createFileInInfraDir(ctx context.Context, filename string) error {
398418
// If the sandbox is not yet created,
399419
// this function is being called when
@@ -419,6 +439,9 @@ func (s *Sandbox) createFileInInfraDir(ctx context.Context, filename string) err
419439
}
420440

421441
func (s *Sandbox) RestoreStopped() {
442+
s.stateMutex.Lock()
443+
defer s.stateMutex.Unlock()
444+
422445
if s.fileExistsInInfraDir(sbStoppedFilename) {
423446
s.stopped = true
424447
}
@@ -441,11 +464,14 @@ func (s *Sandbox) fileExistsInInfraDir(filename string) bool {
441464

442465
// Created returns the created status of sandbox.
443466
func (s *Sandbox) Created() bool {
467+
s.stateMutex.RLock()
468+
defer s.stateMutex.RUnlock()
469+
444470
return s.created
445471
}
446472

447473
func (s *Sandbox) State() types.PodSandboxState {
448-
if s.Ready(false) {
474+
if s.Ready() {
449475
return types.PodSandboxState_SANDBOX_READY
450476
}
451477
return types.PodSandboxState_SANDBOX_NOTREADY
@@ -456,22 +482,9 @@ func (s *Sandbox) State() types.PodSandboxState {
456482
// `takeLock` should be set if we need to take the lock to get the infra container's state.
457483
// If there is no infra container, it is never considered ready.
458484
// If the infra container is spoofed, the pod is considered ready when it has been created, but not stopped.
459-
func (s *Sandbox) Ready(takeLock bool) bool {
460-
podInfraContainer := s.InfraContainer()
461-
if podInfraContainer == nil {
462-
return false
463-
}
464-
if podInfraContainer.Spoofed() {
465-
return s.created && !s.stopped
466-
}
467-
// Assume the sandbox is ready, unless it has an infra container that
468-
// isn't running
469-
var cState *oci.ContainerState
470-
if takeLock {
471-
cState = podInfraContainer.State()
472-
} else {
473-
cState = podInfraContainer.StateNoLock()
474-
}
485+
func (s *Sandbox) Ready() bool {
486+
s.stateMutex.RLock()
487+
defer s.stateMutex.RUnlock()
475488

476-
return cState.Status == oci.ContainerStateRunning
489+
return s.created && !s.stopped
477490
}

internal/lib/sandbox/sandbox_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ var _ = t.Describe("Sandbox", func() {
136136

137137
// Then
138138
Expect(testSandbox.Stopped()).To(BeTrue())
139+
Expect(testSandbox.Ready()).To(BeFalse())
139140
})
140141
})
141142

@@ -181,6 +182,7 @@ var _ = t.Describe("Sandbox", func() {
181182

182183
// Then
183184
Expect(testSandbox.Created()).To(BeTrue())
185+
Expect(testSandbox.Ready()).To(BeTrue())
184186
})
185187
})
186188

server/container_portforward.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ func (s *StreamService) PortForward(ctx context.Context, podSandboxID string, po
5050
return fmt.Errorf("could not find sandbox %s", podSandboxID)
5151
}
5252

53-
if !sb.Ready(true) {
53+
if !sb.Ready() {
5454
return fmt.Errorf("sandbox %s is not running", podSandboxID)
5555
}
5656

server/sandbox_list_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,9 @@ var _ = t.Describe("ListPodSandbox", func() {
9696
// Given
9797
mockDirs(testManifest)
9898
createDummyState()
99-
_, err := sut.LoadSandbox(context.Background(), sandboxID)
99+
sb, err := sut.LoadSandbox(context.Background(), sandboxID)
100100
Expect(err).ToNot(HaveOccurred())
101+
sb.SetStopped(context.Background(), false)
101102

102103
// When
103104
response, err := sut.ListPodSandbox(context.Background(),

server/sandbox_status.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,7 @@ func (s *Server) PodSandboxStatus(ctx context.Context, req *types.PodSandboxStat
2424
return nil, status.Errorf(codes.NotFound, "could not find pod %q: %v", req.PodSandboxId, err)
2525
}
2626

27-
rStatus := types.PodSandboxState_SANDBOX_NOTREADY
28-
if sb.Ready(true) {
29-
rStatus = types.PodSandboxState_SANDBOX_READY
30-
}
27+
rStatus := sb.State()
3128

3229
var linux *types.LinuxPodSandboxStatus
3330
if sb.NamespaceOptions() != nil {

server/server.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -792,6 +792,12 @@ func (s *Server) handleExit(ctx context.Context, event fsnotify.Event) {
792792
}
793793
c = sb.InfraContainer()
794794
resource = "sandbox infra"
795+
// We discovered the infra container stopped (potentially unexpectedly).
796+
// Since sandboxes status is now being judged by the sb.stopped boolean,
797+
// rather than the infra container's status, we have to manually set stopped here.
798+
// It's likely we're doing double the work here, but that's better than missing it
799+
// if the infra container crashed.
800+
sb.SetStopped(ctx, true)
795801
} else {
796802
sb = s.GetSandbox(c.Sandbox())
797803
}

0 commit comments

Comments
 (0)