Skip to content

Commit 103e03f

Browse files
committed
fix deadlock when the container is in uninterruptible sleep
Signed-off-by: Ayato Tokubi <[email protected]>
(cherry picked from commit 1e751b4)
Signed-off-by: Ayato Tokubi <[email protected]>

# Conflicts:
#	internal/oci/container.go
#	internal/oci/runtime_oci.go
1 parent ce48a32 commit 103e03f

File tree

2 files changed

+40
-18
lines changed

2 files changed

+40
-18
lines changed

internal/oci/container.go

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ type Container struct {
6969
stopLock sync.Mutex
7070
stopTimeoutChan chan int64
7171
stopWatchers []chan struct{}
72+
stopKillLoopBegun bool
7273
pidns nsmgr.Namespace
7374
restore bool
7475
restoreArchivePath string
@@ -155,22 +156,23 @@ func NewContainer(id, name, bundlePath, logPath string, labels, crioAnnotations,
155156
ImageRef: externalImageRef,
156157
ImageId: imageIDString,
157158
},
158-
name: name,
159-
bundlePath: bundlePath,
160-
logPath: logPath,
161-
terminal: terminal,
162-
stdin: stdin,
163-
stdinOnce: stdinOnce,
164-
runtimeHandler: runtimeHandler,
165-
crioAnnotations: crioAnnotations,
166-
imageName: imageName,
167-
imageID: imageID,
168-
dir: dir,
169-
state: state,
170-
stopSignal: stopSignal,
171-
stopTimeoutChan: make(chan int64, 10),
172-
stopWatchers: []chan struct{}{},
173-
execPIDs: map[int]bool{},
159+
name: name,
160+
bundlePath: bundlePath,
161+
logPath: logPath,
162+
terminal: terminal,
163+
stdin: stdin,
164+
stdinOnce: stdinOnce,
165+
runtimeHandler: runtimeHandler,
166+
crioAnnotations: crioAnnotations,
167+
imageName: imageName,
168+
imageID: imageID,
169+
dir: dir,
170+
state: state,
171+
stopSignal: stopSignal,
172+
stopTimeoutChan: make(chan int64, 10),
173+
stopWatchers: []chan struct{}{},
174+
stopKillLoopBegun: false,
175+
execPIDs: map[int]bool{},
174176
}
175177
return c, nil
176178
}
@@ -611,14 +613,30 @@ func (c *Container) SetAsStopping() (setToStopping bool) {
611613
return false
612614
}
613615

616+
// SetStopKillLoopBegun sets the stopKillLoopBegun flag to true.
617+
func (c *Container) SetStopKillLoopBegun() {
618+
c.stopLock.Lock()
619+
defer c.stopLock.Unlock()
620+
c.stopKillLoopBegun = true
621+
}
622+
614623
func (c *Container) WaitOnStopTimeout(ctx context.Context, timeout int64) {
615624
c.stopLock.Lock()
616625
if !c.stopping {
617626
c.stopLock.Unlock()
618627
return
619628
}
620629

621-
c.stopTimeoutChan <- timeout
630+
// Don't use the stopTimeoutChan when the container is in the kill loop
631+
// because values in the channel are no longer consumed.
632+
if !c.stopKillLoopBegun {
633+
// Use select with a default case so the send does not block when the stopTimeoutChan is full.
634+
// The channel is very unlikely to be full, but it could happen in theory.
635+
select {
636+
case c.stopTimeoutChan <- timeout:
637+
default:
638+
}
639+
}
622640

623641
watcher := make(chan struct{}, 1)
624642
c.stopWatchers = append(c.stopWatchers, watcher)

internal/oci/runtime_oci.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -949,7 +949,9 @@ func (r *runtimeOCI) StopLoopForContainer(c *Container, bm kwait.BackoffManager)
949949
}
950950

951951
case <-time.After(time.Until(targetTime)):
952-
log.Warnf(ctx, "Stopping container %s with stop signal timed out. Killing...", c.ID())
952+
log.Warnf(ctx, "Stopping container %s with stop signal(%s) timed out. Killing...", c.ID(), c.GetStopSignal())
953+
c.SetStopKillLoopBegun()
954+
953955
goto killContainer
954956

955957
case <-done:
@@ -971,9 +973,11 @@ killContainer:
971973
}
972974

973975
if err := c.Living(); err != nil {
976+
log.Debugf(ctx, "Container is no longer alive")
974977
stop()
975978
return
976979
}
980+
log.Debugf(ctx, "Killing failed for some reasons, retrying...")
977981
// Reschedule the timer so that the periodic reminder can continue.
978982
blockedTimer.Reset(stopProcessBlockedInterval)
979983
}, bm, true, ctx.Done())

0 commit comments

Comments
 (0)