Skip to content

Commit 6bda4d5

Browse files
Merge pull request #9331 from bitoku/release-1.29
OCPBUGS-58509: [release-1.29] fix deadlock when the container is in uninterruptible sleep
2 parents 6cc30d2 + 4da0e47 commit 6bda4d5

File tree

2 files changed

+38
-17
lines changed

2 files changed

+38
-17
lines changed

internal/oci/container.go

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ type Container struct {
6868
stopLock sync.Mutex
6969
stopTimeoutChan chan int64
7070
stopWatchers []chan struct{}
71+
stopKillLoopBegun bool
7172
pidns nsmgr.Namespace
7273
restore bool
7374
restoreArchivePath string
@@ -145,22 +146,23 @@ func NewContainer(id, name, bundlePath, logPath string, labels, crioAnnotations,
145146
},
146147
ImageRef: externalImageRef,
147148
},
148-
name: name,
149-
bundlePath: bundlePath,
150-
logPath: logPath,
151-
terminal: terminal,
152-
stdin: stdin,
153-
stdinOnce: stdinOnce,
154-
runtimeHandler: runtimeHandler,
155-
crioAnnotations: crioAnnotations,
156-
imageName: imageName,
157-
imageID: imageID,
158-
dir: dir,
159-
state: state,
160-
stopSignal: stopSignal,
161-
stopTimeoutChan: make(chan int64, 10),
162-
stopWatchers: []chan struct{}{},
163-
execPIDs: map[int]bool{},
149+
name: name,
150+
bundlePath: bundlePath,
151+
logPath: logPath,
152+
terminal: terminal,
153+
stdin: stdin,
154+
stdinOnce: stdinOnce,
155+
runtimeHandler: runtimeHandler,
156+
crioAnnotations: crioAnnotations,
157+
imageName: imageName,
158+
imageID: imageID,
159+
dir: dir,
160+
state: state,
161+
stopSignal: stopSignal,
162+
stopTimeoutChan: make(chan int64, 10),
163+
stopWatchers: []chan struct{}{},
164+
stopKillLoopBegun: false,
165+
execPIDs: map[int]bool{},
164166
}
165167
return c, nil
166168
}
@@ -600,14 +602,30 @@ func (c *Container) SetAsStopping() (setToStopping bool) {
600602
return false
601603
}
602604

605+
// SetStopKillLoopBegun sets the stopKillLoopBegun flag to true.
//
// StopLoopForContainer calls this once stopping with the stop signal has
// timed out, right before it jumps to its killContainer section.
// The flag is written while holding stopLock, the same lock that
// WaitOnStopTimeout holds when it reads the flag to decide whether it may
// still send a timeout on stopTimeoutChan.
606+
func (c *Container) SetStopKillLoopBegun() {
607+
c.stopLock.Lock()
608+
defer c.stopLock.Unlock()
609+
c.stopKillLoopBegun = true
610+
}
611+
603612
func (c *Container) WaitOnStopTimeout(ctx context.Context, timeout int64) {
604613
c.stopLock.Lock()
605614
if !c.stopping {
606615
c.stopLock.Unlock()
607616
return
608617
}
609618

610-
c.stopTimeoutChan <- timeout
619+
// Don't use the stopTimeoutChan when the container is in the kill loop
620+
// because values in the channel are no longer consumed.
621+
if !c.stopKillLoopBegun {
622+
// Use select with a default case so the send does not block when the stopTimeoutChan is full.
623+
// The channel is very unlikely to be full, but it could happen in theory.
624+
select {
625+
case c.stopTimeoutChan <- timeout:
626+
default:
627+
}
628+
}
611629

612630
watcher := make(chan struct{}, 1)
613631
c.stopWatchers = append(c.stopWatchers, watcher)

internal/oci/runtime_oci.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -948,6 +948,7 @@ func (r *runtimeOCI) StopLoopForContainer(c *Container, bm kwait.BackoffManager)
948948

949949
case <-time.After(time.Until(targetTime)):
950950
log.Warnf(ctx, "Stopping container %s with stop signal timed out. Killing...", c.ID())
951+
c.SetStopKillLoopBegun()
951952
goto killContainer
952953

953954
case <-done:
@@ -968,9 +969,11 @@ killContainer:
968969
}
969970

970971
if err := c.Living(); err != nil {
972+
log.Debugf(ctx, "Container is no longer alive")
971973
stop()
972974
return
973975
}
976+
log.Debugf(ctx, "Killing failed for some reasons, retrying...")
974977
// Reschedule the timer so that the periodic reminder can continue.
975978
blockedTimer.Reset(stopProcessBlockedInterval)
976979
}, bm, true, ctx.Done())

0 commit comments

Comments
 (0)