Skip to content

Commit 07ea24f

Browse files
Merge pull request #9647 from bitoku/exec-cpu-affinity
OCPBUGS-67014: Fix Exec CPU affinity doesn't work when CPU load balancing is disabled.
2 parents 9e979af + 68795ff commit 07ea24f

17 files changed

+1009
-237
lines changed

Makefile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,8 @@ mockgen: \
448448
mock-ocicni-types \
449449
mock-seccompociartifact-types \
450450
mock-ociartifact-types \
451-
mock-systemd
451+
mock-systemd \
452+
mock-cgmgr
452453

453454
.PHONY: mock-containereventserver
454455
mock-containereventserver: ${MOCKGEN}
@@ -492,6 +493,13 @@ mock-oci: ${MOCKGEN}
492493
-destination ${MOCK_PATH}/oci/oci.go \
493494
github.com/cri-o/cri-o/internal/oci RuntimeImpl
494495

496+
.PHONY: mock-cgmgr
497+
mock-cgmgr: ${MOCKGEN}
498+
${MOCKGEN} \
499+
-package cgmgr \
500+
-destination ${MOCK_PATH}/config/cgmgr/cgmgr.go \
501+
github.com/cri-o/cri-o/internal/config/cgmgr CgroupManager
502+
495503
.PHONY: mock-image-types
496504
mock-image-types: ${MOCKGEN}
497505
${BUILD_BIN_PATH}/mockgen \

internal/config/cgmgr/cgmgr_linux.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,17 @@ type CgroupManager interface {
8484
// It creates a new cgroup for that sandbox if it does not already exist.
8585
// It returns the cgroup stats for that sandbox.
8686
SandboxCgroupStats(sbParent, sbID string) (*CgroupStats, error)
87+
// ExecCgroupManager returns the cgroup manager for the exec cgroup used to place exec processes.
88+
// The cgroupPath parameter is the container's cgroup path from spec.Linux.CgroupsPath.
89+
// This is only supported on cgroup v2.
90+
ExecCgroupManager(cgroupPath string) (cgroups.Manager, error)
91+
// PodAndContainerCgroupManagers returns the libcontainer cgroup managers for both the pod and container cgroups.
92+
// The sbParent is the sandbox parent cgroup, and containerID is the container's ID.
93+
// It returns:
94+
// - podManager: the cgroup manager for the pod cgroup
95+
// - containerManagers: a slice of cgroup managers for the container cgroup(s).
96+
// This may include an extra manager if crun creates a sub-cgroup of the container.
97+
PodAndContainerCgroupManagers(sbParent, containerID string) (podManager cgroups.Manager, containerManagers []cgroups.Manager, err error)
8798
}
8899

89100
// New creates a new CgroupManager with defaults.
@@ -245,3 +256,77 @@ func removeSandboxCgroup(sbParent, containerCgroup string) error {
245256
func containerCgroupPath(id string) string {
246257
return CrioPrefix + "-" + id
247258
}
259+
260+
// LibctrManager creates a libcontainer cgroup manager for the given cgroup.
261+
// The cgroup parameter is the name of the cgroup, parent is the parent path,
262+
// and systemd indicates whether to use systemd cgroup driver.
263+
func LibctrManager(cgroup, parent string, systemd bool) (cgroups.Manager, error) {
264+
if systemd {
265+
parent = filepath.Base(parent)
266+
if parent == "." {
267+
// libcontainer shorthand for root
268+
// see https://github.com/opencontainers/runc/blob/9fffadae8/libcontainer/cgroups/systemd/common.go#L71
269+
parent = "-.slice"
270+
}
271+
}
272+
273+
cg := &cgroups.Cgroup{
274+
Name: cgroup,
275+
Parent: parent,
276+
Resources: &cgroups.Resources{
277+
SkipDevices: true,
278+
},
279+
Systemd: systemd,
280+
// If the cgroup manager is systemd, then libcontainer
281+
// will construct the cgroup path (for scopes) as:
282+
// ScopePrefix-Name.scope. For slices, and for cgroupfs manager,
283+
// this will be ignored.
284+
// See: https://github.com/opencontainers/runc/tree/main/libcontainer/cgroups/systemd/common.go:getUnitName
285+
ScopePrefix: CrioPrefix,
286+
}
287+
288+
return manager.New(cg)
289+
}
290+
291+
// crunContainerCgroupManager returns the cgroup manager for the actual container cgroup.
292+
// Some runtimes like crun create a sub-cgroup of the container to do the actual management,
293+
// to enforce systemd's single owner rule. This function checks for and handles that case.
294+
// If no sub-cgroup exists, it returns nil, nil.
295+
func crunContainerCgroupManager(expectedContainerCgroup string) (cgroups.Manager, error) {
296+
// HACK: There isn't really a better way to check if the actual container cgroup is in a child cgroup of the expected.
297+
// We could check /proc/$pid/cgroup, but we need to be able to query this after the container exits and the process is gone.
298+
// We know the source of this: crun creates a sub cgroup of the container to do the actual management, to enforce systemd's single
299+
// owner rule. Thus, we need to hardcode this check.
300+
actualContainerCgroup := filepath.Join(expectedContainerCgroup, "container")
301+
// Choose cpuset as the cgroup to check, with little reason.
302+
cgroupRoot := CgroupMemoryPathV2
303+
if !node.CgroupIsV2() {
304+
cgroupRoot += "/cpuset"
305+
}
306+
307+
// Normalize the path so that we don't add duplicate prefix.
308+
cgroupPath := filepath.Join(cgroupRoot, strings.TrimPrefix(actualContainerCgroup, cgroupRoot))
309+
if _, err := os.Stat(cgroupPath); err != nil {
310+
return nil, nil
311+
}
312+
// must be crun, make another LibctrManager. Regardless of cgroup driver, it will be treated as cgroupfs
313+
return LibctrManager(filepath.Base(actualContainerCgroup), filepath.Dir(actualContainerCgroup), false)
314+
}
315+
316+
// execCgroupManager creates an exec cgroup for placing exec processes.
317+
// containerCgroupAbsPath is the absolute path to the container's cgroup (without /sys/fs/cgroup prefix).
318+
// Returns the cgroup manager for the exec cgroup.
319+
//
320+
// The exec cgroup location depends on whether crun created a "container" child cgroup:
321+
// - If crun's "container" child exists: exec cgroup is created under it
322+
// - Otherwise: exec cgroup is created directly under the container cgroup
323+
func execCgroupManager(containerCgroupAbsPath string) (cgroups.Manager, error) {
324+
execCgroupParent := containerCgroupAbsPath
325+
326+
// Check if crun created a "container" child cgroup
327+
if mgr, err := crunContainerCgroupManager(containerCgroupAbsPath); err == nil && mgr != nil {
328+
execCgroupParent = filepath.Join(containerCgroupAbsPath, "container")
329+
}
330+
331+
return LibctrManager("exec", execCgroupParent, false)
332+
}

internal/config/cgmgr/cgroupfs_linux.go

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
package cgmgr
44

55
import (
6+
"errors"
67
"fmt"
78
"path"
89
"path/filepath"
@@ -79,7 +80,7 @@ func (m *CgroupfsManager) ContainerCgroupManager(sbParent, containerID string) (
7980
return nil, err
8081
}
8182

82-
cgMgr, err := libctrManager(filepath.Base(cgPath), filepath.Dir(cgPath), false)
83+
cgMgr, err := LibctrManager(filepath.Base(cgPath), filepath.Dir(cgPath), false)
8384
if err != nil {
8485
return nil, err
8586
}
@@ -146,7 +147,7 @@ func (m *CgroupfsManager) SandboxCgroupManager(sbParent, sbID string) (cgroups.M
146147
return nil, err
147148
}
148149

149-
cgMgr, err := libctrManager(filepath.Base(cgPath), filepath.Dir(cgPath), false)
150+
cgMgr, err := LibctrManager(filepath.Base(cgPath), filepath.Dir(cgPath), false)
150151
if err != nil {
151152
return nil, err
152153
}
@@ -256,3 +257,53 @@ func (m *CgroupfsManager) RemoveSandboxCgroup(sbParent, containerID string) erro
256257
// https://github.com/opencontainers/runc/blob/fd5debf3aa/libcontainer/cgroups/fs/paths.go#L156
257258
return removeSandboxCgroup(filepath.Join("/", sbParent), containerCgroupPath(containerID))
258259
}
260+
261+
// PodAndContainerCgroupManagers returns the libcontainer cgroup managers for both the pod and container cgroups.
262+
// The sbParent is the sandbox parent cgroup, and containerID is the container's ID.
263+
func (m *CgroupfsManager) PodAndContainerCgroupManagers(sbParent, containerID string) (podManager cgroups.Manager, containerManagers []cgroups.Manager, _ error) {
264+
containerCgroupFullPath, err := m.ContainerCgroupAbsolutePath(sbParent, containerID)
265+
if err != nil {
266+
return nil, nil, err
267+
}
268+
269+
podCgroupFullPath := filepath.Dir(containerCgroupFullPath)
270+
271+
podManager, err = LibctrManager(filepath.Base(podCgroupFullPath), filepath.Dir(podCgroupFullPath), false)
272+
if err != nil {
273+
return nil, nil, err
274+
}
275+
276+
containerManager, err := LibctrManager(filepath.Base(containerCgroupFullPath), filepath.Dir(containerCgroupFullPath), false)
277+
if err != nil {
278+
return nil, nil, err
279+
}
280+
281+
containerManagers = []cgroups.Manager{containerManager}
282+
283+
// crun actually does the cgroup configuration in a child of the cgroup CRI-O expects to be the container's
284+
extraManager, err := crunContainerCgroupManager(containerCgroupFullPath)
285+
if err != nil {
286+
return nil, nil, err
287+
}
288+
289+
if extraManager != nil {
290+
containerManagers = append(containerManagers, extraManager)
291+
}
292+
293+
return podManager, containerManagers, nil
294+
}
295+
296+
// ExecCgroupManager returns the cgroup manager for the exec cgroup used to place exec processes.
297+
// For cgroupfs, the cgroupPath is a direct filesystem path.
298+
// This is only supported on cgroup v2.
299+
func (m *CgroupfsManager) ExecCgroupManager(cgroupPath string) (cgroups.Manager, error) {
300+
if cgroupPath == "" {
301+
return nil, errors.New("container cgroup path is empty")
302+
}
303+
304+
if !node.CgroupIsV2() {
305+
return nil, errors.New("exec cgroup with CgroupFD is only supported on cgroup v2")
306+
}
307+
308+
return execCgroupManager(cgroupPath)
309+
}

internal/config/cgmgr/stats_linux.go

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,12 @@ import (
55
"math"
66
"os"
77
"path"
8-
"path/filepath"
98
"strconv"
109
"strings"
1110
"syscall"
1211
"time"
1312

1413
"github.com/opencontainers/cgroups"
15-
"github.com/opencontainers/cgroups/manager"
1614

1715
"github.com/cri-o/cri-o/internal/config/node"
1816
"github.com/cri-o/cri-o/internal/log"
@@ -133,34 +131,6 @@ func MemLimitGivenSystem(cgroupLimit uint64) uint64 {
133131
return cgroupLimit
134132
}
135133

136-
func libctrManager(cgroup, parent string, systemd bool) (cgroups.Manager, error) {
137-
if systemd {
138-
parent = filepath.Base(parent)
139-
if parent == "." {
140-
// libcontainer shorthand for root
141-
// see https://github.com/opencontainers/runc/blob/9fffadae8/libcontainer/cgroups/systemd/common.go#L71
142-
parent = "-.slice"
143-
}
144-
}
145-
146-
cg := &cgroups.Cgroup{
147-
Name: cgroup,
148-
Parent: parent,
149-
Resources: &cgroups.Resources{
150-
SkipDevices: true,
151-
},
152-
Systemd: systemd,
153-
// If the cgroup manager is systemd, then libcontainer
154-
// will construct the cgroup path (for scopes) as:
155-
// ScopePrefix-Name.scope. For slices, and for cgroupfs manager,
156-
// this will be ignored.
157-
// See: https://github.com/opencontainers/runc/tree/main/libcontainer/cgroups/systemd/common.go:getUnitName
158-
ScopePrefix: CrioPrefix,
159-
}
160-
161-
return manager.New(cg)
162-
}
163-
164134
func statsFromLibctrMgr(cgMgr cgroups.Manager) (*CgroupStats, error) {
165135
stats, err := cgMgr.GetStats()
166136
if err != nil {

internal/config/cgmgr/systemd_linux.go

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
package cgmgr
44

55
import (
6+
"errors"
67
"fmt"
78
"path"
89
"path/filepath"
@@ -111,7 +112,7 @@ func (m *SystemdManager) ContainerCgroupManager(sbParent, containerID string) (c
111112
return nil, err
112113
}
113114
// Due to a quirk of libcontainer's cgroup driver, cgroup name = containerID
114-
cgMgr, err := libctrManager(containerID, filepath.Dir(cgPath), true)
115+
cgMgr, err := LibctrManager(containerID, filepath.Dir(cgPath), true)
115116
if err != nil {
116117
return nil, err
117118
}
@@ -257,7 +258,7 @@ func (m *SystemdManager) SandboxCgroupManager(sbParent, sbID string) (cgroups.Ma
257258
return nil, err
258259
}
259260

260-
cgMgr, err := libctrManager(filepath.Base(cgPath), filepath.Dir(cgPath), true)
261+
cgMgr, err := LibctrManager(filepath.Base(cgPath), filepath.Dir(cgPath), true)
261262
if err != nil {
262263
return nil, err
263264
}
@@ -355,3 +356,73 @@ func (m *SystemdManager) RemoveSandboxCgroup(sbParent, containerID string) error
355356

356357
return removeSandboxCgroup(expandedParent, containerCgroupPath(containerID))
357358
}
359+
360+
// PodAndContainerCgroupManagers returns the libcontainer cgroup managers for both the pod and container cgroups.
361+
// The sbParent is the sandbox parent cgroup, and containerID is the container's ID.
362+
func (m *SystemdManager) PodAndContainerCgroupManagers(sbParent, containerID string) (podManager cgroups.Manager, containerManagers []cgroups.Manager, _ error) {
363+
containerCgroupFullPath, err := m.ContainerCgroupAbsolutePath(sbParent, containerID)
364+
if err != nil {
365+
return nil, nil, err
366+
}
367+
368+
podCgroupFullPath := filepath.Dir(containerCgroupFullPath)
369+
370+
podManager, err = LibctrManager(filepath.Base(podCgroupFullPath), filepath.Dir(podCgroupFullPath), true)
371+
if err != nil {
372+
return nil, nil, err
373+
}
374+
375+
// The first argument should be container ID, otherwise it adds duplicate prefix/suffix.
376+
containerManager, err := LibctrManager(containerID, filepath.Dir(containerCgroupFullPath), true)
377+
if err != nil {
378+
return nil, nil, err
379+
}
380+
381+
containerManagers = []cgroups.Manager{containerManager}
382+
383+
// crun actually does the cgroup configuration in a child of the cgroup CRI-O expects to be the container's
384+
extraManager, err := crunContainerCgroupManager(containerCgroupFullPath)
385+
if err != nil {
386+
return nil, nil, err
387+
}
388+
389+
if extraManager != nil {
390+
containerManagers = append(containerManagers, extraManager)
391+
}
392+
393+
return podManager, containerManagers, nil
394+
}
395+
396+
// ExecCgroupManager returns the cgroup manager for the exec cgroup used to place exec processes.
397+
// For systemd, the cgroupPath is in the format "slice:prefix:containerID".
398+
// This is only supported on cgroup v2.
399+
func (m *SystemdManager) ExecCgroupManager(cgroupPath string) (cgroups.Manager, error) {
400+
if cgroupPath == "" {
401+
return nil, errors.New("container cgroup path is empty")
402+
}
403+
404+
if !node.CgroupIsV2() {
405+
return nil, errors.New("exec cgroup with CgroupFD is only supported on cgroup v2")
406+
}
407+
408+
// Parse systemd format: slice:prefix:containerID
409+
parts := strings.Split(cgroupPath, ":")
410+
if len(parts) != 3 {
411+
return nil, fmt.Errorf("invalid systemd cgroup path format: %s (expected slice:prefix:containerID)", cgroupPath)
412+
}
413+
414+
slice := parts[0]
415+
prefix := parts[1]
416+
containerID := parts[2]
417+
418+
expandedSlice, err := systemd.ExpandSlice(slice)
419+
if err != nil {
420+
return nil, fmt.Errorf("failed to expand systemd slice %q: %w", slice, err)
421+
}
422+
423+
// The container cgroup is a scope under the expanded slice
424+
// Format: <expanded-slice>/<prefix>-<containerID>.scope
425+
containerCgroupAbsPath := filepath.Join(expandedSlice, prefix+"-"+containerID+".scope")
426+
427+
return execCgroupManager(containerCgroupAbsPath)
428+
}

internal/oci/container.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,10 @@ type Container struct {
8989
// To avoid race condition, it must be used with monitorProcessLock.
9090
monitorProcess *os.Process
9191
monitorProcessLock sync.Mutex
92+
// execCgroupPath is the absolute path to the pre-created exec cgroup.
93+
// When set, the exec process will spawn on this cgroup.
94+
// If this is used, InfraCtrCPUSet will be ignored for the exec operation.
95+
execCgroupPath string
9296
}
9397

9498
func (c *Container) CRIAttributes() *types.ContainerAttributes {
@@ -931,6 +935,16 @@ func (c *Container) RuntimeUser() *types.ContainerUser {
931935
return c.runtimeUser
932936
}
933937

938+
// SetExecCgroupPath sets the pre-created exec cgroup path.
939+
func (c *Container) SetExecCgroupPath(path string) {
940+
c.execCgroupPath = path
941+
}
942+
943+
// ExecCgroupPath returns the pre-created exec cgroup path, or empty string if not set.
944+
func (c *Container) ExecCgroupPath() string {
945+
return c.execCgroupPath
946+
}
947+
934948
// SetMonitorProcess loads the container monitor process from the ContainerMonitorProcess field.
935949
// It doesn't return any error so that we can continue to load the container even if the monitor process
936950
// is not found.

internal/oci/oci_unsupported.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package oci
55
import (
66
"context"
77
"os"
8+
"os/exec"
89
"syscall"
910

1011
types "k8s.io/cri-api/pkg/apis/runtime/v1"
@@ -40,3 +41,7 @@ func (c *Container) SetSeccompProfilePath(pp string) {
4041
func (c *Container) SeccompProfilePath() string {
4142
return ""
4243
}
44+
45+
// setSysProcAttr is a no-op on non-Linux platforms.
46+
func setSysProcAttr(_ *exec.Cmd, _ uintptr) {
47+
}

0 commit comments

Comments
 (0)