Skip to content

Commit

Permalink
runc exec: implement CPU affinity
Browse files Browse the repository at this point in the history
As per
- opencontainers/runtime-spec#1253
- opencontainers/runtime-spec#1261

CPU affinity can be set in two ways:
1. When creating/starting a container, in config.json's
   Process.ExecCPUAffinity, which is then applied to all execs.
2. When running an exec, in process.json's CPUAffinity, which
   is applied to a given exec and overrides the value from (1).

Add some basic tests.

Note that older kernels (RHEL8, Ubuntu 20.04) change CPU affinity of a
process to that of a container's cgroup, as soon as it is moved to that
cgroup, while newer kernels (Ubuntu 24.04, Fedora 41) don't do that.

Because of the above,
 - it's impossible to really test initial CPU affinity without adding
   debug logging to libcontainer/nsenter;
 - for older kernels, there can be a brief moment when exec's affinity
   is different than either initial or final affinity being set;
 - exec's final CPU affinity, if not specified, can be different
   depending on the kernel, therefore we don't test it.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
  • Loading branch information
kolyshkin committed Jan 15, 2025
1 parent 6c749bb commit 673d846
Show file tree
Hide file tree
Showing 11 changed files with 278 additions and 7 deletions.
72 changes: 72 additions & 0 deletions libcontainer/configs/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@ package configs
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"os/exec"
"strconv"
"strings"
"time"

"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -225,6 +228,9 @@ type Config struct {

// IOPriority is the container's I/O priority.
IOPriority *IOPriority `json:"io_priority,omitempty"`

// ExecCPUAffinity is CPU affinity for a non-init process to be run in the container.
ExecCPUAffinity *CPUAffinity `json:"exec_cpu_affinity,omitempty"`
}

// Scheduler is based on the Linux sched_setattr(2) syscall.
Expand Down Expand Up @@ -288,6 +294,72 @@ func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) {

type IOPriority = specs.LinuxIOPriority

// CPUAffinity describes the CPU affinity of a process executed inside
// a container. Either field may be nil, meaning "not specified".
type CPUAffinity struct {
	// Initial is the set of CPUs the process is started on.
	Initial *unix.CPUSet
	// Final is the set of CPUs assigned to the process once it has
	// been moved into the container's cgroup.
	Final *unix.CPUSet
}

// toCPUSet parses a CPU list such as "0-3,7" into a [unix.CPUSet].
//
// The list is a comma-separated sequence of CPU numbers and inclusive
// "first-last" ranges. Spaces around elements and empty elements (extra
// commas) are tolerated. An empty string means "not specified" and
// results in (nil, nil).
//
// An error is returned when an element is not a valid number, a range is
// reversed, a CPU number does not fit into unix.CPUSet, or the list does
// not name a single CPU.
func toCPUSet(str string) (*unix.CPUSet, error) {
	if str == "" {
		return nil, nil
	}
	s := new(unix.CPUSet)

	// add parses a single CPU number, adds it to the set, and returns it.
	add := func(v string) (int, error) {
		// A bit size of 31 guarantees the value converts to a
		// non-negative int on every platform.
		n, err := strconv.ParseUint(v, 10, 31)
		if err != nil {
			return 0, err
		}
		cpu := int(n)
		// Set silently ignores CPU numbers beyond the capacity of
		// the set, so read the bit back to detect such values.
		s.Set(cpu)
		if !s.IsSet(cpu) {
			return 0, fmt.Errorf("CPU number %d is too large", cpu)
		}
		return cpu, nil
	}

	for _, r := range strings.Split(str, ",") {
		// Allow extra spaces around.
		r = strings.TrimSpace(r)
		// Allow empty elements (extra commas).
		if r == "" {
			continue
		}
		// For a single number, first holds the whole element.
		first, last, isRange := strings.Cut(r, "-")
		start, err := add(first)
		if err != nil {
			return nil, err
		}
		if !isRange {
			continue
		}
		end, err := add(last)
		if err != nil {
			return nil, err
		}
		if start > end {
			return nil, errors.New("invalid range: " + r)
		}
		// Both ends are already set and known to fit, so the loop
		// is bounded by the capacity of the set.
		for i := start + 1; i < end; i++ {
			s.Set(i)
		}
	}

	// A list made only of separators would produce an empty set,
	// which is not a usable affinity mask.
	if s.Count() == 0 {
		return nil, fmt.Errorf("no CPUs found in %q", str)
	}

	return s, nil
}

// ConvertCPUAffinity converts [specs.CPUAffinity] to [CPUAffinity].
//
// It returns (nil, nil) when sa is nil or when neither the initial nor
// the final CPU list is specified. A list that fails to parse results
// in an error naming the offending field.
func ConvertCPUAffinity(sa *specs.CPUAffinity) (*CPUAffinity, error) {
	if sa == nil {
		return nil, nil
	}

	var (
		aff CPUAffinity
		err error
	)
	if aff.Initial, err = toCPUSet(sa.Initial); err != nil {
		return nil, fmt.Errorf("bad CPUAffinity.Initial: %w", err)
	}
	if aff.Final, err = toCPUSet(sa.Final); err != nil {
		return nil, fmt.Errorf("bad CPUAffinity.Final: %w", err)
	}

	// Nothing was requested; report that as "no affinity".
	if aff.Initial == nil && aff.Final == nil {
		return nil, nil
	}
	return &aff, nil
}

type (
HookName string
HookList []Hook
Expand Down
4 changes: 4 additions & 0 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,7 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits,
CPUAffinity: c.config.ExecCPUAffinity,
CreateConsole: process.ConsoleSocket != nil,
ConsoleWidth: process.ConsoleWidth,
ConsoleHeight: process.ConsoleHeight,
Expand All @@ -713,6 +714,9 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
if len(process.Rlimits) > 0 {
cfg.Rlimits = process.Rlimits
}
if process.CPUAffinity != nil {
cfg.CPUAffinity = process.CPUAffinity
}
if cgroups.IsCgroup2UnifiedMode() {
cfg.Cgroup2Path = c.cgroupManager.Path("")
}
Expand Down
3 changes: 2 additions & 1 deletion libcontainer/init_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ type initConfig struct {
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
SpecState *specs.State `json:"spec_state,omitempty"`
Cgroup2Path string `json:"cgroup2_path,omitempty"`
CPUAffinity *configs.CPUAffinity `json:"cpu_affinity,omitempty"`
}

// Init is part of "runc init" implementation.
Expand Down Expand Up @@ -150,7 +151,7 @@ func startInitialization() (retErr error) {

logrus.SetOutput(logPipe)
logrus.SetFormatter(new(logrus.JSONFormatter))
logrus.Debug("child process in init()")
logrus.Debugf("child process in init()")

// Only init processes have FIFOFD.
var fifoFile *os.File
Expand Down
9 changes: 7 additions & 2 deletions libcontainer/nsenter/log.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ void setup_logpipe(void)
loglevel = i;
}

/*
 * Tells whether a message of the given level would be written to the log,
 * i.e. the log descriptor is set and the level is within the configured
 * log level.
 */
bool log_enabled_for(int level)
{
	if (logfd < 0)
		return false;

	return level <= loglevel;
}

/* Defined in nsexec.c */
extern int current_stage;

Expand All @@ -40,8 +45,8 @@ void write_log(int level, const char *format, ...)
va_list args;
int ret;

if (logfd < 0 || level > loglevel)
goto out;
if (!log_enabled_for(level))
return;

va_start(args, format);
ret = vasprintf(&message, format, args);
Expand Down
3 changes: 3 additions & 0 deletions libcontainer/nsenter/log.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef NSENTER_LOG_H
#define NSENTER_LOG_H

#include <stdbool.h>
#include <stdio.h>

/*
Expand All @@ -20,6 +21,8 @@
*/
void setup_logpipe(void);

/* Returns true if a message of the given level would be logged by write_log. */
bool log_enabled_for(int level);

void write_log(int level, const char *format, ...) __attribute__((format(printf, 2, 3)));

extern int logfd;
Expand Down
29 changes: 29 additions & 0 deletions libcontainer/nsenter/nsexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,25 @@ static void update_timens_offsets(pid_t pid, char *map, size_t map_len)
bail("failed to update /proc/%d/timens_offsets", pid);
}

/*
 * Logs (at DEBUG level) the CPU affinity of the calling thread as a
 * hexadecimal bit mask, where bit N is set if CPU N is allowed.
 *
 * Only as many CPUs as there are bits in size_t are reported. If the
 * affinity can not be obtained, a warning is logged instead.
 */
void print_cpu_affinity(void)
{
	cpu_set_t cpus = { };
	size_t i, mask = 0;

	if (sched_getaffinity(0, sizeof(cpus), &cpus) < 0) {
		write_log(WARNING, "sched_getaffinity: %m");
		return;
	}

	/* Do not print the complete mask, we only need a few first CPUs. */
	for (i = 0; i < sizeof(mask) * 8; i++) {
		/*
		 * The shifted value must be as wide as the mask: shifting
		 * a plain int by 31 or more bits is undefined behavior.
		 */
		if (CPU_ISSET(i, &cpus))
			mask |= (size_t)1 << i;
	}

	write_log(DEBUG, "affinity: 0x%zx", mask);
}

void nsexec(void)
{
int pipenum;
Expand All @@ -699,6 +718,16 @@ void nsexec(void)

write_log(DEBUG, "=> nsexec container setup");

/* This is for ../../tests/integration/cpu_affinity.bats test only.
*
* Printing this from Go code might be too late as some kernels
* change the process' CPU affinity to that of container's cpuset
* as soon as the process is moved into container's cgroup.
*/
if (log_enabled_for(DEBUG)) {
print_cpu_affinity();
}

/* Parse all of the netlink configuration. */
nl_parse(pipenum, &config);

Expand Down
2 changes: 2 additions & 0 deletions libcontainer/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ type Process struct {
Scheduler *configs.Scheduler

IOPriority *configs.IOPriority

CPUAffinity *configs.CPUAffinity
}

// Wait waits for the process to exit.
Expand Down
51 changes: 47 additions & 4 deletions libcontainer/process_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,13 +163,53 @@ type setnsProcess struct {
initProcessPid int
}

// startWithCPUAffinity starts the setns process, applying the configured
// initial CPU affinity (if any) so that the new process inherits it.
// Without an initial affinity this is a plain p.cmd.Start().
func (p *setnsProcess) startWithCPUAffinity() error {
	if p.config.CPUAffinity == nil || p.config.CPUAffinity.Initial == nil {
		return p.cmd.Start()
	}
	initial := p.config.CPUAffinity.Initial

	result := make(chan error)
	defer close(result)

	// The affinity is changed on a dedicated OS thread, owned by
	// a separate goroutine.
	go func() {
		runtime.LockOSThread()

		// The command started from this thread inherits its CPU affinity.
		err := unix.SchedSetaffinity(unix.Gettid(), initial)
		if err == nil {
			result <- p.cmd.Start()
			// The thread is intentionally left locked. As per
			// https://pkg.go.dev/runtime#LockOSThread:
			// "If the calling goroutine exits without unlocking the
			// thread, the thread will be terminated".
			return
		}

		runtime.UnlockOSThread()
		result <- fmt.Errorf("error setting initial CPU affinity: %w", err)
	}()

	return <-result
}

// setFinalCPUAffinity applies the configured final CPU affinity to the
// setns process. It does nothing when no final affinity is configured.
func (p *setnsProcess) setFinalCPUAffinity() error {
	if cfg := p.config.CPUAffinity; cfg != nil && cfg.Final != nil {
		if err := unix.SchedSetaffinity(p.pid(), cfg.Final); err != nil {
			return fmt.Errorf("error setting final CPU affinity: %w", err)
		}
	}
	return nil
}

func (p *setnsProcess) start() (retErr error) {
defer p.comm.closeParent()

// get the "before" value of oom kill count
// Get the "before" value of oom kill count.
oom, _ := p.manager.OOMKillCount()
err := p.cmd.Start()
// close the child-side of the pipes (controlled by child)
err := p.startWithCPUAffinity()
// Close the child-side of the pipes (controlled by child).
p.comm.closeChild()
if err != nil {
return fmt.Errorf("error starting setns process: %w", err)
Expand Down Expand Up @@ -219,6 +259,10 @@ func (p *setnsProcess) start() (retErr error) {
}
}
}
// Set final CPU affinity right after the process is moved into container's cgroup.
if err := p.setFinalCPUAffinity(); err != nil {
return err
}
if p.intelRdtPath != "" {
// if Intel RDT "resource control" filesystem path exists
_, err := os.Stat(p.intelRdtPath)
Expand All @@ -228,7 +272,6 @@ func (p *setnsProcess) start() (retErr error) {
}
}
}

if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil {
return fmt.Errorf("error writing config to pipe: %w", err)
}
Expand Down
5 changes: 5 additions & 0 deletions libcontainer/specconv/spec_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,11 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
ioPriority := *spec.Process.IOPriority
config.IOPriority = &ioPriority
}
config.ExecCPUAffinity, err = configs.ConvertCPUAffinity(spec.Process.ExecCPUAffinity)
if err != nil {
return nil, err
}

}
createHooks(spec, config)
config.Version = specs.Version
Expand Down
Loading

0 comments on commit 673d846

Please sign in to comment.