From a05e50d70e28e33185f14523284a1f600aaa0b89 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin
Date: Mon, 28 Oct 2024 16:45:24 -0700
Subject: [PATCH] libct/int: retry Checkpoint for cgroup v1

The cgroup v1 freezer has issues when trying to freeze a cgroup, and
despite criu retries, the checkpoint may fail like this:

    === RUN TestCheckpoint
    time="2024-10-18T08:55:44Z" level=warning msg="--- Quoting "/tmp/TestCheckpoint214687474/003/criu-parent/dump.log""
    time="2024-10-18T08:55:44Z" level=warning msg="118:(09.517977) freezer.state=FREEZING"
    time="2024-10-18T08:55:44Z" level=warning msg="119:(09.618087) freezer.state=FREEZING"
    time="2024-10-18T08:55:44Z" level=warning msg="120:(09.718192) freezer.state=FREEZING"
    time="2024-10-18T08:55:44Z" level=warning msg="121:(09.818291) freezer.state=FREEZING"
    time="2024-10-18T08:55:44Z" level=warning msg="122:(09.918412) freezer.state=FREEZING"
    time="2024-10-18T08:55:44Z" level=warning msg="123:(10.001045) Error (criu/cr-dump.c:1779): Timeout reached. Try to interrupt: 0"
    time="2024-10-18T08:55:44Z" level=warning msg="124:(10.001084) freezer.state=FREEZING"
    time="2024-10-18T08:55:44Z" level=warning msg="125:(10.001125) Unfreezing tasks into 1"
    time="2024-10-18T08:55:44Z" level=warning msg="126:(10.001128) \tUnseizing 45035 into 1"
    time="2024-10-18T08:55:44Z" level=warning msg="127:(10.001140) Error (compel/src/lib/infect.c:418): Unable to detach from 45035: No such process"
    time="2024-10-18T08:55:44Z" level=warning msg="128:(10.001144) Writing image inventory (version 1)"
    time="2024-10-18T08:55:44Z" level=warning msg="129:(10.001223) Error (criu/cr-dump.c:1893): Pre-dumping FAILED."
    time="2024-10-18T08:55:44Z" level=warning msg=---
    checkpoint_test.go:93: criu failed: type PRE_DUMP errno 0

Since cgroup v1 is going to be deprecated, and the problem doesn't exist
on cgroup v2, let's retry the checkpoint a few times (on v1 only) to
avoid flaky tests.

Issues 4457, 4273.

Signed-off-by: Kir Kolyshkin --- libcontainer/integration/checkpoint_test.go | 24 +++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/libcontainer/integration/checkpoint_test.go b/libcontainer/integration/checkpoint_test.go index 8d4d6fe4751..21793f7a3a1 100644 --- a/libcontainer/integration/checkpoint_test.go +++ b/libcontainer/integration/checkpoint_test.go @@ -8,8 +8,10 @@ import ( "regexp" "strings" "testing" + "time" "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/cgroups" "golang.org/x/sys/unix" ) @@ -79,6 +81,24 @@ func testCheckpoint(t *testing.T, userns bool) { tmp := t.TempDir() var parentImage string + retryCheckpoint := func(opts *libcontainer.CriuOpts) error { + err := container.Checkpoint(opts) + // Cgroup v1 freezer is flaky; v2 is fine. + if err == nil || cgroups.IsCgroup2UnifiedMode() { + return err + } + + const retries = 2 + for i := 1; i <= retries; i++ { + time.Sleep(time.Second << i) + t.Logf("cgroup v1 checkpointing is flaky, retry %d of %d", i, retries) + if err = container.Checkpoint(opts); err == nil { + return nil + } + } + return err + } + // Test pre-dump if mem_dirty_track is available. if criuFeature("mem_dirty_track") { parentImage = "../criu-parent" @@ -89,7 +109,7 @@ func testCheckpoint(t *testing.T, userns bool) { PreDump: true, } - if err := container.Checkpoint(preDumpOpts); err != nil { + if err := retryCheckpoint(preDumpOpts); err != nil { t.Fatal(err) } @@ -109,7 +129,7 @@ func testCheckpoint(t *testing.T, userns bool) { ParentImage: parentImage, } - if err := container.Checkpoint(checkpointOpts); err != nil { + if err := retryCheckpoint(checkpointOpts); err != nil { t.Fatal(err) }