Skip to content

Commit

Permalink
Merge pull request #160 from srl-labs/feat/better-launcher-logs-on-co…
Browse files Browse the repository at this point in the history
…ntainer-start-fail

feat: some hopefully better logging for when clab fails due to container failing too quickly
  • Loading branch information
carlmontanari authored Jun 20, 2024
2 parents 23bb254 + e9d1b4c commit 417bebe
Show file tree
Hide file tree
Showing 7 changed files with 89 additions and 20 deletions.
3 changes: 2 additions & 1 deletion build/launcher.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ FROM --platform=linux/amd64 debian:bookworm-slim
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

ARG DOCKER_VERSION="5:26.*"
ARG CONTAINERLAB_VERSION="0.55.*"
# pinning back because the vxlan tools have some issue in 0.52.0 that we need to investigate
ARG CONTAINERLAB_VERSION="0.51.3"
ARG NERDCTL_VERSION="1.7.6"

RUN apt-get update && \
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ require (
// pin back to help controller-runtime out
// https://github.com/kubernetes-sigs/controller-runtime/issues/2788
k8s.io/client-go v0.30.2
k8s.io/klog/v2 v2.130.0
k8s.io/klog/v2 v2.130.1
k8s.io/kube-openapi v0.0.0-20240521193020-835d969ad83a
sigs.k8s.io/controller-runtime v0.18.4
sigs.k8s.io/yaml v1.4.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,8 @@ k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw=
k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/klog/v2 v2.130.0 h1:5nB3+3HpqKqXJIXNtJdtxcDCfaa9KL8StJgMzGJkUkM=
k8s.io/klog/v2 v2.130.0/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kms v0.30.2 h1:VSZILO/tkzrz5Tu2j+yFQZ2Dc5JerQZX2GqhFJbQrfw=
k8s.io/kms v0.30.2/go.mod h1:GrMurD0qk3G4yNgGcsCEmepqf9KyyIrTXYR2lyUOJC4=
k8s.io/kube-openapi v0.0.0-20240521193020-835d969ad83a h1:zD1uj3Jf+mD4zmA7W+goE5TxDkI7OGJjBNBzq5fJtLA=
Expand Down
26 changes: 23 additions & 3 deletions launcher/clabernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ func (c *clabernetes) startup() {
c.launch()
c.connectivity()

go c.imageCleanup()
go c.runProbes()
go c.watchContainers()

Expand Down Expand Up @@ -198,10 +199,15 @@ func (c *clabernetes) launch() {

err := c.runContainerlab()
if err != nil {
c.logger.Fatalf("failed launching containerlab, err: %s", err)
c.logger.Criticalf(
"failed launching containerlab,"+
" will try to gather crashed container logs then will exit, err: %s", err,
)

c.reportContainerLaunchFail()
}

c.containerIDs, err = getContainerIDs()
c.containerIDs, err = getContainerIDs(false)
if err != nil {
c.logger.Warnf(
"failed determining container ids will continue but will not log container output,"+
Expand Down Expand Up @@ -385,7 +391,7 @@ func (c *clabernetes) watchContainers() {
ticker := time.NewTicker(containerCheckInterval)

for range ticker.C {
currentContainerIDs, err := getContainerIDs()
currentContainerIDs, err := getContainerIDs(false)
if err != nil {
c.logger.Warnf(
"failed listing container ids, error: %s",
Expand All @@ -406,3 +412,17 @@ func (c *clabernetes) watchContainers() {
}
}
}

// reportContainerLaunchFail gathers diagnostics after a failed containerlab launch and then
// terminates the launcher. It asks docker for every container id (getContainerIDs is called
// with all=true so non-running containers are included), hands those ids to
// printContainerLogs so their output ends up on the node logger, and finally exits the
// process with clabernetesconstants.ExitCodeError.
func (c *clabernetes) reportContainerLaunchFail() {
	ids, listErr := getContainerIDs(true)
	if listErr != nil {
		// without the ids there is nothing to report on
		c.logger.Fatalf(
			"failed launching containerlab, then failed gathering all container "+
				"ids to report container status. error: %s", listErr,
		)
	}

	printContainerLogs(c.nodeLogger, ids)

	os.Exit(clabernetesconstants.ExitCodeError)
}
20 changes: 8 additions & 12 deletions launcher/connectivity/vxlan.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,20 +73,16 @@ func (m *vxlanManager) resolveVXLANService(vxlanRemote string) (string, error) {
for attempt := 0; attempt < resolveServiceMaxAttempts; attempt++ {
resolvedVxlanRemotes, err = net.LookupIP(vxlanRemote)
if err != nil {
if attempt < resolveServiceMaxAttempts {
m.logger.Warnf(
"failed resolving remote vxlan endpoint but under max attempts will try"+
" again in %s. error: %s",
resolveServiceSleep,
err,
)

time.Sleep(resolveServiceSleep)
m.logger.Warnf(
"failed resolving remote vxlan endpoint but under max attempts will try"+
" again in %s. error: %s",
resolveServiceSleep,
err,
)

continue
}
time.Sleep(resolveServiceSleep)

return "", err
continue
}

break
Expand Down
37 changes: 34 additions & 3 deletions launcher/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,16 @@ func startDocker(logger io.Writer) error {
}
}

func getContainerIDs() ([]string, error) {
// return all the container ids running in the pod
psCmd := exec.Command("docker", "ps", "--quiet")
func getContainerIDs(all bool) ([]string, error) {
args := []string{"ps"}

if all {
args = append(args, "-a")
}

args = append(args, "--quiet")

psCmd := exec.Command("docker", args...)

output, err := psCmd.Output()
if err != nil {
Expand All @@ -148,6 +155,30 @@ func getContainerIDs() ([]string, error) {
return containerIDs, nil
}

// printContainerLogs runs `docker logs <id>` once for each entry in containerIDs, wiring both
// stdout and stderr of the command to logger. If the command fails for a container a warning
// is logged and the loop carries on with the next id.
func printContainerLogs(
	logger claberneteslogging.Instance,
	containerIDs []string,
) {
	for idx := range containerIDs {
		id := containerIDs[idx]

		logsCmd := exec.Command("docker", "logs", id) //nolint:gosec
		logsCmd.Stdout, logsCmd.Stderr = logger, logger

		if runErr := logsCmd.Run(); runErr != nil {
			logger.Warnf(
				"printing node logs for container id %q failed, err: %s", id, runErr,
			)
		}
	}
}

func tailContainerLogs(
logger claberneteslogging.Instance,
nodeLogger io.Writer,
Expand Down
19 changes: 19 additions & 0 deletions launcher/image.go
Original file line number Diff line number Diff line change
Expand Up @@ -356,3 +356,22 @@ func (c *clabernetes) imageImport() error {

return nil
}

// imageCleanup runs `docker system prune --force` and streams the command's stdout and stderr
// to the launcher logger. A failing prune is only logged as a warning; nothing is returned to
// the caller. startup launches this in its own goroutine.
func (c *clabernetes) imageCleanup() {
	c.logger.Debug("running image (docker) cleanup in background...")

	pruneCmd := exec.Command("docker", "system", "prune", "--force")
	pruneCmd.Stdout = c.logger
	pruneCmd.Stderr = c.logger

	if err := pruneCmd.Run(); err != nil {
		c.logger.Warnf("failed pruning docker daemon, error: %s", err)
	}
}

0 comments on commit 417bebe

Please sign in to comment.