diff --git a/checkpoint.go b/checkpoint.go index 3f8f407..27f74b9 100644 --- a/checkpoint.go +++ b/checkpoint.go @@ -38,6 +38,12 @@ type RestoreOptions struct { // established connections. TCPEstablished bool + // IgnoreVolumes skips restoring volume content from the archive, + // reusing an existing volume. Leave false for cross-node restore; + // set true for same-node restore-in-place. See + // ProjectRestoreOptions.IgnoreVolumes. + IgnoreVolumes bool + // LocalEnv overrides os.Environ() for the reattached workspace's // substituter localEnv pass. Nil means use the current process // environment — matches AttachOptions.LocalEnv. On a cross-node @@ -117,6 +123,7 @@ func (e *Engine) Restore(ctx context.Context, opts RestoreOptions) (*Workspace, ArchivePath: opts.ArchivePath, Name: opts.Name, TCPEstablished: opts.TCPEstablished, + IgnoreVolumes: opts.IgnoreVolumes, }) if err != nil { return nil, fmt.Errorf("restore: %w", err) diff --git a/checkpoint_project.go b/checkpoint_project.go index edf951f..b273d96 100644 --- a/checkpoint_project.go +++ b/checkpoint_project.go @@ -3,6 +3,7 @@ package devcontainer import ( "context" "encoding/json" + "errors" "fmt" "os" "path/filepath" @@ -63,6 +64,14 @@ type ProjectRestoreOptions struct { // established connections. TCPEstablished bool + // IgnoreVolumes skips restoring volume content from the archives, + // reusing whatever volumes already exist. Leave false for a + // cross-node restore (the destination has no volumes, so content + // must come from the archive). Set true for a same-node + // restore-in-place, where the volumes still exist with current data + // and re-extracting them would collide ("volume already exists"). + IgnoreVolumes bool + // LocalEnv overrides os.Environ() for the reattached primary // workspace's substituter (parity with RestoreOptions.LocalEnv). LocalEnv map[string]string @@ -199,11 +208,59 @@ func (e *Engine) RestoreProject(ctx context.Context, opts ProjectRestoreOptions) return nil, fmt.Errorf("RestoreProject: %w", err) } + // A cross-node restore lands on a fresh store with no project network. + // The checkpointed containers were attached to _default, and + // libpod restore fails ("network not found") unless it exists first. + // Recreate it before restoring any container, mirroring how + // compose.Orchestrator.Up names + labels it (CreateNetwork is + // idempotent — a label-matching network is reused, so same-node + // restore is unaffected). Custom/extra compose networks aren't yet + // recorded in the manifest — see design/checkpoint-restore-fixes.md. + if _, err := e.runtime.CreateNetwork(ctx, runtime.NetworkSpec{ + Name: manifest.Project + "_default", + Labels: map[string]string{ + compose.LabelComposeProject: manifest.Project, + compose.LabelEngine: compose.EngineDisplayName, + }, + }); err != nil { + return nil, fmt.Errorf("RestoreProject: recreate project network: %w", err) + } + out := &ProjectRestore{Project: manifest.Project, Services: map[string]*runtime.Container{}} for _, svc := range manifest.Services { + // Restore (--import) re-creates the container under its archived, + // deterministic compose name. On a fresh/cross-node store nothing + // pre-exists. On a same-node restore the checkpoint left the source + // container *stopped* under this name (StopAfter stops, it does not + // remove), which collides with re-create ("that ID is already in + // use"). Clear a non-running leftover — its full state is in the + // archive — but refuse to clobber a *running* container: that would + // be destroying a live service, not restoring it. RemoveVolumes + // stays false so the service's data volume survives for reuse. + name := manifest.Project + "-" + svc.Service + "-1" + d, ierr := e.runtime.InspectContainer(ctx, name) + switch { + case ierr != nil: + // Absent is the normal fresh/cross-node case — proceed. Any + // other inspect failure (daemon/API/permission) is surfaced + // rather than masked behind a downstream restore error. + var notFound *runtime.ContainerNotFoundError + if !errors.As(ierr, ¬Found) { + return nil, fmt.Errorf("RestoreProject: service %q: inspect existing container %q: %w", svc.Service, name, ierr) + } + case d != nil: + if d.State == runtime.StateRunning { + return nil, fmt.Errorf("RestoreProject: service %q: a running container %q already exists — stop it before restoring", svc.Service, name) + } + if rerr := e.runtime.RemoveContainer(ctx, name, runtime.RemoveOptions{Force: true}); rerr != nil { + return nil, fmt.Errorf("RestoreProject: service %q: clearing stale container %q: %w", svc.Service, name, rerr) + } + } + c, err := cr.Restore(ctx, runtime.RestoreSpec{ ArchivePath: filepath.Join(opts.ArchiveDir, svc.Archive), TCPEstablished: opts.TCPEstablished, + IgnoreVolumes: opts.IgnoreVolumes, }) if err != nil { return nil, fmt.Errorf("RestoreProject: service %q: %w", svc.Service, err) diff --git a/checkpoint_project_test.go b/checkpoint_project_test.go index 1fa0f21..ceb105c 100644 --- a/checkpoint_project_test.go +++ b/checkpoint_project_test.go @@ -39,6 +39,13 @@ func (f *fakeProjectRuntime) Capabilities() runtime.Capabilities { return c } +// CreateNetwork succeeds: RestoreProject recreates the project network +// before restoring containers (cross-node fresh-store path), so the +// checkpoint-capable fake must support it. +func (f *fakeProjectRuntime) CreateNetwork(ctx context.Context, spec runtime.NetworkSpec) (string, error) { + return "net-" + spec.Name, nil +} + func (f *fakeProjectRuntime) ListContainers(ctx context.Context, filter runtime.LabelFilter) ([]runtime.Container, error) { f.fakeRuntime.mu.Lock() defer f.fakeRuntime.mu.Unlock() diff --git a/compose/orchestrator.go b/compose/orchestrator.go index 94c82ab..b1994d4 100644 --- a/compose/orchestrator.go +++ b/compose/orchestrator.go @@ -31,6 +31,7 @@ import ( "errors" "fmt" "sort" + "strings" "sync" "time" @@ -82,15 +83,36 @@ type Orchestrator struct { // PollInterval is the cadence for health polling. Tests // override; production default below. PollInterval time.Duration + + // selfProbe is true when the backend asks the orchestrator to run + // healthcheck probes itself (via Exec) instead of configuring the + // backend's native HEALTHCHECK — see selfHealthProber and + // design/compose-native-health.md. Computed once at construction. + selfProbe bool +} + +// selfHealthProber is an optional runtime capability: a backend that +// implements it and returns true asks the orchestrator to drive +// healthcheck probes itself rather than relying on the backend's native +// HEALTHCHECK. Podman implements it — its native healthcheck runs eagerly +// as root and races privilege-dropping images. Docker and Apple do not, +// keeping their existing behavior. +type selfHealthProber interface { + PreferSelfProbedHealth() bool } // NewOrchestrator constructs an Orchestrator with sane defaults. func NewOrchestrator(rt runtime.Runtime, backendName string) *Orchestrator { + selfProbe := false + if p, ok := rt.(selfHealthProber); ok { + selfProbe = p.PreferSelfProbedHealth() + } return &Orchestrator{ rt: rt, BackendName: backendName, HealthTimeout: DefaultHealthTimeout, PollInterval: 500 * time.Millisecond, + selfProbe: selfProbe, } } @@ -481,6 +503,14 @@ func (o *Orchestrator) ensureService( } spec := serviceToRunSpec(plan, svc, projectLabels, hash, imageDigest) + if o.selfProbe { + // The orchestrator probes health itself (see waitFor); explicitly + // DISABLE the backend's native HEALTHCHECK. Nil would mean "inherit + // from image" (toHealthcheck), so an image-baked HEALTHCHECK would + // still run natively — reintroducing Podman's eager root probe that + // breaks privilege-dropping images. Disable emits the NONE sentinel. + spec.HealthCheck = &runtime.HealthCheckSpec{Disable: true} + } c, err := o.rt.RunContainer(ctx, spec) if err != nil { // Compose's `up -d` pulls missing images implicitly. Mirror @@ -582,7 +612,11 @@ func (o *Orchestrator) gateLevel( if cid == "" { continue } - if err := o.waitFor(ctx, svcName, cid, req.condition, deadline); err != nil { + var hc *runtime.HealthCheckSpec + if svc, ok := plan.Project.Services[svcName]; ok { + hc = healthCheckOf(svc.HealthCheck) + } + if err := o.waitFor(ctx, svcName, cid, req.condition, hc, deadline); err != nil { if req.optional { // Per compose spec: a non-required dependency that // fails to satisfy its condition does not block the @@ -596,46 +630,63 @@ func (o *Orchestrator) gateLevel( return nil } -// waitFor polls a service's condition until satisfied or deadline. +// waitFor polls a service's condition until satisfied or deadline. When +// o.selfProbe is set, a service_healthy gate is evaluated by executing the +// service's healthcheck command (hc) via Exec rather than reading the +// backend's native health status — see selfHealthProber and +// design/compose-native-health.md. The completion condition always reads +// container state (no native healthcheck required either way). func (o *Orchestrator) waitFor( - ctx context.Context, svc, id, cond string, deadline time.Time, + ctx context.Context, svc, id, cond string, hc *runtime.HealthCheckSpec, deadline time.Time, ) error { + // In self-probe mode, honor start_period as a grace delay before the + // first probe so an eager probe can't run before the service inits. + probeNotBefore := time.Now() + if o.selfProbe && hc != nil { + probeNotBefore = probeNotBefore.Add(hc.StartPeriod) + } for { if err := ctx.Err(); err != nil { return err } - details, err := o.rt.InspectContainer(ctx, id) - if err == nil && details != nil { - switch cond { - case "service_healthy": - // Treat HealthNone as satisfied: a container with - // no HEALTHCHECK directive can still be a - // service_healthy gate target (compose's behavior), - // so falling back to State=Running keeps that case - // working. For services that DO declare a - // healthcheck, require Healthy explicitly. - switch details.Health { - case runtime.HealthHealthy: - return nil - case runtime.HealthNone: - if details.State == runtime.StateRunning { + if o.selfProbe && cond == "service_healthy" { + if o.probeHealthy(ctx, id, hc, probeNotBefore) { + return nil + } + } else { + details, err := o.rt.InspectContainer(ctx, id) + if err == nil && details != nil { + switch cond { + case "service_healthy": + // Treat HealthNone as satisfied: a container with + // no HEALTHCHECK directive can still be a + // service_healthy gate target (compose's behavior), + // so falling back to State=Running keeps that case + // working. For services that DO declare a + // healthcheck, require Healthy explicitly. + switch details.Health { + case runtime.HealthHealthy: return nil + case runtime.HealthNone: + if details.State == runtime.StateRunning { + return nil + } + case runtime.HealthUnhealthy: + return fmt.Errorf( + "compose: service %q reported unhealthy while waiting on service_healthy", + svc, + ) + } + case "service_completed_successfully": + if details.State == runtime.StateExited && details.ExitCode == 0 { + return nil + } + if details.State == runtime.StateExited && details.ExitCode != 0 { + return fmt.Errorf( + "compose: %s exited with code %d while waiting for completion", + svc, details.ExitCode, + ) } - case runtime.HealthUnhealthy: - return fmt.Errorf( - "compose: service %q reported unhealthy while waiting on service_healthy", - svc, - ) - } - case "service_completed_successfully": - if details.State == runtime.StateExited && details.ExitCode == 0 { - return nil - } - if details.State == runtime.StateExited && details.ExitCode != 0 { - return fmt.Errorf( - "compose: %s exited with code %d while waiting for completion", - svc, details.ExitCode, - ) } } } @@ -654,6 +705,59 @@ func (o *Orchestrator) waitFor( } } +// probeHealthy runs the service's compose healthcheck once via Exec and +// reports whether it currently passes. A nil/disabled/NONE/empty +// healthcheck falls back to "is the container running?", mirroring the +// native path's HealthNone behavior. During the start_period grace +// (before notBefore) it reports not-healthy without probing, so the first +// probe is deferred until the service has had time to initialize. +func (o *Orchestrator) probeHealthy( + ctx context.Context, id string, hc *runtime.HealthCheckSpec, notBefore time.Time, +) bool { + cmd := healthProbeCmd(hc) + if cmd == nil { + details, err := o.rt.InspectContainer(ctx, id) + return err == nil && details != nil && details.State == runtime.StateRunning + } + if time.Now().Before(notBefore) { + return false + } + probeCtx := ctx + if hc.Timeout > 0 { + var cancel context.CancelFunc + probeCtx, cancel = context.WithTimeout(ctx, hc.Timeout) + defer cancel() + } + res, err := o.rt.ExecContainer(probeCtx, id, runtime.ExecOptions{Cmd: cmd}) + return err == nil && res.ExitCode == 0 +} + +// healthProbeCmd converts a compose-normalized healthcheck Test into an +// Exec command, or nil for NONE / disabled / empty. Compose normalizes +// Test to a CMD / CMD-SHELL / NONE leading token; the default branch +// shell-runs a bare-string form defensively. +func healthProbeCmd(hc *runtime.HealthCheckSpec) []string { + if hc == nil || hc.Disable || len(hc.Test) == 0 { + return nil + } + switch hc.Test[0] { + case "NONE": + return nil + case "CMD": + if len(hc.Test) < 2 { + return nil + } + return append([]string(nil), hc.Test[1:]...) + case "CMD-SHELL": + if len(hc.Test) < 2 { + return nil + } + return []string{"/bin/sh", "-c", hc.Test[1]} + default: + return []string{"/bin/sh", "-c", strings.Join(hc.Test, " ")} + } +} + // serviceToRunSpec is the in-memory translation from compose's // ServiceConfig to runtime.RunSpec. This is intentionally minimal // for C6 — env / labels / mounts / command / entrypoint / user / diff --git a/compose/orchestrator_health_test.go b/compose/orchestrator_health_test.go new file mode 100644 index 0000000..9c896f9 --- /dev/null +++ b/compose/orchestrator_health_test.go @@ -0,0 +1,139 @@ +package compose + +import ( + "context" + "errors" + "reflect" + "testing" + "time" + + "github.com/crunchloop/devcontainer/runtime" +) + +func TestHealthProbeCmd(t *testing.T) { + tests := []struct { + name string + hc *runtime.HealthCheckSpec + want []string + }{ + {"nil", nil, nil}, + {"disabled", &runtime.HealthCheckSpec{Test: []string{"CMD", "true"}, Disable: true}, nil}, + {"empty", &runtime.HealthCheckSpec{Test: nil}, nil}, + {"none", &runtime.HealthCheckSpec{Test: []string{"NONE"}}, nil}, + {"cmd", &runtime.HealthCheckSpec{Test: []string{"CMD", "rabbitmq-diagnostics", "-q", "ping"}}, []string{"rabbitmq-diagnostics", "-q", "ping"}}, + {"cmd-no-args", &runtime.HealthCheckSpec{Test: []string{"CMD"}}, nil}, + {"cmd-shell", &runtime.HealthCheckSpec{Test: []string{"CMD-SHELL", "pg_isready -U postgres"}}, []string{"/bin/sh", "-c", "pg_isready -U postgres"}}, + {"bare-string-defensive", &runtime.HealthCheckSpec{Test: []string{"curl", "-f", "localhost"}}, []string{"/bin/sh", "-c", "curl -f localhost"}}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := healthProbeCmd(tt.hc) + if !reflect.DeepEqual(got, tt.want) { + t.Fatalf("healthProbeCmd(%v) = %v, want %v", tt.hc, got, tt.want) + } + }) + } +} + +// selfProbingMock is a mockRuntime that opts into orchestrator-driven +// health probing, exercising the selfHealthProber detection. +type selfProbingMock struct{ *mockRuntime } + +func (selfProbingMock) PreferSelfProbedHealth() bool { return true } + +func TestNewOrchestratorDetectsSelfProbe(t *testing.T) { + if NewOrchestrator(newMockRuntime(), "docker").selfProbe { + t.Error("docker mock (no selfHealthProber) should not self-probe") + } + if !NewOrchestrator(selfProbingMock{newMockRuntime()}, "podman").selfProbe { + t.Error("podman-like mock should self-probe") + } +} + +// In self-probe mode the RunSpec must DISABLE the native healthcheck, not +// leave it nil — nil means "inherit the image's HEALTHCHECK", which would +// let Podman run an image-baked probe (as root, eagerly) and reintroduce +// the very failure self-probing exists to avoid. +func TestUp_SelfProbeDisablesNativeHealthcheck(t *testing.T) { + rt := newMockRuntime() + orch := NewOrchestrator(selfProbingMock{rt}, "podman") + proj := newProject(t, map[string][]string{"app": nil}) + + var got *runtime.HealthCheckSpec + rt.OnRunContainer = func(spec runtime.RunSpec) (*runtime.Container, error) { + got = spec.HealthCheck + return nil, nil + } + if _, err := orch.Up(context.Background(), &Plan{Project: proj, ProjectName: "dc-x"}); err != nil { + t.Fatalf("Up: %v", err) + } + if got == nil || !got.Disable { + t.Fatalf("self-probe must set HealthCheck.Disable=true (got %+v); nil would inherit the image healthcheck", got) + } +} + +func TestWaitForSelfProbeHealthy(t *testing.T) { + rt := newMockRuntime() + var execed int + rt.OnExec = func(id string, opts runtime.ExecOptions) (runtime.ExecResult, error) { + execed++ + // Succeed only on the 2nd probe — proves we poll the command, + // not the native health status. + if execed >= 2 { + return runtime.ExecResult{ExitCode: 0}, nil + } + return runtime.ExecResult{ExitCode: 1}, nil + } + orch := NewOrchestrator(selfProbingMock{rt}, "podman") + orch.PollInterval = time.Millisecond + + hc := &runtime.HealthCheckSpec{Test: []string{"CMD", "rabbitmq-diagnostics", "-q", "ping"}} + err := orch.waitFor(context.Background(), "rabbitmq", "c1", "service_healthy", hc, time.Now().Add(2*time.Second)) + if err != nil { + t.Fatalf("waitFor = %v, want nil (probe should pass on 2nd try)", err) + } + if execed < 2 { + t.Fatalf("expected >= 2 probe execs, got %d", execed) + } +} + +func TestWaitForSelfProbeTimeout(t *testing.T) { + rt := newMockRuntime() + rt.OnExec = func(id string, opts runtime.ExecOptions) (runtime.ExecResult, error) { + return runtime.ExecResult{ExitCode: 1}, nil // never healthy + } + orch := NewOrchestrator(selfProbingMock{rt}, "podman") + orch.PollInterval = time.Millisecond + orch.HealthTimeout = 30 * time.Millisecond + + hc := &runtime.HealthCheckSpec{Test: []string{"CMD", "false"}} + err := orch.waitFor(context.Background(), "svc", "c1", "service_healthy", hc, time.Now().Add(30*time.Millisecond)) + var hte *HealthTimeoutError + if !errors.As(err, &hte) { + t.Fatalf("waitFor = %v, want *HealthTimeoutError", err) + } +} + +// A service with no/NONE healthcheck, self-probe mode: gate is satisfied +// by the container merely running (mirrors native HealthNone behavior). +func TestWaitForSelfProbeNoHealthcheckRunning(t *testing.T) { + rt := newMockRuntime() + c, err := rt.RunContainer(context.Background(), runtime.RunSpec{Name: "svc"}) + if err != nil { + t.Fatal(err) + } + if err := rt.StartContainer(context.Background(), c.ID); err != nil { + t.Fatal(err) + } + rt.OnExec = func(id string, opts runtime.ExecOptions) (runtime.ExecResult, error) { + t.Fatal("should not exec a probe when there is no healthcheck") + return runtime.ExecResult{}, nil + } + orch := NewOrchestrator(selfProbingMock{rt}, "podman") + orch.PollInterval = time.Millisecond + + err = orch.waitFor(context.Background(), "svc", c.ID, "service_healthy", nil, time.Now().Add(time.Second)) + if err != nil { + t.Fatalf("waitFor = %v, want nil (running container, no healthcheck)", err) + } +} diff --git a/compose/orchestrator_test.go b/compose/orchestrator_test.go index 1593c66..4e1321e 100644 --- a/compose/orchestrator_test.go +++ b/compose/orchestrator_test.go @@ -45,6 +45,7 @@ type mockRuntime struct { // Hooks (set to override default behavior). OnRunContainer func(spec runtime.RunSpec) (*runtime.Container, error) OnInspect func(id string, base *runtime.ContainerDetails) *runtime.ContainerDetails + OnExec func(id string, opts runtime.ExecOptions) (runtime.ExecResult, error) // imageDigest, when non-empty, is the digest InspectImage // returns for every reference. Tests that exercise digest-drift @@ -139,6 +140,9 @@ func (m *mockRuntime) RemoveContainer(ctx context.Context, id string, opts runti } func (m *mockRuntime) ExecContainer(ctx context.Context, id string, opts runtime.ExecOptions) (runtime.ExecResult, error) { + if m.OnExec != nil { + return m.OnExec(id, opts) + } return runtime.ExecResult{}, nil } diff --git a/design/checkpoint-restore.md b/design/checkpoint-restore.md index 3820554..6c82287 100644 --- a/design/checkpoint-restore.md +++ b/design/checkpoint-restore.md @@ -242,13 +242,46 @@ Model notes (validated by the Phase-0 spike, §7): - **Network:** the shared network re-forms as containers come back (`restore --import` re-attaches networking, and Podman restores the original container name, so service-name DNS resolves again). The network - must still exist on the target; recreating it cross-node is the - orchestrator's caller's job (or a follow-up). + must exist on the target *before* restore, or libpod fails with `network + not found`. **`RestoreProject` now recreates `_default` itself** + (idempotent, mirroring `compose.Orchestrator.Up`'s name + labels) so a + cross-node restore onto a fresh store works. Only the default network is + recreated; custom/extra compose networks aren't recorded in the manifest + yet — a follow-up records the network set at checkpoint time. +- **Replace, don't duplicate:** `restore --import` always creates a *new* + container under the archived (deterministic `--1`) + name. `CheckpointProject(StopAfter)` stops but does not remove, so a + same-node restore-in-place collides on that name (`ID already in use`). + `RestoreProject` clears it first, but only when **not running** (its + state is in the archive; `RemoveVolumes` stays false so the data volume + survives) — a *running* same-named container is a genuine conflict and + restore errors rather than killing a live service. No-op cross-node. +- **Volumes:** restore re-extracts volume content from the archive by + default (correct cross-node, where the destination has none). For a + same-node restore-in-place the volumes still exist and re-extraction + collides (`volume already exists`); `RestoreSpec.IgnoreVolumes` + (surfaced as `ProjectRestoreOptions`/`RestoreOptions.IgnoreVolumes` → + Podman `ignoreVolumes=true`) reuses the existing volume instead. - **Completeness:** the manifest is written last, so its presence implies a complete set; a partial checkpoint leaves no manifest and `RestoreProject` fails cleanly. - **Scale:** one container per service in v1 (no compose `scale`). +Two **node prerequisites** the multi-service validation surfaced (both +node-provisioning, not library code — the engine talks to a remote socket): + +- **CRIU `skip-file-rwx-check`** in `/etc/criu/default.conf` on every node + that restores. Some images set their data dir to `0700` (Postgres + requires it) and CRIU records the process cwd mode at checkpoint; the + restored volume mountpoint is `0755`, so CRIU's file-mode consistency + check rejects it. Podman exposes no per-restore flag for it (5.7.0), and + the content is correct — only the mountpoint mode differs — so the CRIU + config switch is the fix, alongside installing CRIU itself. +- **Base images present on the destination.** A project restore is *not* + fully self-contained: Podman pulls each service's base image if absent. + A cross-node destination needs the images cached or registry credentials + (in practice it already runs the same workloads). + ## 4. Capability gate Add one field to the `Capabilities` struct @@ -362,13 +395,26 @@ What's **proven** (2026-06-19, bench pod, real image): peer-IP change on restore is the residual edge (matches the "agents reconnect" assumption). +- ✅ **Real multi-service devcontainer, full project round trip + true + cross-node migration (2026-06-23).** An 8-service compose devcontainer + (Postgres, RabbitMQ, MinIO, an OAuth mock, an app sidecar, …) checkpointed + on node A and `RestoreProject`'d on node B — a separate, never-ran-this + Podman store. All 8 services resumed and in-memory/disk state survived the + move (a file written in the app container and a row written in Postgres + pre-checkpoint were present post-restore on B). Exercised the §3.3 + decisions (network recreation, replace-guard, volume handling) and both + node prerequisites above. Timings: checkpoint ~9–17s, restore ~6–7s + cross-node / ~5s same-node-reuse for a ~580 MB archive set; storage speed + (local vs shared mount) and whether volume content is re-extracted are the + dominant variables. + What's **still open** (minor): -- **Working-set timing.** Idle container was fast (~0.5s same-pod, ~3.5s - cross-pod incl. rootfs unpack); a busy agent's memory footprint sets the - real checkpoint time vs the eviction window. Measure on a real workload. -- **Forced cross-node** placement (anti-affinity) — belt-and-suspenders; - the archive is already proven node-independent. +- **Working-set timing on a busy agent.** The validations above were on + freshly-started services; a saturated agent's memory footprint still sets + the real checkpoint time vs the eviction window — measure under load. +- **Forced cross-node** placement (anti-affinity) — now exercised across two + isolated Podman stores (2026-06-23); the archive is proven node-independent. ## 8. Constraints, risks & mitigations diff --git a/design/compose-native-health.md b/design/compose-native-health.md new file mode 100644 index 0000000..0f272a1 --- /dev/null +++ b/design/compose-native-health.md @@ -0,0 +1,166 @@ +# Design — Orchestrator-driven health probing (Podman) + +**Status:** Draft for review +**Date:** 2026-06-23 +**Scope:** Make the native compose orchestrator (`compose/orchestrator.go`) +run healthcheck probes itself via `Exec` on backends whose native +HEALTHCHECK is unsafe, instead of configuring the backend's native +healthcheck. Fixes RabbitMQ (and any privilege-dropping image) failing to +boot under the Podman backend. + +Companion to `design/compose-native.md` (the orchestrator) and +`design/podman-backend.md` (the Podman runtime). + +--- + +## 1. Problem + +A `rabbitmq` service in a multi-service compose devcontainer exits 1 under +the Podman backend: + +``` +Error when reading /var/lib/rabbitmq/.erlang.cookie: eacces +``` + +Root cause (reproduced deterministically, incl. bare `podman run +--health-cmd ...` — i.e. a Podman behavior, not a bug in our translation): + +1. The rabbitmq image runs as root; its entrypoint `gosu`-drops + `rabbitmq-server` to uid 999. +2. **Podman runs the container HEALTHCHECK as root and fires the first + probe ~1-2s after start** (it does *not* defer the first probe for + `start_period`). +3. rabbitmq's healthcheck `rabbitmq-diagnostics -q ping` is an Erlang + escript. Run as root with `HOME=/var/lib/rabbitmq` and no cookie yet, + it **creates `/var/lib/rabbitmq/.erlang.cookie` owned root:root, + mode 400**. +4. The uid-999 server then cannot read root's cookie → `eacces` → exit 1. + +Docker is unaffected because it defers the first probe past the interval, +so the 999 server writes the `999:999` cookie first; a later root probe +just reads it (root bypasses DAC). Only Erlang-cookie-style images are hit +— `pg_isready` / `curl` probes don't create root-owned files the main +process must read. + +Evidence: no healthcheck → works; `CMD true` healthcheck → works; real +healthcheck (even with `start_period: 120s`) → fails; first probe deferred +until after server init → works. + +## 2. Decision + +On backends that opt in, the orchestrator **owns health probing**: + +- **Do not** set `RunSpec.HealthCheck` (the backend never runs a native + HEALTHCHECK, so no eager root probe). +- For `depends_on..condition: service_healthy` gates, the orchestrator + runs the service's compose `test` command itself via `ExecContainer`, + polling until it exits 0 (healthy) or `HealthTimeout` (→ + `HealthTimeoutError`). + +Because probing is driven by the gate loop — which runs only *after* the +level's containers have reached running — the first probe lands after the +service has begun initializing, mirroring Docker's deferred-first-probe +behavior. `start_period` is honored as a grace delay before the first +probe. + +Services that declare a healthcheck but are **not** gated by any dependent +(e.g. rabbitmq, when the dependent depends on it with the list/`service_started` +form) are simply never probed — they run without a health status, which is +correct: nothing consumes it, and the eager-root-probe crash is gone. + +### 2.1 Backend opt-in + +A backend signals the preference through an optional, consumer-defined +interface (Go idiom — the orchestrator declares what it needs): + +```go +// in compose/ +type selfHealthProber interface { PreferSelfProbedHealth() bool } +``` + +`runtime/podman.Runtime` implements it returning `true`. Docker and +Apple do not implement it → unchanged behavior (Docker keeps native +healthchecks + `InspectContainer().Health` gating; Apple keeps its +`Healthchecks: false` fallback). + +Rejected alternatives: +- A `runtime.Capabilities` bit — `Capabilities` models compose *feature* + support; "native healthcheck runs eagerly as root" is a backend quirk, + not a feature. Keeping it out of `Capabilities` avoids conflating the two. +- Type-asserting `*podman.Runtime` in the engine — couples the engine to a + concrete backend and risks an import cycle. + +### 2.2 Probe semantics + +- `test` is compose-normalized to `["CMD", arg...]`, `["CMD-SHELL", str]`, + or `["NONE"]`. CMD → `ExecOptions.Cmd = test[1:]`; CMD-SHELL → + `["/bin/sh", "-c", test[1]]`; NONE / empty / `Disable` → treated as "no + healthcheck" → fall back to `State == Running` (same as today's + `HealthNone`). +- Exec runs as the container's default user (`ExecOptions.User=""`). Safe + because probing is deferred: by the time we probe, the main process has + initialized. (Residual edge: a *gated* Erlang-cookie service with no + `start_period` could still be probed before its server writes the cookie; + `start_period` closes that. The motivating project does not hit this — + its rabbitmq is not gated.) +- Exit 0 → healthy → gate satisfied. Non-zero → keep polling until deadline. + +## 3. Scope of change + +- `runtime/podman/podman.go`: add `PreferSelfProbedHealth() bool`. +- `compose/orchestrator.go`: detect opt-in; omit `HealthCheck` from the + RunSpec when opted in; in `waitFor`, self-probe `service_healthy` gates + via Exec using the service's healthcheck spec. +- Tests: unit (test→exec translation; self-probe gate satisfied/timeout via + a fake runtime). Validated manually end-to-end against a real + multi-service compose devcontainer on Podman (OrbStack) — rabbitmq boots + clean; a Podman integration test is follow-up work. + +No change to the Docker or Apple paths. + +## 3.1 Two coexisting health-check paths (the divergence) + +This change introduces a **second** health mechanism rather than replacing +the first, so the orchestrator now has two, selected per-backend by +`selfProbe`: + +| | Native path (Docker, Apple) | Self-probe path (Podman) | +|---|---|---| +| Healthcheck on container | `RunSpec.HealthCheck` set; runtime runs it | omitted | +| Who probes | the runtime, as the container's user | the orchestrator, via `ExecContainer` | +| `service_healthy` gate reads | `InspectContainer().Health` | exit code of the exec'd `test` | +| Selected when | `selfProbe == false` | `selfProbe == true` | + +**Behavioral divergence to be aware of**, beyond two code paths to +maintain: on the native path *every* service with a `healthcheck:` gets a +persistent health status surfaced by the runtime (`podman ps` / `docker ps` +HEALTH column). On the self-probe path the native healthcheck is omitted +entirely, so (a) only services actually gated by a `service_healthy` +dependency are ever probed, and (b) the probe result is transient — used to +satisfy the gate, never surfaced as the container's reported health. For +the motivating project this is invisible (nothing consumes rabbitmq's +health; a `service → postgres` `service_healthy` gate still works via the +self-probe), but the HEALTH column differs +between backends. + +This fork is deliberate — Podman's eager-root healthcheck forced it — not +accidental. Convergence options, to revisit when the `compose-native` +rollout flips the default backend (`design/compose-native.md` §10): +1. Unify on self-probe for all backends — one path, Docker parity, but lose + the runtime's native HEALTH surface. +2. Keep the fork but also set a native healthcheck for non-gated services on + the self-probe path where safe — narrows the divergence, more complex. +3. Leave as-is, documented (current choice). + +## 4. Out of scope — rootless Podman + gosu + +A *distinct* failure exists for the same image class under **rootless** +Podman: `gosu`'s setuid/setgid privilege drop is denied by the user +namespace (containers/podman#6816), so `rabbitmq-server` can't drop to uid +999 at all. That is a different mechanism from the healthcheck race fixed +here and this change does **not** address it. Our validation is on +**rootful** Podman (the C/R target — root socket), where gosu works; if a +rootless deployment is ever targeted, rabbitmq needs a separate remedy +(e.g. `user:` pinning + uid-mapped volumes). Confirmed via #20893 that +`--health-start-period` is doc-only and never defers the first probe, so it +is not a viable alternative. diff --git a/runtime/podman/checkpoint.go b/runtime/podman/checkpoint.go index 63120d2..ecbb638 100644 --- a/runtime/podman/checkpoint.go +++ b/runtime/podman/checkpoint.go @@ -75,6 +75,9 @@ func (r *Runtime) Restore(ctx context.Context, spec runtime.RestoreSpec) (*runti q := url.Values{} q.Set("import", "true") q.Set("tcpestablished", strconv.FormatBool(spec.TCPEstablished)) + if spec.IgnoreVolumes { + q.Set("ignoreVolumes", "true") + } if spec.Name != "" { q.Set("name", spec.Name) } diff --git a/runtime/podman/podman.go b/runtime/podman/podman.go index 004b7d1..c6d9433 100644 --- a/runtime/podman/podman.go +++ b/runtime/podman/podman.go @@ -119,3 +119,15 @@ func (r *Runtime) Capabilities() runtime.Capabilities { Checkpoint: r.checkpointOK, } } + +// PreferSelfProbedHealth tells the compose orchestrator to run healthcheck +// probes itself (via Exec) rather than configuring Podman's native +// HEALTHCHECK. Podman runs the native healthcheck as root and fires the +// first probe immediately at container start — before the main process +// initializes — which breaks privilege-dropping images: e.g. rabbitmq's +// `rabbitmq-diagnostics` probe, run as root, creates a root-owned +// /var/lib/rabbitmq/.erlang.cookie that the gosu-dropped (uid 999) server +// then cannot read (eacces). Letting the orchestrator probe defers the +// first check until after the service is up, matching Docker's behavior. +// See design/compose-native-health.md. +func (r *Runtime) PreferSelfProbedHealth() bool { return true } diff --git a/runtime/runtime.go b/runtime/runtime.go index 0e47673..7a69fdf 100644 --- a/runtime/runtime.go +++ b/runtime/runtime.go @@ -148,6 +148,14 @@ type RestoreSpec struct { // TCPEstablished must match the checkpoint when the archive captured // established connections. TCPEstablished bool + + // IgnoreVolumes asks the backend to skip restoring volume content + // from the archive, reusing whatever volume already exists. For + // Podman this maps to restore's ignore-volumes; cross-node restore + // leaves it false (content must come from the archive), same-node + // restore-in-place sets it true to avoid a "volume already exists" + // collision. + IgnoreVolumes bool } // CheckpointRef describes a written checkpoint archive.