diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index 6389c728e..540cb988f 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -242,6 +242,7 @@ Layers added: 4. Kubernetes manifests: `deploy/kube/manifests/*.yaml` -> `/opt/openshell/manifests/` Bundled manifests include: +- `openshell-namespace.yaml` (declares the `openshell` namespace so it exists as soon as the k3s API is ready, before Helm reconciliation — see [Manifest injection](#manifest-injection)) - `openshell-helmchart.yaml` (OpenShell Helm chart auto-deploy) - `envoy-gateway-helmchart.yaml` (Envoy Gateway for Gateway API) - `agent-sandbox.yaml` @@ -272,6 +273,8 @@ Writes `/etc/rancher/k3s/registries.yaml` from `REGISTRY_HOST`, `REGISTRY_ENDPOI Copies bundled manifests from `/opt/openshell/manifests/` to `/var/lib/rancher/k3s/server/manifests/`. This is needed because the volume mount on `/var/lib/rancher/k3s` overwrites any files baked into that path at image build time. +`openshell-namespace.yaml` is a standalone `kind: Namespace` manifest that k3s applies as soon as its API server is ready — before the Helm controller reconciles `openshell-helmchart.yaml`. `reconcile_pki` in `crates/openshell-bootstrap` waits up to ~115s for the namespace before reading or writing PKI secrets; declaring it here decouples that wait from Helm controller latency on slow networks or cold boots. `createNamespace: true` on the HelmChart is retained as an idempotent fallback — Helm's `--create-namespace` coexists with pre-existing namespaces without error. 
+ ### Image configuration overrides When environment variables are set, the entrypoint modifies the HelmChart manifest at `/var/lib/rancher/k3s/server/manifests/openshell-helmchart.yaml`: diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 71d223d66..deacbe643 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -1214,4 +1214,28 @@ mod tests { ); } } + + #[test] + fn openshell_namespace_manifest_is_present_and_well_formed() { + // Guards `wait_for_namespace("openshell")` against silent regressions in the + // manifest k3s auto-applies to create the namespace before Helm reconciles the + // chart. NOTE(review): substring checks — `name: openshell-x` would also pass. + const MANIFEST: &str = + include_str!("../../../deploy/kube/manifests/openshell-namespace.yaml"); + assert!( + MANIFEST.contains("apiVersion: v1"), + "manifest must target core/v1: +{MANIFEST}" + ); + assert!( + MANIFEST.contains("kind: Namespace"), + "manifest must declare kind: Namespace: +{MANIFEST}" + ); + assert!( + MANIFEST.contains("name: openshell"), + "manifest must name the openshell namespace: +{MANIFEST}" + ); + } } diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index d43046d4f..c6a48c576 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -354,7 +354,7 @@ MANIFEST_DEST="${ROOTFS_DIR}/opt/openshell/manifests" echo "==> Injecting Kubernetes manifests..." 
mkdir -p "${MANIFEST_DEST}" -for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do +for manifest in openshell-namespace.yaml openshell-helmchart.yaml agent-sandbox.yaml; do if [ -f "${MANIFEST_SRC}/${manifest}" ]; then cp "${MANIFEST_SRC}/${manifest}" "${MANIFEST_DEST}/" echo " ${manifest}" diff --git a/deploy/kube/manifests/openshell-namespace.yaml b/deploy/kube/manifests/openshell-namespace.yaml new file mode 100644 index 000000000..c4f82d839 --- /dev/null +++ b/deploy/kube/manifests/openshell-namespace.yaml @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Explicit Namespace manifest for the OpenShell control plane. +# +# k3s auto-applies every YAML in /var/lib/rancher/k3s/server/manifests/ as +# soon as its API server is ready, before the Helm controller reconciles +# the HelmChart CR. Declaring the namespace here guarantees it exists +# within seconds of cluster startup and decouples PKI bootstrap +# (wait_for_namespace in openshell-bootstrap) from Helm controller +# reconciliation latency — which on slow networks or cold boots can +# exceed the 115-second wait budget. +# +# The companion openshell-helmchart.yaml retains `createNamespace: true` +# as an idempotent fallback; pre-existing namespaces coexist with Helm's +# --create-namespace flag without error. + +apiVersion: v1 +kind: Namespace +metadata: + name: openshell diff --git a/e2e/rust/tests/namespace_bootstrap.rs b/e2e/rust/tests/namespace_bootstrap.rs new file mode 100644 index 000000000..5f27719ed --- /dev/null +++ b/e2e/rust/tests/namespace_bootstrap.rs @@ -0,0 +1,96 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e")] + +//! Regression test for the openshell namespace auto-create. +//! +//! 
`reconcile_pki` waits up to ~115s for `namespace/openshell` before the +//! PKI phase can read or write secrets. The namespace is declared by a +//! standalone manifest at `deploy/kube/manifests/openshell-namespace.yaml` +//! that k3s auto-applies before the Helm controller reconciles the +//! openshell chart — without it, on slow networks or cold boots the PKI +//! phase races the Helm controller and `wait_for_namespace` can time out. +//! +//! This test runs against a healthy gateway and asserts the namespace is +//! present in the cluster. Closes NVIDIA/NemoClaw#1974. + +use std::process::{Command, Stdio}; + +use openshell_e2e::harness::output::strip_ansi; + +/// Resolve the gateway name from `OPENSHELL_GATEWAY`, falling back to the +/// CI default of `"openshell"` — same convention as `gateway_resume`. +fn gateway_name() -> String { + std::env::var("OPENSHELL_GATEWAY").unwrap_or_else(|_| "openshell".to_string()) +} + +/// Docker container name for the e2e gateway. +fn container_name() -> String { + format!("openshell-cluster-{}", gateway_name()) +} + +/// Run `kubectl` against the gateway's embedded k3s cluster via `docker exec` +/// and `sh -c`; returns (stdout, stderr, exit-code), or -1 when no code is set. +fn kubectl_in_cluster(args: &str) -> (String, String, i32) { + let cname = container_name(); + let output = Command::new("docker") + .args([ + "exec", + &cname, + "sh", + "-c", + &format!("KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl {args}"), + ]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .expect("spawn docker exec kubectl"); + + ( + String::from_utf8_lossy(&output.stdout).to_string(), + String::from_utf8_lossy(&output.stderr).to_string(), + output.status.code().unwrap_or(-1), + ) +} + +#[tokio::test] +async fn openshell_namespace_exists_after_cluster_start() { + let (stdout, stderr, code) = kubectl_in_cluster("get namespace openshell -o name"); + assert_eq!( + code, 0, + "`kubectl get namespace openshell` must succeed after gateway start. 
\ + stdout=<{}> stderr=<{}>", + strip_ansi(&stdout), + strip_ansi(&stderr), + ); + assert_eq!( + stdout.trim(), + "namespace/openshell", + "unexpected kubectl output: <{}>", + strip_ansi(&stdout), + ); +} + +#[tokio::test] +async fn openshell_namespace_is_active() { + // A Namespace can exist in the `Terminating` phase during cluster + // tear-down — assert we see the healthy `Active` phase, not just + // bare existence. This also rejects an empty-phase response that a + // transient API error could produce. + let (stdout, stderr, code) = + kubectl_in_cluster("get namespace openshell -o jsonpath={.status.phase}"); + assert_eq!( + code, 0, + "jsonpath query for openshell namespace phase must succeed. \ + stdout=<{}> stderr=<{}>", + strip_ansi(&stdout), + strip_ansi(&stderr), + ); + assert_eq!( + stdout.trim(), + "Active", + "openshell namespace must be in Active phase, got: <{}>", + strip_ansi(&stdout), + ); +}