Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions architecture/gateway-single-node.md
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ Layers added:
4. Kubernetes manifests: `deploy/kube/manifests/*.yaml` -> `/opt/openshell/manifests/`

Bundled manifests include:
- `openshell-namespace.yaml` (declares the `openshell` namespace so it exists as soon as the k3s API is ready, before Helm reconciliation — see [Manifest injection](#manifest-injection))
- `openshell-helmchart.yaml` (OpenShell Helm chart auto-deploy)
- `envoy-gateway-helmchart.yaml` (Envoy Gateway for Gateway API)
- `agent-sandbox.yaml`
Expand Down Expand Up @@ -272,6 +273,8 @@ Writes `/etc/rancher/k3s/registries.yaml` from `REGISTRY_HOST`, `REGISTRY_ENDPOI

Copies bundled manifests from `/opt/openshell/manifests/` to `/var/lib/rancher/k3s/server/manifests/`. This is needed because the volume mount on `/var/lib/rancher/k3s` overwrites any files baked into that path at image build time.

`openshell-namespace.yaml` is a standalone `kind: Namespace` manifest that k3s applies as soon as its API server is ready — before the Helm controller reconciles `openshell-helmchart.yaml`. `reconcile_pki` in `crates/openshell-bootstrap` waits up to ~115s for the namespace before reading or writing PKI secrets; declaring it here decouples that wait from Helm controller latency on slow networks or cold boots. `createNamespace: true` on the HelmChart is retained as an idempotent fallback — Helm's `--create-namespace` coexists with pre-existing namespaces without error.

### Image configuration overrides

When environment variables are set, the entrypoint modifies the HelmChart manifest at `/var/lib/rancher/k3s/server/manifests/openshell-helmchart.yaml`:
Expand Down
24 changes: 24 additions & 0 deletions crates/openshell-bootstrap/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1214,4 +1214,28 @@ mod tests {
);
}
}

/// Guards `wait_for_namespace("openshell")` against silent regressions in the
/// auto-applied manifest that k3s uses to create the namespace before Helm
/// reconciles the openshell chart.
///
/// Every expected field is compared against a whole, whitespace-trimmed line
/// instead of being searched for as a substring. A substring search would
/// also accept look-alikes such as `apiVersion: v1beta1`,
/// `name: openshell-system`, or the same text inside a `#` comment.
#[test]
fn openshell_namespace_manifest_is_present_and_well_formed() {
    const MANIFEST: &str =
        include_str!("../../../deploy/kube/manifests/openshell-namespace.yaml");

    // Trimming makes the check independent of YAML nesting depth and of
    // trailing whitespace; `lines()` already drops the line terminators.
    let has_line = |expected: &str| MANIFEST.lines().any(|line| line.trim() == expected);

    assert!(
        has_line("apiVersion: v1"),
        "manifest must target core/v1:\n{MANIFEST}"
    );
    assert!(
        has_line("kind: Namespace"),
        "manifest must declare kind: Namespace:\n{MANIFEST}"
    );
    assert!(
        has_line("name: openshell"),
        "manifest must name the openshell namespace:\n{MANIFEST}"
    );
}
}
2 changes: 1 addition & 1 deletion crates/openshell-vm/scripts/build-rootfs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ MANIFEST_DEST="${ROOTFS_DIR}/opt/openshell/manifests"
echo "==> Injecting Kubernetes manifests..."
mkdir -p "${MANIFEST_DEST}"

for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do
for manifest in openshell-namespace.yaml openshell-helmchart.yaml agent-sandbox.yaml; do
if [ -f "${MANIFEST_SRC}/${manifest}" ]; then
cp "${MANIFEST_SRC}/${manifest}" "${MANIFEST_DEST}/"
echo " ${manifest}"
Expand Down
21 changes: 21 additions & 0 deletions deploy/kube/manifests/openshell-namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Explicit Namespace manifest for the OpenShell control plane.
#
# k3s auto-applies every YAML in /var/lib/rancher/k3s/server/manifests/ as
# soon as its API server is ready, before the Helm controller reconciles
# the HelmChart CR. Declaring the namespace here guarantees it exists
# within seconds of cluster startup and decouples PKI bootstrap
# (wait_for_namespace in openshell-bootstrap) from Helm controller
# reconciliation latency — which on slow networks or cold boots can
# exceed the 115-second wait budget.
#
# The companion openshell-helmchart.yaml retains `createNamespace: true`
# as an idempotent fallback; pre-existing namespaces coexist with Helm's
# --create-namespace flag without error.

# Core/v1 Namespace object; `name` must be nested under `metadata` for the
# manifest to be a valid Namespace.
apiVersion: v1
kind: Namespace
metadata:
  name: openshell
96 changes: 96 additions & 0 deletions e2e/rust/tests/namespace_bootstrap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#![cfg(feature = "e2e")]

//! Regression test for the openshell namespace auto-create.
//!
//! `reconcile_pki` waits up to ~115s for `namespace/openshell` before the
//! PKI phase can read or write secrets. The namespace is declared by a
//! standalone manifest at `deploy/kube/manifests/openshell-namespace.yaml`
//! that k3s auto-applies before the Helm controller reconciles the
//! openshell chart — without it, slow networks or cold boots race the
//! Helm controller and `wait_for_namespace` times out.
//!
//! This test runs against a healthy gateway and asserts the namespace is
//! present in the cluster. Closes NVIDIA/NemoClaw#1974.

use std::process::{Command, Stdio};

use openshell_e2e::harness::output::strip_ansi;

/// Name of the gateway under test.
///
/// Taken from the `OPENSHELL_GATEWAY` environment variable; when that
/// variable cannot be read, the CI default `"openshell"` is used instead
/// (the same convention as `gateway_resume`).
fn gateway_name() -> String {
    match std::env::var("OPENSHELL_GATEWAY") {
        Ok(configured) => configured,
        Err(_) => String::from("openshell"),
    }
}

/// Name of the Docker container that hosts the e2e gateway:
/// `openshell-cluster-` followed by the gateway name.
fn container_name() -> String {
    let gateway = gateway_name();
    format!("openshell-cluster-{gateway}")
}

/// Execute `kubectl <args>` inside the gateway container through
/// `docker exec`.
///
/// The command line is handed to `sh -c` with `KUBECONFIG` pointing at
/// `/etc/rancher/k3s/k3s.yaml`. Returns `(stdout, stderr, exit_code)`; both
/// streams are decoded lossily as UTF-8 and the exit code is `-1` when the
/// process reports none.
///
/// # Panics
///
/// Panics when the `docker` process cannot be spawned.
fn kubectl_in_cluster(args: &str) -> (String, String, i32) {
    let container = container_name();
    let script = format!("KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl {args}");

    let mut docker = Command::new("docker");
    docker
        .arg("exec")
        .arg(&container)
        .arg("sh")
        .arg("-c")
        .arg(&script)
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());

    let finished = docker.output().expect("spawn docker exec kubectl");

    let exit_code = finished.status.code().unwrap_or(-1);
    let captured_out = String::from_utf8_lossy(&finished.stdout).into_owned();
    let captured_err = String::from_utf8_lossy(&finished.stderr).into_owned();
    (captured_out, captured_err, exit_code)
}

/// The `openshell` namespace must be retrievable by name from the gateway's
/// cluster, and `kubectl` must report it as `namespace/openshell`.
#[tokio::test]
async fn openshell_namespace_exists_after_cluster_start() {
    let (out, err, exit_code) = kubectl_in_cluster("get namespace openshell -o name");

    // Check the exit status first: a failed lookup leaves nothing useful on
    // stdout, so the stderr dump is what explains the failure.
    assert_eq!(
        exit_code, 0,
        "`kubectl get namespace openshell` must succeed after gateway start. \
         stdout=<{}> stderr=<{}>",
        strip_ansi(&out),
        strip_ansi(&err),
    );

    let reported = out.trim();
    assert_eq!(
        reported,
        "namespace/openshell",
        "unexpected kubectl output: <{}>",
        strip_ansi(&out),
    );
}

/// The `openshell` namespace must report the `Active` phase.
///
/// Bare existence is not sufficient: a namespace can still be listed while
/// it is in the `Terminating` phase during cluster tear-down. Requiring the
/// exact string `Active` also rejects an empty phase, which a transient API
/// error could produce.
#[tokio::test]
async fn openshell_namespace_is_active() {
    let query = "get namespace openshell -o jsonpath={.status.phase}";
    let (out, err, exit_code) = kubectl_in_cluster(query);

    assert_eq!(
        exit_code, 0,
        "jsonpath query for openshell namespace phase must succeed. \
         stdout=<{}> stderr=<{}>",
        strip_ansi(&out),
        strip_ansi(&err),
    );

    let phase = out.trim();
    assert_eq!(
        phase,
        "Active",
        "openshell namespace must be in Active phase, got: <{}>",
        strip_ansi(&out),
    );
}
Loading