From 84834abfeee26d93c25fe67a81be56296225a69c Mon Sep 17 00:00:00 2001 From: John Myers <9696606+johntmyers@users.noreply.github.com> Date: Mon, 13 Apr 2026 09:45:58 -0700 Subject: [PATCH 1/3] fix(sandbox): harden seccomp denylist, SSRF protection, and inference policy enforcement - Remove seccomp skip in NetworkMode::Allow so baseline syscall restrictions apply regardless of network mode - Block cross-process manipulation syscalls (process_vm_writev, pidfd_open, pidfd_getfd, pidfd_send_signal) symmetric with existing ptrace and process_vm_readv blocks - Block clone/clone3 with CLONE_NEWUSER flag, new mount API syscalls (fsopen, fsconfig, fsmount, fspick, move_mount, open_tree), and namespace manipulation (setns, umount2, pivot_root) - Block userfaultfd and perf_event_open consistent with Docker default seccomp profile - Deny and close keep-alive inference connections after a non-inference request instead of silently continuing the loop - Add CGNAT (100.64.0.0/10), benchmarking (198.18.0.0/15), and other special-use IP ranges to SSRF protection in both proxy and mechanistic mapper --- crates/openshell-core/src/net.rs | 79 ++++- .../src/mechanistic_mapper.rs | 21 ++ crates/openshell-sandbox/src/proxy.rs | 151 ++++++++- .../src/sandbox/linux/seccomp.rs | 314 +++++++++++++++--- 4 files changed, 491 insertions(+), 74 deletions(-) diff --git a/crates/openshell-core/src/net.rs b/crates/openshell-core/src/net.rs index 1d6f55dfb..3cd00d7b8 100644 --- a/crates/openshell-core/src/net.rs +++ b/crates/openshell-core/src/net.rs @@ -113,16 +113,15 @@ pub fn is_always_blocked_net(net: ipnet::IpNet) -> bool { /// or unspecified). /// /// This is a broader check than [`is_always_blocked_ip`] — it includes RFC 1918 -/// private ranges (`10/8`, `172.16/12`, `192.168/16`) and IPv6 ULA (`fc00::/7`) -/// which are allowable via `allowed_ips` but blocked by default without one. +/// private ranges (`10/8`, `172.16/12`, `192.168/16`), CGNAT (`100.64.0.0/10`, +/// RFC 6598), other special-use ranges, and IPv6 ULA (`fc00::/7`) which are +/// allowable via `allowed_ips` but blocked by default without one. /// /// Used by the proxy's default SSRF path and the mechanistic mapper to detect /// when `allowed_ips` should be populated in proposals. pub fn is_internal_ip(ip: IpAddr) -> bool { match ip { - IpAddr::V4(v4) => { - v4.is_loopback() || v4.is_private() || v4.is_link_local() || v4.is_unspecified() - } + IpAddr::V4(v4) => is_internal_v4(&v4), IpAddr::V6(v6) => { if v6.is_loopback() || v6.is_unspecified() { return true; @@ -137,16 +136,44 @@ pub fn is_internal_ip(ip: IpAddr) -> bool { } // Check IPv4-mapped IPv6 (::ffff:x.x.x.x) if let Some(v4) = v6.to_ipv4_mapped() { - return v4.is_loopback() - || v4.is_private() - || v4.is_link_local() - || v4.is_unspecified(); + return is_internal_v4(&v4); } false } } } +/// IPv4 internal address check covering RFC 1918, CGNAT (RFC 6598), and other +/// special-use ranges that should never be reachable from sandbox egress. +fn is_internal_v4(v4: &Ipv4Addr) -> bool { + if v4.is_loopback() || v4.is_private() || v4.is_link_local() || v4.is_unspecified() { + return true; + } + let octets = v4.octets(); + // 100.64.0.0/10 — CGNAT / shared address space (RFC 6598). Commonly used by + // cloud VPC peering, Tailscale, and similar overlay networks. + if octets[0] == 100 && (octets[1] & 0xC0) == 64 { + return true; + } + // 192.0.0.0/24 — IETF protocol assignments (RFC 6890) + if octets[0] == 192 && octets[1] == 0 && octets[2] == 0 { + return true; + } + // 198.18.0.0/15 — benchmarking (RFC 2544) + if octets[0] == 198 && (octets[1] & 0xFE) == 18 { + return true; + } + // 198.51.100.0/24 — TEST-NET-2 (RFC 5737) + if octets[0] == 198 && octets[1] == 51 && octets[2] == 100 { + return true; + } + // 203.0.113.0/24 — TEST-NET-3 (RFC 5737) + if octets[0] == 203 && octets[1] == 0 && octets[2] == 113 { + return true; + } + false +} + #[cfg(test)] mod tests { use super::*; @@ -358,4 +385,38 @@ mod tests { let v6_public = Ipv4Addr::new(8, 8, 8, 8).to_ipv6_mapped(); assert!(!is_internal_ip(IpAddr::V6(v6_public))); } + + #[test] + fn test_internal_ip_cgnat() { + // 100.64.0.0/10 — CGNAT / shared address space (RFC 6598) + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 64, 0, 1)))); + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 100, 50, 3)))); + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new( + 100, 127, 255, 255 + )))); + // Just outside the /10 boundary + assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 128, 0, 1)))); + assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new( + 100, 63, 255, 255 + )))); + } + + #[test] + fn test_internal_ip_special_use_ranges() { + // 192.0.0.0/24 — IETF protocol assignments + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(192, 0, 0, 1)))); + // 198.18.0.0/15 — benchmarking + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 18, 0, 1)))); + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 19, 255, 255)))); + // 198.51.100.0/24 — TEST-NET-2 + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 51, 100, 1)))); + // 203.0.113.0/24 — TEST-NET-3 + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(203, 0, 113, 1)))); + } + + #[test] + fn test_internal_ip_ipv6_mapped_cgnat() { + let v6 = Ipv4Addr::new(100, 64, 0, 1).to_ipv6_mapped(); + assert!(is_internal_ip(IpAddr::V6(v6))); + } } diff --git a/crates/openshell-sandbox/src/mechanistic_mapper.rs b/crates/openshell-sandbox/src/mechanistic_mapper.rs index 7ca7538ab..3cfdf3f54 100644 --- a/crates/openshell-sandbox/src/mechanistic_mapper.rs +++ b/crates/openshell-sandbox/src/mechanistic_mapper.rs @@ -690,6 +690,27 @@ mod tests { assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new(1, 1, 1, 1)))); } + #[test] + fn test_is_internal_ip_cgnat() { + use std::net::Ipv4Addr; + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 64, 0, 1)))); + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 100, 50, 3)))); + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new( + 100, 127, 255, 255 + )))); + // Just outside the /10 boundary + assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 128, 0, 1)))); + } + + #[test] + fn test_is_internal_ip_special_use() { + use std::net::Ipv4Addr; + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(192, 0, 0, 1)))); + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 18, 0, 1)))); + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 51, 100, 1)))); + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(203, 0, 113, 1)))); + } + #[test] fn test_is_internal_ip_v6() { use std::net::Ipv6Addr; diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index c055d9e6d..c32f5d644 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -51,6 +51,7 @@ struct ConnectDecision { /// /// Returned by [`handle_inference_interception`] so the call site can emit /// a structured CONNECT deny log when the connection is not successfully routed. +#[derive(Debug)] enum InferenceOutcome { /// At least one request was successfully routed to a local inference backend. Routed, @@ -1009,8 +1010,6 @@ async fn handle_inference_interception( tls_state: Option<&Arc>, inference_ctx: Option<&Arc>, ) -> Result { - use crate::l7::inference::{ParseResult, format_http_response, try_parse_http_request}; - let Some(ctx) = inference_ctx else { return Ok(InferenceOutcome::Denied { reason: "cluster inference context not configured".to_string(), @@ -1033,15 +1032,28 @@ async fn handle_inference_interception( } }; - // Read and process HTTP requests from the tunnel. - // Track whether any request was successfully routed so that a late denial - // on a keep-alive connection still counts as "routed". + process_inference_keepalive(&mut tls_client, ctx, port).await +} + +/// Read and process HTTP requests from a TLS-terminated inference connection. +/// +/// Each request is matched against inference patterns and routed locally. +/// Any non-inference request is immediately denied and the connection is closed, +/// even if previous requests on the same keep-alive connection were routed +/// successfully. +async fn process_inference_keepalive( + stream: &mut S, + ctx: &InferenceContext, + port: u16, +) -> Result { + use crate::l7::inference::{ParseResult, format_http_response, try_parse_http_request}; + let mut buf = vec![0u8; INITIAL_INFERENCE_BUF]; let mut used = 0usize; let mut routed_any = false; loop { - let n = match tls_client.read(&mut buf[used..]).await { + let n = match stream.read(&mut buf[used..]).await { Ok(n) => n, Err(e) => { if routed_any { @@ -1065,10 +1077,13 @@ async fn handle_inference_interception( // Try to parse a complete HTTP request match try_parse_http_request(&buf[..used]) { ParseResult::Complete(request, consumed) => { - let was_routed = route_inference_request(&request, ctx, &mut tls_client).await?; + let was_routed = route_inference_request(&request, ctx, stream).await?; if was_routed { routed_any = true; - } else if !routed_any { + } else { + // Deny and close: a non-inference request must not be silently + // ignored on a keep-alive connection that previously routed + // inference traffic. return Ok(InferenceOutcome::Denied { reason: "connection not allowed by policy".to_string(), }); @@ -1083,7 +1098,7 @@ async fn handle_inference_interception( if used == buf.len() { if buf.len() >= MAX_INFERENCE_BUF { let response = format_http_response(413, &[], b"Payload Too Large"); - write_all(&mut tls_client, &response).await?; + write_all(stream, &response).await?; if routed_any { break; } @@ -1109,7 +1124,7 @@ async fn handle_inference_interception( ocsf_emit!(event); } let response = format_http_response(400, &[], b"Bad Request"); - write_all(&mut tls_client, &response).await?; + write_all(stream, &response).await?; return Ok(InferenceOutcome::Denied { reason }); } } @@ -2467,6 +2482,41 @@ mod tests { assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::UNSPECIFIED))); } + #[test] + fn test_rejects_ipv4_cgnat() { + // 100.64.0.0/10 — CGNAT / shared address space (RFC 6598) + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 64, 0, 1)))); + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 100, 50, 3)))); + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new( + 100, 127, 255, 255 + )))); + // Just outside the /10 boundary + assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 128, 0, 1)))); + assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new( + 100, 63, 255, 255 + )))); + } + + #[test] + fn test_rejects_ipv4_special_use_ranges() { + // 192.0.0.0/24 — IETF protocol assignments + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(192, 0, 0, 1)))); + // 198.18.0.0/15 — benchmarking + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 18, 0, 1)))); + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 19, 255, 255)))); + // 198.51.100.0/24 — TEST-NET-2 + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 51, 100, 1)))); + // 203.0.113.0/24 — TEST-NET-3 + assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(203, 0, 113, 1)))); + } + + #[test] + fn test_rejects_ipv6_mapped_cgnat() { + // ::ffff:100.64.0.1 should be caught via IPv4-mapped unwrapping + let v6 = Ipv4Addr::new(100, 64, 0, 1).to_ipv6_mapped(); + assert!(is_internal_ip(IpAddr::V6(v6))); + } + #[test] fn test_allows_ipv4_public() { assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8)))); @@ -3355,4 +3405,85 @@ mod tests { let result = implicit_allowed_ips_for_ip_host("*.example.com"); assert!(result.is_empty()); } + + /// Regression test: exercises the actual keep-alive interception loop to + /// verify that a non-inference request is denied even after a previous + /// inference request was successfully routed on the same connection. + /// + /// Before the fix, `handle_inference_interception` used + /// `else if !routed_any` which silently dropped denials once `routed_any` + /// was true, allowing non-inference HTTP requests to piggyback on a + /// keep-alive connection that had previously handled inference traffic. + /// Regression test: exercises the actual keep-alive interception loop to + /// verify that a non-inference request is denied even after a previous + /// inference request was successfully routed on the same connection. + /// + /// The server runs in a spawned task with empty routes (the inference + /// request gets a 503 "not configured" but is still recognized as + /// inference and returns Ok(true)). The client sends the inference + /// request, reads the 503 response, then sends a non-inference request + /// on the same connection. The server must return Denied. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_keepalive_denies_non_inference_after_routed() { + use openshell_router::Router; + use tokio::io::{AsyncReadExt, AsyncWriteExt}; + + let router = Router::new().unwrap(); + let patterns = crate::l7::inference::default_patterns(); + // Empty routes: inference request gets 503 but returns Ok(true). + let ctx = InferenceContext::new(patterns, router, vec![], vec![]); + + let body = r#"{"model":"test","messages":[{"role":"user","content":"hi"}]}"#; + let inference_req = format!( + "POST /v1/chat/completions HTTP/1.1\r\n\ + Host: inference.local\r\n\ + Content-Type: application/json\r\n\ + Content-Length: {}\r\n\r\n{}", + body.len(), + body, + ); + let non_inference_req = "GET /admin/config HTTP/1.1\r\nHost: inference.local\r\n\r\n"; + + let (client, mut server) = tokio::io::duplex(65536); + let (mut client_read, mut client_write) = tokio::io::split(client); + + // Spawn the server task so it runs concurrently. + let server_task = + tokio::spawn(async move { process_inference_keepalive(&mut server, &ctx, 443).await }); + + // Client: send inference request, read response, send non-inference. + client_write + .write_all(inference_req.as_bytes()) + .await + .unwrap(); + + // Read the 503 response so the server loops back to read. + let mut buf = vec![0u8; 4096]; + let _ = client_read.read(&mut buf).await.unwrap(); + + // Send non-inference request on the same keep-alive connection. + client_write + .write_all(non_inference_req.as_bytes()) + .await + .unwrap(); + drop(client_write); + + // Drain remaining response bytes. + tokio::spawn(async move { + let mut buf = vec![0u8; 4096]; + loop { + match client_read.read(&mut buf).await { + Ok(0) | Err(_) => break, + Ok(_) => continue, + } + } + }); + + let outcome = server_task.await.unwrap().unwrap(); + + assert!( + matches!(outcome, InferenceOutcome::Denied { .. }), + "expected Denied after non-inference request on keep-alive, got: {outcome:?}" + ); + } } diff --git a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs index e23447498..b6d5705b2 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs @@ -26,11 +26,7 @@ use tracing::debug; const SECCOMP_SET_MODE_FILTER: u64 = 1; pub fn apply(policy: &SandboxPolicy) -> Result<()> { - if matches!(policy.network.mode, NetworkMode::Allow) { - return Ok(()); - } - - let allow_inet = matches!(policy.network.mode, NetworkMode::Proxy); + let allow_inet = matches!(policy.network.mode, NetworkMode::Proxy | NetworkMode::Allow); let filter = build_filter(allow_inet)?; // Required before applying seccomp filters. @@ -43,10 +39,70 @@ pub fn apply(policy: &SandboxPolicy) -> Result<()> { } apply_filter(&filter).into_diagnostic()?; + + // Apply a separate filter for clone3 that returns ENOSYS instead of EPERM. + // seccomp BPF cannot dereference the `struct clone_args *` pointer that + // clone3 takes as arg 0, so we cannot selectively block CLONE_NEWUSER. + // Block clone3 unconditionally with ENOSYS: glibc's clone3 wrapper falls + // back to clone on ENOSYS (where we CAN filter flags), but treats EPERM + // as a hard failure. Using a separate filter with a different match_action + // gives us per-syscall errno control that SeccompFilter's global + // match_action cannot provide. + let clone3_filter = build_clone3_filter()?; + apply_filter(&clone3_filter).into_diagnostic()?; + Ok(()) } fn build_filter(allow_inet: bool) -> Result { + let rules = build_filter_rules(allow_inet)?; + + let arch = std::env::consts::ARCH + .try_into() + .map_err(|_| miette::miette!("Unsupported architecture for seccomp"))?; + + let filter = SeccompFilter::new( + rules, + SeccompAction::Allow, + SeccompAction::Errno(libc::EPERM as u32), + arch, + ) + .into_diagnostic()?; + + filter.try_into().into_diagnostic() +} + +/// Build a minimal BPF filter that blocks clone3 with ENOSYS. +/// +/// This is a separate filter from the main one because seccomp BPF cannot +/// dereference the `struct clone_args *` pointer that clone3 takes as arg 0, +/// so we cannot selectively block CLONE_NEWUSER. We block clone3 +/// unconditionally with ENOSYS so glibc falls back to the older clone +/// syscall (where flags are a direct register argument and CAN be filtered). +/// +/// glibc's clone3 wrapper checks for ENOSYS specifically — EPERM would be +/// treated as a hard failure and propagated to the caller instead of +/// triggering the clone fallback. +fn build_clone3_filter() -> Result { + let mut rules: BTreeMap> = BTreeMap::new(); + rules.entry(libc::SYS_clone3).or_default(); + + let arch = std::env::consts::ARCH + .try_into() + .map_err(|_| miette::miette!("Unsupported architecture for seccomp"))?; + + let filter = SeccompFilter::new( + rules, + SeccompAction::Allow, + SeccompAction::Errno(libc::ENOSYS as u32), + arch, + ) + .into_diagnostic()?; + + filter.try_into().into_diagnostic() +} + +fn build_filter_rules(allow_inet: bool) -> Result>> { let mut rules: BTreeMap> = BTreeMap::new(); // --- Socket domain blocks --- @@ -73,10 +129,34 @@ fn build_filter(allow_inet: bool) -> Result { rules.entry(libc::SYS_bpf).or_default(); // Cross-process memory read. rules.entry(libc::SYS_process_vm_readv).or_default(); + // Cross-process memory write (symmetric with process_vm_readv). + rules.entry(libc::SYS_process_vm_writev).or_default(); + // Process handle acquisition, fd theft, and signalling via pidfd. + rules.entry(libc::SYS_pidfd_open).or_default(); + rules.entry(libc::SYS_pidfd_getfd).or_default(); + rules.entry(libc::SYS_pidfd_send_signal).or_default(); // Async I/O subsystem with extensive CVE history. rules.entry(libc::SYS_io_uring_setup).or_default(); // Filesystem mount could subvert Landlock or overlay writable paths. rules.entry(libc::SYS_mount).or_default(); + // New mount API syscalls (Linux 5.2+) bypass the SYS_mount block entirely. + rules.entry(libc::SYS_fsopen).or_default(); + rules.entry(libc::SYS_fsconfig).or_default(); + rules.entry(libc::SYS_fsmount).or_default(); + rules.entry(libc::SYS_fspick).or_default(); + rules.entry(libc::SYS_move_mount).or_default(); + rules.entry(libc::SYS_open_tree).or_default(); + // Namespace manipulation — setns enters existing namespaces, pivot_root/umount2 + // change the filesystem root. The supervisor calls setns before seccomp is applied, + // so blocking it here is safe. + rules.entry(libc::SYS_setns).or_default(); + rules.entry(libc::SYS_umount2).or_default(); + rules.entry(libc::SYS_pivot_root).or_default(); + // Kernel exploit primitives: userfaultfd enables race-condition exploitation (multiple + // CVEs), perf_event_open enables Spectre-class side channels. Both blocked by Docker's + // default seccomp profile. + rules.entry(libc::SYS_userfaultfd).or_default(); + rules.entry(libc::SYS_perf_event_open).or_default(); // --- Conditional syscall blocks --- @@ -96,6 +176,15 @@ fn build_filter(allow_inet: bool) -> Result { libc::CLONE_NEWUSER as u64, )?; + // clone with CLONE_NEWUSER achieves the same as unshare via a different syscall. + add_masked_arg_rule( + &mut rules, + libc::SYS_clone, + 0, // flags argument + libc::CLONE_NEWUSER as u64, + )?; + // clone3 is handled by a separate filter — see build_clone3_filter(). + // seccomp(SECCOMP_SET_MODE_FILTER) would let sandboxed code replace the active filter. let condition = SeccompCondition::new( 0, // operation argument @@ -107,19 +196,7 @@ fn build_filter(allow_inet: bool) -> Result { let rule = SeccompRule::new(vec![condition]).into_diagnostic()?; rules.entry(libc::SYS_seccomp).or_default().push(rule); - let arch = std::env::consts::ARCH - .try_into() - .map_err(|_| miette::miette!("Unsupported architecture for seccomp"))?; - - let filter = SeccompFilter::new( - rules, - SeccompAction::Allow, - SeccompAction::Errno(libc::EPERM as u32), - arch, - ) - .into_diagnostic()?; - - filter.try_into().into_diagnostic() + Ok(rules) } #[allow(clippy::cast_sign_loss)] @@ -159,6 +236,10 @@ fn add_masked_arg_rule( mod tests { use super::*; + // These tests verify filter construction (rule map shape and BPF compilation). + // Behavioral verification (syscalls returning EPERM inside a sandbox) requires + // a running Linux sandbox and is covered by the e2e test suite. + #[test] fn build_filter_proxy_mode_compiles() { let filter = build_filter(true); @@ -189,31 +270,41 @@ mod tests { #[test] fn unconditional_blocks_present_in_filter() { - let mut rules: BTreeMap> = BTreeMap::new(); - - // Simulate what build_filter does for unconditional blocks - rules.entry(libc::SYS_memfd_create).or_default(); - rules.entry(libc::SYS_ptrace).or_default(); - rules.entry(libc::SYS_bpf).or_default(); - rules.entry(libc::SYS_process_vm_readv).or_default(); - rules.entry(libc::SYS_io_uring_setup).or_default(); - rules.entry(libc::SYS_mount).or_default(); + // Build a real filter and verify all unconditional blocks are present. + let filter_rules = build_filter_rules(true).unwrap(); - // Unconditional blocks have an empty Vec (no conditions = always match) - for syscall in [ + // Unconditional blocks have an empty Vec (no conditions = always match). + let expected = [ libc::SYS_memfd_create, libc::SYS_ptrace, libc::SYS_bpf, libc::SYS_process_vm_readv, + libc::SYS_process_vm_writev, + libc::SYS_pidfd_open, + libc::SYS_pidfd_getfd, + libc::SYS_pidfd_send_signal, libc::SYS_io_uring_setup, libc::SYS_mount, - ] { + libc::SYS_fsopen, + libc::SYS_fsconfig, + libc::SYS_fsmount, + libc::SYS_fspick, + libc::SYS_move_mount, + libc::SYS_open_tree, + libc::SYS_setns, + libc::SYS_umount2, + libc::SYS_pivot_root, + libc::SYS_userfaultfd, + libc::SYS_perf_event_open, + ]; + + for syscall in expected { assert!( - rules.contains_key(&syscall), + filter_rules.contains_key(&syscall), "syscall {syscall} should be in the rules map" ); assert!( - rules[&syscall].is_empty(), + filter_rules[&syscall].is_empty(), "syscall {syscall} should have empty rules (unconditional block)" ); } @@ -222,37 +313,150 @@ mod tests { #[test] fn conditional_blocks_have_rules() { // Build a real filter and verify the conditional syscalls have rule entries - // (non-empty Vec means conditional match) - let mut rules: BTreeMap> = BTreeMap::new(); + // (non-empty Vec means conditional match). + let filter_rules = build_filter_rules(true).unwrap(); - add_masked_arg_rule( - &mut rules, + for syscall in [ libc::SYS_execveat, - 4, - libc::AT_EMPTY_PATH as u64, - ) - .unwrap(); - add_masked_arg_rule(&mut rules, libc::SYS_unshare, 0, libc::CLONE_NEWUSER as u64).unwrap(); - - let condition = SeccompCondition::new( - 0, - SeccompCmpArgLen::Dword, - SeccompCmpOp::Eq, - SECCOMP_SET_MODE_FILTER, - ) - .unwrap(); - let rule = SeccompRule::new(vec![condition]).unwrap(); - rules.entry(libc::SYS_seccomp).or_default().push(rule); - - for syscall in [libc::SYS_execveat, libc::SYS_unshare, libc::SYS_seccomp] { + libc::SYS_unshare, + libc::SYS_clone, + libc::SYS_seccomp, + ] { assert!( - rules.contains_key(&syscall), + filter_rules.contains_key(&syscall), "syscall {syscall} should be in the rules map" ); assert!( - !rules[&syscall].is_empty(), + !filter_rules[&syscall].is_empty(), "syscall {syscall} should have conditional rules" ); } } + + #[test] + fn clone3_filter_compiles_and_blocks_clone3() { + let bpf = build_clone3_filter(); + assert!(bpf.is_ok(), "clone3 ENOSYS filter should compile"); + } + + #[test] + fn clone3_not_in_main_filter() { + // clone3 must NOT be in the main filter; it has its own ENOSYS filter. + let filter_rules = build_filter_rules(true).unwrap(); + assert!( + !filter_rules.contains_key(&libc::SYS_clone3), + "clone3 should not be in the main filter — it uses a separate ENOSYS filter" + ); + } + + // --- Behavioral tests --- + // + // These apply seccomp filters in a forked child and verify that blocked + // syscalls actually return the expected errno. They only compile and run + // on Linux (seccomp is a Linux kernel feature). + + /// Fork a child, apply the given filter, invoke `syscall_nr`, and return + /// the errno observed by the child. The child exits 0 if the syscall + /// returned the expected errno, 1 otherwise. + unsafe fn assert_blocked_in_child( + filter: &seccompiler::BpfProgram, + syscall_nr: i64, + expected_errno: i32, + ) { + let pid = libc::fork(); + assert!(pid >= 0, "fork failed"); + if pid == 0 { + // Child: apply filter and try the syscall. + libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + apply_filter(filter).expect("apply_filter"); + let ret = libc::syscall(syscall_nr, 0 as libc::c_ulong, 0 as libc::c_ulong); + let errno = *libc::__errno_location(); + if ret == -1 && errno == expected_errno { + libc::_exit(0); + } else { + // Write diagnostic before exiting so test failures are debuggable. + let msg = format!( + "syscall {syscall_nr}: expected errno={expected_errno}, got ret={ret} errno={errno}\n" + ); + libc::write(2, msg.as_ptr().cast(), msg.len()); + libc::_exit(1); + } + } + // Parent: wait for child. + let mut status: libc::c_int = 0; + libc::waitpid(pid, &mut status, 0); + assert!( + libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0, + "child failed: syscall {syscall_nr} was not blocked with errno {expected_errno}" + ); + } + + #[test] + fn behavioral_memfd_create_blocked() { + let filter = build_filter(true).unwrap(); + unsafe { assert_blocked_in_child(&filter, libc::SYS_memfd_create, libc::EPERM) }; + } + + #[test] + fn behavioral_ptrace_blocked() { + let filter = build_filter(true).unwrap(); + unsafe { assert_blocked_in_child(&filter, libc::SYS_ptrace, libc::EPERM) }; + } + + #[test] + fn behavioral_process_vm_writev_blocked() { + let filter = build_filter(true).unwrap(); + unsafe { assert_blocked_in_child(&filter, libc::SYS_process_vm_writev, libc::EPERM) }; + } + + #[test] + fn behavioral_userfaultfd_blocked() { + let filter = build_filter(true).unwrap(); + unsafe { assert_blocked_in_child(&filter, libc::SYS_userfaultfd, libc::EPERM) }; + } + + #[test] + fn behavioral_perf_event_open_blocked() { + let filter = build_filter(true).unwrap(); + unsafe { assert_blocked_in_child(&filter, libc::SYS_perf_event_open, libc::EPERM) }; + } + + #[test] + fn behavioral_setns_blocked() { + let filter = build_filter(true).unwrap(); + unsafe { assert_blocked_in_child(&filter, libc::SYS_setns, libc::EPERM) }; + } + + #[test] + fn behavioral_clone3_returns_enosys() { + // clone3 uses a separate filter that returns ENOSYS (not EPERM) so + // glibc falls back to clone. + let main_filter = build_filter(true).unwrap(); + let clone3_filter = build_clone3_filter().unwrap(); + // Apply in the same order as apply(): main first, clone3 second. + let pid = unsafe { libc::fork() }; + assert!(pid >= 0, "fork failed"); + if pid == 0 { + unsafe { + libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + apply_filter(&main_filter).expect("main filter"); + apply_filter(&clone3_filter).expect("clone3 filter"); + let ret = libc::syscall(libc::SYS_clone3, 0 as libc::c_ulong, 0 as libc::c_ulong); + let errno = *libc::__errno_location(); + if ret == -1 && errno == libc::ENOSYS { + libc::_exit(0); + } else { + let msg = format!("clone3: expected ENOSYS, got ret={ret} errno={errno}\n"); + libc::write(2, msg.as_ptr().cast(), msg.len()); + libc::_exit(1); + } + } + } + let mut status: libc::c_int = 0; + unsafe { libc::waitpid(pid, &mut status, 0) }; + assert!( + unsafe { libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0 }, + "clone3 should be blocked with ENOSYS, not EPERM" + ); + } } From ce39c447217b65463560666ecebf0ff503541854 Mon Sep 17 00:00:00 2001 From: John Myers <9696606+johntmyers@users.noreply.github.com> Date: Mon, 13 Apr 2026 10:58:12 -0700 Subject: [PATCH 2/3] fix(sandbox): install clone3 seccomp filter before main filter --- .../src/sandbox/linux/seccomp.rs | 95 +++++++++++++++---- 1 file changed, 75 insertions(+), 20 deletions(-) diff --git a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs index b6d5705b2..854134cbf 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs @@ -27,7 +27,8 @@ const SECCOMP_SET_MODE_FILTER: u64 = 1; pub fn apply(policy: &SandboxPolicy) -> Result<()> { let allow_inet = matches!(policy.network.mode, NetworkMode::Proxy | NetworkMode::Allow); - let filter = build_filter(allow_inet)?; + let main_filter = build_filter(allow_inet)?; + let clone3_filter = build_clone3_filter()?; // Required before applying seccomp filters. let rc = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) }; @@ -38,18 +39,7 @@ pub fn apply(policy: &SandboxPolicy) -> Result<()> { )); } - apply_filter(&filter).into_diagnostic()?; - - // Apply a separate filter for clone3 that returns ENOSYS instead of EPERM. - // seccomp BPF cannot dereference the `struct clone_args *` pointer that - // clone3 takes as arg 0, so we cannot selectively block CLONE_NEWUSER. - // Block clone3 unconditionally with ENOSYS: glibc's clone3 wrapper falls - // back to clone on ENOSYS (where we CAN filter flags), but treats EPERM - // as a hard failure. Using a separate filter with a different match_action - // gives us per-syscall errno control that SeccompFilter's global - // match_action cannot provide. - let clone3_filter = build_clone3_filter()?; - apply_filter(&clone3_filter).into_diagnostic()?; + apply_runtime_filters(&main_filter, &clone3_filter)?; Ok(()) } @@ -102,6 +92,22 @@ fn build_clone3_filter() -> Result { filter.try_into().into_diagnostic() } +/// Install the sandbox seccomp filters in the required order. +/// +/// Order matters: +/// 1. Install the dedicated clone3 filter first so it can still call +/// `seccomp(SECCOMP_SET_MODE_FILTER)`. +/// 2. Install the main filter second. It blocks further seccomp filter +/// installation with `EPERM`, preserving the original hardening intent. +fn apply_runtime_filters( + main_filter: seccompiler::BpfProgramRef, + clone3_filter: seccompiler::BpfProgramRef, +) -> Result<()> { + apply_filter(clone3_filter).into_diagnostic()?; + apply_filter(main_filter).into_diagnostic()?; + Ok(()) +} + fn build_filter_rules(allow_inet: bool) -> Result>> { let mut rules: BTreeMap> = BTreeMap::new(); @@ -236,9 +242,8 @@ fn add_masked_arg_rule( mod tests { use super::*; - // These tests verify filter construction (rule map shape and BPF compilation). - // Behavioral verification (syscalls returning EPERM inside a sandbox) requires - // a running Linux sandbox and is covered by the e2e test suite. + // These tests cover both filter construction (rule map shape and BPF + // compilation) and selected runtime behavior on Linux via forked children. #[test] fn build_filter_proxy_mode_compiles() { @@ -391,6 +396,18 @@ mod tests { ); } + unsafe fn install_runtime_filters_in_child( + main_filter: &seccompiler::BpfProgram, + clone3_filter: &seccompiler::BpfProgram, + ) { + libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if let Err(err) = apply_runtime_filters(main_filter, clone3_filter) { + let msg = format!("failed to install runtime seccomp filters: {err}\n"); + libc::write(2, msg.as_ptr().cast(), msg.len()); + libc::_exit(1); + } + } + #[test] fn behavioral_memfd_create_blocked() { let filter = build_filter(true).unwrap(); @@ -433,14 +450,12 @@ mod tests { // glibc falls back to clone. let main_filter = build_filter(true).unwrap(); let clone3_filter = build_clone3_filter().unwrap(); - // Apply in the same order as apply(): main first, clone3 second. + // Apply in the same order as apply(): clone3 filter first, main filter second. let pid = unsafe { libc::fork() }; assert!(pid >= 0, "fork failed"); if pid == 0 { unsafe { - libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); - apply_filter(&main_filter).expect("main filter"); - apply_filter(&clone3_filter).expect("clone3 filter"); + install_runtime_filters_in_child(&main_filter, &clone3_filter); let ret = libc::syscall(libc::SYS_clone3, 0 as libc::c_ulong, 0 as libc::c_ulong); let errno = *libc::__errno_location(); if ret == -1 && errno == libc::ENOSYS { @@ -459,4 +474,44 @@ mod tests { "clone3 should be blocked with ENOSYS, not EPERM" ); } + + #[test] + fn behavioral_third_filter_install_blocked_after_startup() { + let main_filter = build_filter(true).unwrap(); + let clone3_filter = build_clone3_filter().unwrap(); + let third_filter = build_clone3_filter().unwrap(); + + let pid = unsafe { libc::fork() }; + assert!(pid >= 0, "fork failed"); + if pid == 0 { + unsafe { + install_runtime_filters_in_child(&main_filter, &clone3_filter); + match apply_filter(&third_filter) { + Err(seccompiler::Error::Seccomp(e)) + if e.raw_os_error() == Some(libc::EPERM) => + { + libc::_exit(0); + } + Err(err) => { + let msg = + format!("third filter install failed with unexpected error: {err}\n"); + libc::write(2, msg.as_ptr().cast(), msg.len()); + libc::_exit(1); + } + Ok(()) => { + let msg = "third filter unexpectedly installed\n"; + libc::write(2, msg.as_ptr().cast(), msg.len()); + libc::_exit(1); + } + } + } + } + + let mut status: libc::c_int = 0; + unsafe { libc::waitpid(pid, &mut status, 0) }; + assert!( + unsafe { libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0 }, + "additional seccomp filter installation should be blocked after startup" + ); + } } From 8b16037285db13431fff0efbc3e9c2b7ebe81e7f Mon Sep 17 00:00:00 2001 From: John Myers <9696606+johntmyers@users.noreply.github.com> Date: Mon, 13 Apr 2026 11:12:03 -0700 Subject: [PATCH 3/3] test(ocsf): fix shorthand firewall engine expectation --- crates/openshell-ocsf/src/format/shorthand.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/openshell-ocsf/src/format/shorthand.rs b/crates/openshell-ocsf/src/format/shorthand.rs index f3f38e122..7e2296de9 100644 --- a/crates/openshell-ocsf/src/format/shorthand.rs +++ b/crates/openshell-ocsf/src/format/shorthand.rs @@ -641,7 +641,7 @@ mod tests { let shorthand = event.format_shorthand(); assert_eq!( shorthand, - "HTTP:GET [INFO] ALLOWED curl(68) -> GET http://172.20.0.1:9876/test [policy:allow_host_9876]" + "HTTP:GET [INFO] ALLOWED curl(68) -> GET http://172.20.0.1:9876/test [policy:allow_host_9876 engine:mechanistic]" ); }