From 84834abfeee26d93c25fe67a81be56296225a69c Mon Sep 17 00:00:00 2001
From: John Myers <9696606+johntmyers@users.noreply.github.com>
Date: Mon, 13 Apr 2026 09:45:58 -0700
Subject: [PATCH 1/3] fix(sandbox): harden seccomp denylist, SSRF protection,
 and inference policy enforcement

- Remove seccomp skip in NetworkMode::Allow so baseline syscall
  restrictions apply regardless of network mode
- Block cross-process manipulation syscalls (process_vm_writev,
  pidfd_open, pidfd_getfd, pidfd_send_signal) symmetric with existing
  ptrace and process_vm_readv blocks
- Block clone/clone3 with CLONE_NEWUSER flag, new mount API syscalls
  (fsopen, fsconfig, fsmount, fspick, move_mount, open_tree), and
  namespace manipulation (setns, umount2, pivot_root)
- Block userfaultfd and perf_event_open consistent with Docker default
  seccomp profile
- Deny and close keep-alive inference connections after a non-inference
  request instead of silently continuing the loop
- Add CGNAT (100.64.0.0/10), benchmarking (198.18.0.0/15), and other
  special-use IP ranges to SSRF protection in both proxy and
  mechanistic mapper
---
 crates/openshell-core/src/net.rs              |  79 ++++-
 .../src/mechanistic_mapper.rs                 |  21 ++
 crates/openshell-sandbox/src/proxy.rs         | 151 ++++++++-
 .../src/sandbox/linux/seccomp.rs              | 314 +++++++++++++++---
 4 files changed, 491 insertions(+), 74 deletions(-)

diff --git a/crates/openshell-core/src/net.rs b/crates/openshell-core/src/net.rs
index 1d6f55dfb..3cd00d7b8 100644
--- a/crates/openshell-core/src/net.rs
+++ b/crates/openshell-core/src/net.rs
@@ -113,16 +113,15 @@ pub fn is_always_blocked_net(net: ipnet::IpNet) -> bool {
 /// or unspecified).
 ///
 /// This is a broader check than [`is_always_blocked_ip`] — it includes RFC 1918
-/// private ranges (`10/8`, `172.16/12`, `192.168/16`) and IPv6 ULA (`fc00::/7`)
-/// which are allowable via `allowed_ips` but blocked by default without one.
+/// private ranges (`10/8`, `172.16/12`, `192.168/16`), CGNAT (`100.64.0.0/10`,
+/// RFC 6598), other special-use ranges, and IPv6 ULA (`fc00::/7`) which are
+/// allowable via `allowed_ips` but blocked by default without one.
 ///
 /// Used by the proxy's default SSRF path and the mechanistic mapper to detect
 /// when `allowed_ips` should be populated in proposals.
 pub fn is_internal_ip(ip: IpAddr) -> bool {
     match ip {
-        IpAddr::V4(v4) => {
-            v4.is_loopback() || v4.is_private() || v4.is_link_local() || v4.is_unspecified()
-        }
+        IpAddr::V4(v4) => is_internal_v4(&v4),
         IpAddr::V6(v6) => {
             if v6.is_loopback() || v6.is_unspecified() {
                 return true;
@@ -137,16 +136,44 @@ pub fn is_internal_ip(ip: IpAddr) -> bool {
             }
             // Check IPv4-mapped IPv6 (::ffff:x.x.x.x)
             if let Some(v4) = v6.to_ipv4_mapped() {
-                return v4.is_loopback()
-                    || v4.is_private()
-                    || v4.is_link_local()
-                    || v4.is_unspecified();
+                return is_internal_v4(&v4);
             }
             false
         }
     }
 }
 
+/// IPv4 internal address check covering RFC 1918, CGNAT (RFC 6598), and other
+/// special-use ranges that should never be reachable from sandbox egress.
+fn is_internal_v4(v4: &Ipv4Addr) -> bool {
+    if v4.is_loopback() || v4.is_private() || v4.is_link_local() || v4.is_unspecified() {
+        return true;
+    }
+    let octets = v4.octets();
+    // 100.64.0.0/10 — CGNAT / shared address space (RFC 6598). Commonly used by
+    // cloud VPC peering, Tailscale, and similar overlay networks.
+    if octets[0] == 100 && (octets[1] & 0xC0) == 64 {
+        return true;
+    }
+    // 192.0.0.0/24 — IETF protocol assignments (RFC 6890)
+    if octets[0] == 192 && octets[1] == 0 && octets[2] == 0 {
+        return true;
+    }
+    // 198.18.0.0/15 — benchmarking (RFC 2544)
+    if octets[0] == 198 && (octets[1] & 0xFE) == 18 {
+        return true;
+    }
+    // 198.51.100.0/24 — TEST-NET-2 (RFC 5737)
+    if octets[0] == 198 && octets[1] == 51 && octets[2] == 100 {
+        return true;
+    }
+    // 203.0.113.0/24 — TEST-NET-3 (RFC 5737)
+    if octets[0] == 203 && octets[1] == 0 && octets[2] == 113 {
+        return true;
+    }
+    false
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -358,4 +385,38 @@ mod tests {
         let v6_public = Ipv4Addr::new(8, 8, 8, 8).to_ipv6_mapped();
         assert!(!is_internal_ip(IpAddr::V6(v6_public)));
     }
+
+    #[test]
+    fn test_internal_ip_cgnat() {
+        // 100.64.0.0/10 — CGNAT / shared address space (RFC 6598)
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 64, 0, 1))));
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 100, 50, 3))));
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(
+            100, 127, 255, 255
+        ))));
+        // Just outside the /10 boundary
+        assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 128, 0, 1))));
+        assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new(
+            100, 63, 255, 255
+        ))));
+    }
+
+    #[test]
+    fn test_internal_ip_special_use_ranges() {
+        // 192.0.0.0/24 — IETF protocol assignments
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(192, 0, 0, 1))));
+        // 198.18.0.0/15 — benchmarking
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 18, 0, 1))));
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 19, 255, 255))));
+        // 198.51.100.0/24 — TEST-NET-2
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 51, 100, 1))));
+        // 203.0.113.0/24 — TEST-NET-3
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(203, 0, 113, 1))));
+    }
+
+    #[test]
+    fn test_internal_ip_ipv6_mapped_cgnat() {
+        let v6 = Ipv4Addr::new(100, 64, 0, 1).to_ipv6_mapped();
+        assert!(is_internal_ip(IpAddr::V6(v6)));
+    }
 }
diff --git a/crates/openshell-sandbox/src/mechanistic_mapper.rs b/crates/openshell-sandbox/src/mechanistic_mapper.rs
index 7ca7538ab..3cfdf3f54 100644
--- a/crates/openshell-sandbox/src/mechanistic_mapper.rs
+++ b/crates/openshell-sandbox/src/mechanistic_mapper.rs
@@ -690,6 +690,27 @@ mod tests {
         assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new(1, 1, 1, 1))));
     }
 
+    #[test]
+    fn test_is_internal_ip_cgnat() {
+        use std::net::Ipv4Addr;
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 64, 0, 1))));
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 100, 50, 3))));
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(
+            100, 127, 255, 255
+        ))));
+        // Just outside the /10 boundary
+        assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 128, 0, 1))));
+    }
+
+    #[test]
+    fn test_is_internal_ip_special_use() {
+        use std::net::Ipv4Addr;
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(192, 0, 0, 1))));
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 18, 0, 1))));
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 51, 100, 1))));
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(203, 0, 113, 1))));
+    }
+
     #[test]
     fn test_is_internal_ip_v6() {
         use std::net::Ipv6Addr;
diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs
index c055d9e6d..c32f5d644 100644
--- a/crates/openshell-sandbox/src/proxy.rs
+++ b/crates/openshell-sandbox/src/proxy.rs
@@ -51,6 +51,7 @@ struct ConnectDecision {
 ///
 /// Returned by [`handle_inference_interception`] so the call site can emit
 /// a structured CONNECT deny log when the connection is not successfully routed.
+#[derive(Debug)]
 enum InferenceOutcome {
     /// At least one request was successfully routed to a local inference backend.
     Routed,
@@ -1009,8 +1010,6 @@ async fn handle_inference_interception(
     tls_state: Option<&Arc<ProxyTlsState>>,
     inference_ctx: Option<&Arc<InferenceContext>>,
 ) -> Result<InferenceOutcome> {
-    use crate::l7::inference::{ParseResult, format_http_response, try_parse_http_request};
-
     let Some(ctx) = inference_ctx else {
         return Ok(InferenceOutcome::Denied {
             reason: "cluster inference context not configured".to_string(),
@@ -1033,15 +1032,28 @@ async fn handle_inference_interception(
         }
     };
 
-    // Read and process HTTP requests from the tunnel.
-    // Track whether any request was successfully routed so that a late denial
-    // on a keep-alive connection still counts as "routed".
+    process_inference_keepalive(&mut tls_client, ctx, port).await
+}
+
+/// Read and process HTTP requests from a TLS-terminated inference connection.
+///
+/// Each request is matched against inference patterns and routed locally.
+/// Any non-inference request is immediately denied and the connection is closed,
+/// even if previous requests on the same keep-alive connection were routed
+/// successfully.
+async fn process_inference_keepalive<S: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin>(
+    stream: &mut S,
+    ctx: &InferenceContext,
+    port: u16,
+) -> Result<InferenceOutcome> {
+    use crate::l7::inference::{ParseResult, format_http_response, try_parse_http_request};
+
     let mut buf = vec![0u8; INITIAL_INFERENCE_BUF];
     let mut used = 0usize;
     let mut routed_any = false;
 
     loop {
-        let n = match tls_client.read(&mut buf[used..]).await {
+        let n = match stream.read(&mut buf[used..]).await {
             Ok(n) => n,
             Err(e) => {
                 if routed_any {
@@ -1065,10 +1077,13 @@ async fn handle_inference_interception(
         // Try to parse a complete HTTP request
         match try_parse_http_request(&buf[..used]) {
             ParseResult::Complete(request, consumed) => {
-                let was_routed = route_inference_request(&request, ctx, &mut tls_client).await?;
+                let was_routed = route_inference_request(&request, ctx, stream).await?;
                 if was_routed {
                     routed_any = true;
-                } else if !routed_any {
+                } else {
+                    // Deny and close: a non-inference request must not be silently
+                    // ignored on a keep-alive connection that previously routed
+                    // inference traffic.
                     return Ok(InferenceOutcome::Denied {
                         reason: "connection not allowed by policy".to_string(),
                     });
@@ -1083,7 +1098,7 @@ async fn handle_inference_interception(
                 if used == buf.len() {
                     if buf.len() >= MAX_INFERENCE_BUF {
                         let response = format_http_response(413, &[], b"Payload Too Large");
-                        write_all(&mut tls_client, &response).await?;
+                        write_all(stream, &response).await?;
                         if routed_any {
                             break;
                         }
@@ -1109,7 +1124,7 @@ async fn handle_inference_interception(
                     ocsf_emit!(event);
                 }
                 let response = format_http_response(400, &[], b"Bad Request");
-                write_all(&mut tls_client, &response).await?;
+                write_all(stream, &response).await?;
                 return Ok(InferenceOutcome::Denied { reason });
             }
         }
@@ -2467,6 +2482,41 @@ mod tests {
         assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::UNSPECIFIED)));
     }
 
+    #[test]
+    fn test_rejects_ipv4_cgnat() {
+        // 100.64.0.0/10 — CGNAT / shared address space (RFC 6598)
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 64, 0, 1))));
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 100, 50, 3))));
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(
+            100, 127, 255, 255
+        ))));
+        // Just outside the /10 boundary
+        assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new(100, 128, 0, 1))));
+        assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new(
+            100, 63, 255, 255
+        ))));
+    }
+
+    #[test]
+    fn test_rejects_ipv4_special_use_ranges() {
+        // 192.0.0.0/24 — IETF protocol assignments
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(192, 0, 0, 1))));
+        // 198.18.0.0/15 — benchmarking
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 18, 0, 1))));
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 19, 255, 255))));
+        // 198.51.100.0/24 — TEST-NET-2
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(198, 51, 100, 1))));
+        // 203.0.113.0/24 — TEST-NET-3
+        assert!(is_internal_ip(IpAddr::V4(Ipv4Addr::new(203, 0, 113, 1))));
+    }
+
+    #[test]
+    fn test_rejects_ipv6_mapped_cgnat() {
+        // ::ffff:100.64.0.1 should be caught via IPv4-mapped unwrapping
+        let v6 = Ipv4Addr::new(100, 64, 0, 1).to_ipv6_mapped();
+        assert!(is_internal_ip(IpAddr::V6(v6)));
+    }
+
     #[test]
     fn test_allows_ipv4_public() {
         assert!(!is_internal_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))));
@@ -3355,4 +3405,85 @@ mod tests {
         let result = implicit_allowed_ips_for_ip_host("*.example.com");
         assert!(result.is_empty());
     }
+
+    /// Regression test: exercises the actual keep-alive interception loop to
+    /// verify that a non-inference request is denied even after a previous
+    /// inference request was successfully routed on the same connection.
+    ///
+    /// Before the fix, `handle_inference_interception` used
+    /// `else if !routed_any` which silently dropped denials once `routed_any`
+    /// was true, allowing non-inference HTTP requests to piggyback on a
+    /// keep-alive connection that had previously handled inference traffic.
+    /// Regression test: exercises the actual keep-alive interception loop to
+    /// verify that a non-inference request is denied even after a previous
+    /// inference request was successfully routed on the same connection.
+    ///
+    /// The server runs in a spawned task with empty routes (the inference
+    /// request gets a 503 "not configured" but is still recognized as
+    /// inference and returns Ok(true)). The client sends the inference
+    /// request, reads the 503 response, then sends a non-inference request
+    /// on the same connection. The server must return Denied.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+    async fn test_keepalive_denies_non_inference_after_routed() {
+        use openshell_router::Router;
+        use tokio::io::{AsyncReadExt, AsyncWriteExt};
+
+        let router = Router::new().unwrap();
+        let patterns = crate::l7::inference::default_patterns();
+        // Empty routes: inference request gets 503 but returns Ok(true).
+        let ctx = InferenceContext::new(patterns, router, vec![], vec![]);
+
+        let body = r#"{"model":"test","messages":[{"role":"user","content":"hi"}]}"#;
+        let inference_req = format!(
+            "POST /v1/chat/completions HTTP/1.1\r\n\
+             Host: inference.local\r\n\
+             Content-Type: application/json\r\n\
+             Content-Length: {}\r\n\r\n{}",
+            body.len(),
+            body,
+        );
+        let non_inference_req = "GET /admin/config HTTP/1.1\r\nHost: inference.local\r\n\r\n";
+
+        let (client, mut server) = tokio::io::duplex(65536);
+        let (mut client_read, mut client_write) = tokio::io::split(client);
+
+        // Spawn the server task so it runs concurrently.
+        let server_task =
+            tokio::spawn(async move { process_inference_keepalive(&mut server, &ctx, 443).await });
+
+        // Client: send inference request, read response, send non-inference.
+        client_write
+            .write_all(inference_req.as_bytes())
+            .await
+            .unwrap();
+
+        // Read the 503 response so the server loops back to read.
+        let mut buf = vec![0u8; 4096];
+        let _ = client_read.read(&mut buf).await.unwrap();
+
+        // Send non-inference request on the same keep-alive connection.
+        client_write
+            .write_all(non_inference_req.as_bytes())
+            .await
+            .unwrap();
+        drop(client_write);
+
+        // Drain remaining response bytes.
+        tokio::spawn(async move {
+            let mut buf = vec![0u8; 4096];
+            loop {
+                match client_read.read(&mut buf).await {
+                    Ok(0) | Err(_) => break,
+                    Ok(_) => continue,
+                }
+            }
+        });
+
+        let outcome = server_task.await.unwrap().unwrap();
+
+        assert!(
+            matches!(outcome, InferenceOutcome::Denied { .. }),
+            "expected Denied after non-inference request on keep-alive, got: {outcome:?}"
+        );
+    }
 }
diff --git a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs
index e23447498..b6d5705b2 100644
--- a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs
+++ b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs
@@ -26,11 +26,7 @@ use tracing::debug;
 const SECCOMP_SET_MODE_FILTER: u64 = 1;
 
 pub fn apply(policy: &SandboxPolicy) -> Result<()> {
-    if matches!(policy.network.mode, NetworkMode::Allow) {
-        return Ok(());
-    }
-
-    let allow_inet = matches!(policy.network.mode, NetworkMode::Proxy);
+    let allow_inet = matches!(policy.network.mode, NetworkMode::Proxy | NetworkMode::Allow);
     let filter = build_filter(allow_inet)?;
 
     // Required before applying seccomp filters.
@@ -43,10 +39,70 @@ pub fn apply(policy: &SandboxPolicy) -> Result<()> {
     }
 
     apply_filter(&filter).into_diagnostic()?;
+
+    // Apply a separate filter for clone3 that returns ENOSYS instead of EPERM.
+    // seccomp BPF cannot dereference the `struct clone_args *` pointer that
+    // clone3 takes as arg 0, so we cannot selectively block CLONE_NEWUSER.
+    // Block clone3 unconditionally with ENOSYS: glibc's clone3 wrapper falls
+    // back to clone on ENOSYS (where we CAN filter flags), but treats EPERM
+    // as a hard failure. Using a separate filter with a different match_action
+    // gives us per-syscall errno control that SeccompFilter's global
+    // match_action cannot provide.
+    let clone3_filter = build_clone3_filter()?;
+    apply_filter(&clone3_filter).into_diagnostic()?;
+
     Ok(())
 }
 
 fn build_filter(allow_inet: bool) -> Result<seccompiler::BpfProgram> {
+    let rules = build_filter_rules(allow_inet)?;
+
+    let arch = std::env::consts::ARCH
+        .try_into()
+        .map_err(|_| miette::miette!("Unsupported architecture for seccomp"))?;
+
+    let filter = SeccompFilter::new(
+        rules,
+        SeccompAction::Allow,
+        SeccompAction::Errno(libc::EPERM as u32),
+        arch,
+    )
+    .into_diagnostic()?;
+
+    filter.try_into().into_diagnostic()
+}
+
+/// Build a minimal BPF filter that blocks clone3 with ENOSYS.
+///
+/// This is a separate filter from the main one because seccomp BPF cannot
+/// dereference the `struct clone_args *` pointer that clone3 takes as arg 0,
+/// so we cannot selectively block CLONE_NEWUSER. We block clone3
+/// unconditionally with ENOSYS so glibc falls back to the older clone
+/// syscall (where flags are a direct register argument and CAN be filtered).
+///
+/// glibc's clone3 wrapper checks for ENOSYS specifically — EPERM would be
+/// treated as a hard failure and propagated to the caller instead of
+/// triggering the clone fallback.
+fn build_clone3_filter() -> Result<seccompiler::BpfProgram> {
+    let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
+    rules.entry(libc::SYS_clone3).or_default();
+
+    let arch = std::env::consts::ARCH
+        .try_into()
+        .map_err(|_| miette::miette!("Unsupported architecture for seccomp"))?;
+
+    let filter = SeccompFilter::new(
+        rules,
+        SeccompAction::Allow,
+        SeccompAction::Errno(libc::ENOSYS as u32),
+        arch,
+    )
+    .into_diagnostic()?;
+
+    filter.try_into().into_diagnostic()
+}
+
+fn build_filter_rules(allow_inet: bool) -> Result<BTreeMap<i64, Vec<SeccompRule>>> {
     let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
 
     // --- Socket domain blocks ---
@@ -73,10 +129,34 @@ fn build_filter(allow_inet: bool) -> Result<seccompiler::BpfProgram> {
     rules.entry(libc::SYS_bpf).or_default();
     // Cross-process memory read.
     rules.entry(libc::SYS_process_vm_readv).or_default();
+    // Cross-process memory write (symmetric with process_vm_readv).
+    rules.entry(libc::SYS_process_vm_writev).or_default();
+    // Process handle acquisition, fd theft, and signalling via pidfd.
+    rules.entry(libc::SYS_pidfd_open).or_default();
+    rules.entry(libc::SYS_pidfd_getfd).or_default();
+    rules.entry(libc::SYS_pidfd_send_signal).or_default();
     // Async I/O subsystem with extensive CVE history.
     rules.entry(libc::SYS_io_uring_setup).or_default();
     // Filesystem mount could subvert Landlock or overlay writable paths.
     rules.entry(libc::SYS_mount).or_default();
+    // New mount API syscalls (Linux 5.2+) bypass the SYS_mount block entirely.
+    rules.entry(libc::SYS_fsopen).or_default();
+    rules.entry(libc::SYS_fsconfig).or_default();
+    rules.entry(libc::SYS_fsmount).or_default();
+    rules.entry(libc::SYS_fspick).or_default();
+    rules.entry(libc::SYS_move_mount).or_default();
+    rules.entry(libc::SYS_open_tree).or_default();
+    // Namespace manipulation — setns enters existing namespaces, pivot_root/umount2
+    // change the filesystem root. The supervisor calls setns before seccomp is applied,
+    // so blocking it here is safe.
+    rules.entry(libc::SYS_setns).or_default();
+    rules.entry(libc::SYS_umount2).or_default();
+    rules.entry(libc::SYS_pivot_root).or_default();
+    // Kernel exploit primitives: userfaultfd enables race-condition exploitation (multiple
+    // CVEs), perf_event_open enables Spectre-class side channels. Both blocked by Docker's
+    // default seccomp profile.
+    rules.entry(libc::SYS_userfaultfd).or_default();
+    rules.entry(libc::SYS_perf_event_open).or_default();
 
     // --- Conditional syscall blocks ---
 
@@ -96,6 +176,15 @@ fn build_filter(allow_inet: bool) -> Result<seccompiler::BpfProgram> {
         libc::CLONE_NEWUSER as u64,
     )?;
 
+    // clone with CLONE_NEWUSER achieves the same as unshare via a different syscall.
+    add_masked_arg_rule(
+        &mut rules,
+        libc::SYS_clone,
+        0, // flags argument
+        libc::CLONE_NEWUSER as u64,
+    )?;
+    // clone3 is handled by a separate filter — see build_clone3_filter().
+
     // seccomp(SECCOMP_SET_MODE_FILTER) would let sandboxed code replace the active filter.
     let condition = SeccompCondition::new(
         0, // operation argument
@@ -107,19 +196,7 @@ fn build_filter(allow_inet: bool) -> Result<seccompiler::BpfProgram> {
     let rule = SeccompRule::new(vec![condition]).into_diagnostic()?;
     rules.entry(libc::SYS_seccomp).or_default().push(rule);
 
-    let arch = std::env::consts::ARCH
-        .try_into()
-        .map_err(|_| miette::miette!("Unsupported architecture for seccomp"))?;
-
-    let filter = SeccompFilter::new(
-        rules,
-        SeccompAction::Allow,
-        SeccompAction::Errno(libc::EPERM as u32),
-        arch,
-    )
-    .into_diagnostic()?;
-
-    filter.try_into().into_diagnostic()
+    Ok(rules)
 }
 
 #[allow(clippy::cast_sign_loss)]
@@ -159,6 +236,10 @@ fn add_masked_arg_rule(
 mod tests {
     use super::*;
 
+    // These tests verify filter construction (rule map shape and BPF compilation).
+    // Behavioral verification (syscalls returning EPERM inside a sandbox) requires
+    // a running Linux sandbox and is covered by the e2e test suite.
+
     #[test]
     fn build_filter_proxy_mode_compiles() {
         let filter = build_filter(true);
@@ -189,31 +270,41 @@ mod tests {
 
     #[test]
     fn unconditional_blocks_present_in_filter() {
-        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
-
-        // Simulate what build_filter does for unconditional blocks
-        rules.entry(libc::SYS_memfd_create).or_default();
-        rules.entry(libc::SYS_ptrace).or_default();
-        rules.entry(libc::SYS_bpf).or_default();
-        rules.entry(libc::SYS_process_vm_readv).or_default();
-        rules.entry(libc::SYS_io_uring_setup).or_default();
-        rules.entry(libc::SYS_mount).or_default();
+        // Build a real filter and verify all unconditional blocks are present.
+        let filter_rules = build_filter_rules(true).unwrap();
 
-        // Unconditional blocks have an empty Vec (no conditions = always match)
-        for syscall in [
+        // Unconditional blocks have an empty Vec (no conditions = always match).
+        let expected = [
             libc::SYS_memfd_create,
             libc::SYS_ptrace,
             libc::SYS_bpf,
             libc::SYS_process_vm_readv,
+            libc::SYS_process_vm_writev,
+            libc::SYS_pidfd_open,
+            libc::SYS_pidfd_getfd,
+            libc::SYS_pidfd_send_signal,
             libc::SYS_io_uring_setup,
             libc::SYS_mount,
-        ] {
+            libc::SYS_fsopen,
+            libc::SYS_fsconfig,
+            libc::SYS_fsmount,
+            libc::SYS_fspick,
+            libc::SYS_move_mount,
+            libc::SYS_open_tree,
+            libc::SYS_setns,
+            libc::SYS_umount2,
+            libc::SYS_pivot_root,
+            libc::SYS_userfaultfd,
+            libc::SYS_perf_event_open,
+        ];
+
+        for syscall in expected {
             assert!(
-                rules.contains_key(&syscall),
+                filter_rules.contains_key(&syscall),
                 "syscall {syscall} should be in the rules map"
             );
             assert!(
-                rules[&syscall].is_empty(),
+                filter_rules[&syscall].is_empty(),
                 "syscall {syscall} should have empty rules (unconditional block)"
             );
         }
@@ -222,37 +313,150 @@ mod tests {
     #[test]
     fn conditional_blocks_have_rules() {
         // Build a real filter and verify the conditional syscalls have rule entries
-        // (non-empty Vec means conditional match)
-        let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
+        // (non-empty Vec means conditional match).
+        let filter_rules = build_filter_rules(true).unwrap();
 
-        add_masked_arg_rule(
-            &mut rules,
+        for syscall in [
             libc::SYS_execveat,
-            4,
-            libc::AT_EMPTY_PATH as u64,
-        )
-        .unwrap();
-        add_masked_arg_rule(&mut rules, libc::SYS_unshare, 0, libc::CLONE_NEWUSER as u64).unwrap();
-
-        let condition = SeccompCondition::new(
-            0,
-            SeccompCmpArgLen::Dword,
-            SeccompCmpOp::Eq,
-            SECCOMP_SET_MODE_FILTER,
-        )
-        .unwrap();
-        let rule = SeccompRule::new(vec![condition]).unwrap();
-        rules.entry(libc::SYS_seccomp).or_default().push(rule);
-
-        for syscall in [libc::SYS_execveat, libc::SYS_unshare, libc::SYS_seccomp] {
+            libc::SYS_unshare,
+            libc::SYS_clone,
+            libc::SYS_seccomp,
+        ] {
             assert!(
-                rules.contains_key(&syscall),
+                filter_rules.contains_key(&syscall),
                 "syscall {syscall} should be in the rules map"
             );
             assert!(
-                !rules[&syscall].is_empty(),
+                !filter_rules[&syscall].is_empty(),
                 "syscall {syscall} should have conditional rules"
             );
         }
     }
+
+    #[test]
+    fn clone3_filter_compiles_and_blocks_clone3() {
+        let bpf = build_clone3_filter();
+        assert!(bpf.is_ok(), "clone3 ENOSYS filter should compile");
+    }
+
+    #[test]
+    fn clone3_not_in_main_filter() {
+        // clone3 must NOT be in the main filter; it has its own ENOSYS filter.
+        let filter_rules = build_filter_rules(true).unwrap();
+        assert!(
+            !filter_rules.contains_key(&libc::SYS_clone3),
+            "clone3 should not be in the main filter — it uses a separate ENOSYS filter"
+        );
+    }
+
+    // --- Behavioral tests ---
+    //
+    // These apply seccomp filters in a forked child and verify that blocked
+    // syscalls actually return the expected errno. They only compile and run
+    // on Linux (seccomp is a Linux kernel feature).
+
+    /// Fork a child, apply the given filter, invoke `syscall_nr`, and return
+    /// the errno observed by the child. The child exits 0 if the syscall
+    /// returned the expected errno, 1 otherwise.
+    unsafe fn assert_blocked_in_child(
+        filter: &seccompiler::BpfProgram,
+        syscall_nr: i64,
+        expected_errno: i32,
+    ) {
+        let pid = libc::fork();
+        assert!(pid >= 0, "fork failed");
+        if pid == 0 {
+            // Child: apply filter and try the syscall.
+            libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+            apply_filter(filter).expect("apply_filter");
+            let ret = libc::syscall(syscall_nr, 0 as libc::c_ulong, 0 as libc::c_ulong);
+            let errno = *libc::__errno_location();
+            if ret == -1 && errno == expected_errno {
+                libc::_exit(0);
+            } else {
+                // Write diagnostic before exiting so test failures are debuggable.
+                let msg = format!(
+                    "syscall {syscall_nr}: expected errno={expected_errno}, got ret={ret} errno={errno}\n"
+                );
+                libc::write(2, msg.as_ptr().cast(), msg.len());
+                libc::_exit(1);
+            }
+        }
+        // Parent: wait for child.
+        let mut status: libc::c_int = 0;
+        libc::waitpid(pid, &mut status, 0);
+        assert!(
+            libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0,
+            "child failed: syscall {syscall_nr} was not blocked with errno {expected_errno}"
+        );
+    }
+
+    #[test]
+    fn behavioral_memfd_create_blocked() {
+        let filter = build_filter(true).unwrap();
+        unsafe { assert_blocked_in_child(&filter, libc::SYS_memfd_create, libc::EPERM) };
+    }
+
+    #[test]
+    fn behavioral_ptrace_blocked() {
+        let filter = build_filter(true).unwrap();
+        unsafe { assert_blocked_in_child(&filter, libc::SYS_ptrace, libc::EPERM) };
+    }
+
+    #[test]
+    fn behavioral_process_vm_writev_blocked() {
+        let filter = build_filter(true).unwrap();
+        unsafe { assert_blocked_in_child(&filter, libc::SYS_process_vm_writev, libc::EPERM) };
+    }
+
+    #[test]
+    fn behavioral_userfaultfd_blocked() {
+        let filter = build_filter(true).unwrap();
+        unsafe { assert_blocked_in_child(&filter, libc::SYS_userfaultfd, libc::EPERM) };
+    }
+
+    #[test]
+    fn behavioral_perf_event_open_blocked() {
+        let filter = build_filter(true).unwrap();
+        unsafe { assert_blocked_in_child(&filter, libc::SYS_perf_event_open, libc::EPERM) };
+    }
+
+    #[test]
+    fn behavioral_setns_blocked() {
+        let filter = build_filter(true).unwrap();
+        unsafe { assert_blocked_in_child(&filter, libc::SYS_setns, libc::EPERM) };
+    }
+
+    #[test]
+    fn behavioral_clone3_returns_enosys() {
+        // clone3 uses a separate filter that returns ENOSYS (not EPERM) so
+        // glibc falls back to clone.
+        let main_filter = build_filter(true).unwrap();
+        let clone3_filter = build_clone3_filter().unwrap();
+        // Apply in the same order as apply(): main first, clone3 second.
+        let pid = unsafe { libc::fork() };
+        assert!(pid >= 0, "fork failed");
+        if pid == 0 {
+            unsafe {
+                libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+                apply_filter(&main_filter).expect("main filter");
+                apply_filter(&clone3_filter).expect("clone3 filter");
+                let ret = libc::syscall(libc::SYS_clone3, 0 as libc::c_ulong, 0 as libc::c_ulong);
+                let errno = *libc::__errno_location();
+                if ret == -1 && errno == libc::ENOSYS {
+                    libc::_exit(0);
+                } else {
+                    let msg = format!("clone3: expected ENOSYS, got ret={ret} errno={errno}\n");
+                    libc::write(2, msg.as_ptr().cast(), msg.len());
+                    libc::_exit(1);
+                }
+            }
+        }
+        let mut status: libc::c_int = 0;
+        unsafe { libc::waitpid(pid, &mut status, 0) };
+        assert!(
+            unsafe { libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0 },
+            "clone3 should be blocked with ENOSYS, not EPERM"
+        );
+    }
 }

From ce39c447217b65463560666ecebf0ff503541854 Mon Sep 17 00:00:00 2001
From: John Myers <9696606+johntmyers@users.noreply.github.com>
Date: Mon, 13 Apr 2026 10:58:12 -0700
Subject: [PATCH 2/3] fix(sandbox): install clone3 seccomp filter before main
 filter

---
 .../src/sandbox/linux/seccomp.rs              | 95 +++++++++++++++----
 1 file changed, 75 insertions(+), 20 deletions(-)

diff --git a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs
index b6d5705b2..854134cbf 100644
--- a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs
+++ b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs
@@ -27,7 +27,8 @@ const SECCOMP_SET_MODE_FILTER: u64 = 1;
 
 pub fn apply(policy: &SandboxPolicy) -> Result<()> {
     let allow_inet = matches!(policy.network.mode, NetworkMode::Proxy | NetworkMode::Allow);
-    let filter = build_filter(allow_inet)?;
+    let main_filter = build_filter(allow_inet)?;
+    let clone3_filter = build_clone3_filter()?;
 
     // Required before applying seccomp filters.
     let rc = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
@@ -38,18 +39,7 @@ pub fn apply(policy: &SandboxPolicy) -> Result<()> {
         ));
     }
 
-    apply_filter(&filter).into_diagnostic()?;
-
-    // Apply a separate filter for clone3 that returns ENOSYS instead of EPERM.
-    // seccomp BPF cannot dereference the `struct clone_args *` pointer that
-    // clone3 takes as arg 0, so we cannot selectively block CLONE_NEWUSER.
-    // Block clone3 unconditionally with ENOSYS: glibc's clone3 wrapper falls
-    // back to clone on ENOSYS (where we CAN filter flags), but treats EPERM
-    // as a hard failure. Using a separate filter with a different match_action
-    // gives us per-syscall errno control that SeccompFilter's global
-    // match_action cannot provide.
-    let clone3_filter = build_clone3_filter()?;
-    apply_filter(&clone3_filter).into_diagnostic()?;
+    apply_runtime_filters(&main_filter, &clone3_filter)?;
 
     Ok(())
 }
@@ -102,6 +92,22 @@ fn build_clone3_filter() -> Result<seccompiler::BpfProgram> {
     filter.try_into().into_diagnostic()
 }
 
+/// Install the sandbox seccomp filters in the required order.
+///
+/// Order matters:
+/// 1. Install the dedicated clone3 filter first so it can still call
+///    `seccomp(SECCOMP_SET_MODE_FILTER)`.
+/// 2. Install the main filter second. It blocks further seccomp filter
+///    installation with `EPERM`, preserving the original hardening intent.
+fn apply_runtime_filters(
+    main_filter: seccompiler::BpfProgramRef,
+    clone3_filter: seccompiler::BpfProgramRef,
+) -> Result<()> {
+    apply_filter(clone3_filter).into_diagnostic()?;
+    apply_filter(main_filter).into_diagnostic()?;
+    Ok(())
+}
+
 fn build_filter_rules(allow_inet: bool) -> Result<BTreeMap<i64, Vec<SeccompRule>>> {
     let mut rules: BTreeMap<i64, Vec<SeccompRule>> = BTreeMap::new();
 
@@ -236,9 +242,8 @@ fn add_masked_arg_rule(
 mod tests {
     use super::*;
 
-    // These tests verify filter construction (rule map shape and BPF compilation).
-    // Behavioral verification (syscalls returning EPERM inside a sandbox) requires
-    // a running Linux sandbox and is covered by the e2e test suite.
+    // These tests cover both filter construction (rule map shape and BPF
+    // compilation) and selected runtime behavior on Linux via forked children.
 
     #[test]
     fn build_filter_proxy_mode_compiles() {
@@ -391,6 +396,18 @@ mod tests {
         );
     }
 
+    unsafe fn install_runtime_filters_in_child(
+        main_filter: &seccompiler::BpfProgram,
+        clone3_filter: &seccompiler::BpfProgram,
+    ) {
+        libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+        if let Err(err) = apply_runtime_filters(main_filter, clone3_filter) {
+            let msg = format!("failed to install runtime seccomp filters: {err}\n");
+            libc::write(2, msg.as_ptr().cast(), msg.len());
+            libc::_exit(1);
+        }
+    }
+
     #[test]
     fn behavioral_memfd_create_blocked() {
         let filter = build_filter(true).unwrap();
@@ -433,14 +450,12 @@ mod tests {
         // glibc falls back to clone.
         let main_filter = build_filter(true).unwrap();
         let clone3_filter = build_clone3_filter().unwrap();
-        // Apply in the same order as apply(): main first, clone3 second.
+        // Apply in the same order as apply(): clone3 filter first, main filter second.
         let pid = unsafe { libc::fork() };
         assert!(pid >= 0, "fork failed");
         if pid == 0 {
             unsafe {
-                libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
-                apply_filter(&main_filter).expect("main filter");
-                apply_filter(&clone3_filter).expect("clone3 filter");
+                install_runtime_filters_in_child(&main_filter, &clone3_filter);
                 let ret = libc::syscall(libc::SYS_clone3, 0 as libc::c_ulong, 0 as libc::c_ulong);
                 let errno = *libc::__errno_location();
                 if ret == -1 && errno == libc::ENOSYS {
@@ -459,4 +474,44 @@ mod tests {
             "clone3 should be blocked with ENOSYS, not EPERM"
         );
     }
+
+    #[test]
+    fn behavioral_third_filter_install_blocked_after_startup() {
+        let main_filter = build_filter(true).unwrap();
+        let clone3_filter = build_clone3_filter().unwrap();
+        let third_filter = build_clone3_filter().unwrap();
+
+        let pid = unsafe { libc::fork() };
+        assert!(pid >= 0, "fork failed");
+        if pid == 0 {
+            unsafe {
+                install_runtime_filters_in_child(&main_filter, &clone3_filter);
+                match apply_filter(&third_filter) {
+                    Err(seccompiler::Error::Seccomp(e))
+                        if e.raw_os_error() == Some(libc::EPERM) =>
+                    {
+                        libc::_exit(0);
+                    }
+                    Err(err) => {
+                        let msg =
+                            format!("third filter install failed with unexpected error: {err}\n");
+                        libc::write(2, msg.as_ptr().cast(), msg.len());
+                        libc::_exit(1);
+                    }
+                    Ok(()) => {
+                        let msg = "third filter unexpectedly installed\n";
+                        libc::write(2, msg.as_ptr().cast(), msg.len());
+                        libc::_exit(1);
+                    }
+                }
+            }
+        }
+
+        let mut status: libc::c_int = 0;
+        unsafe { libc::waitpid(pid, &mut status, 0) };
+        assert!(
+            unsafe { libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0 },
+            "additional seccomp filter installation should be blocked after startup"
+        );
+    }
 }

From 8b16037285db13431fff0efbc3e9c2b7ebe81e7f Mon Sep 17 00:00:00 2001
From: John Myers <9696606+johntmyers@users.noreply.github.com>
Date: Mon, 13 Apr 2026 11:12:03 -0700
Subject: [PATCH 3/3] test(ocsf): fix shorthand firewall engine expectation

---
 crates/openshell-ocsf/src/format/shorthand.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/openshell-ocsf/src/format/shorthand.rs b/crates/openshell-ocsf/src/format/shorthand.rs
index f3f38e122..7e2296de9 100644
--- a/crates/openshell-ocsf/src/format/shorthand.rs
+++ b/crates/openshell-ocsf/src/format/shorthand.rs
@@ -641,7 +641,7 @@ mod tests {
         let shorthand = event.format_shorthand();
         assert_eq!(
             shorthand,
-            "HTTP:GET [INFO] ALLOWED curl(68) -> GET http://172.20.0.1:9876/test [policy:allow_host_9876]"
+            "HTTP:GET [INFO] ALLOWED curl(68) -> GET http://172.20.0.1:9876/test [policy:allow_host_9876 engine:mechanistic]"
         );
     }