From 1eccea7ffb7eac43670d5fd76e8afa8ecfe6b0b9 Mon Sep 17 00:00:00 2001 From: Maxim Cournoyer Date: Fri, 17 Oct 2025 23:12:27 +0900 Subject: [PATCH] build/syscalls: Introduce new safe-clone and use it. * guix/build/syscalls.scm (without-automatic-finalization): Accept multiple expressions. (without-garbage-collection): New syntax. (without-threads): Likewise. (ensure-signal-delivery-thread, safe-clone): New procedures. * tests/syscalls.scm: ("clone and unshare triggers EINVAL") ("safe-clone and unshare succeeds"): New tests. * gnu/build/linux-container.scm (run-container): Adjust to use 'safe-clone'. Relates-to: #1169 Change-Id: I044c11a899e24e547a7aed97f30c8e7250ab5363 --- gnu/build/linux-container.scm | 173 ++++++++++++++++------------------ guix/build/syscalls.scm | 53 ++++++++++- tests/syscalls.scm | 36 ++++++- 3 files changed, 168 insertions(+), 94 deletions(-) diff --git a/gnu/build/linux-container.scm b/gnu/build/linux-container.scm index 25890ec0a13..ff5449d0b0f 100644 --- a/gnu/build/linux-container.scm +++ b/gnu/build/linux-container.scm @@ -263,100 +263,93 @@ that host UIDs (respectively GIDs) map to in the namespace." ;; child process blocks until the parent writes to it. (match (socketpair PF_UNIX (logior SOCK_CLOEXEC SOCK_STREAM) 0) ((child . parent) - (let ((flags (namespaces->bit-mask namespaces))) - (match (clone flags) - (0 - ;; Inhibit thread creation until after the unshare call. - (gc-disable) - (call-with-clean-exit - (lambda () - (close-port parent) - ;; Wait for parent to set things up. - (match (read child) - ('ready - (purify-environment) - (when (and (memq 'mnt namespaces) - (not (string=? root "/"))) - (catch #t - (lambda () - (mount-file-systems root mounts - #:mount-/proc? (memq 'pid namespaces) - #:mount-/sys? (memq 'net - namespaces) - #:populate-file-system - (lambda () - (populate-file-system) - (when (and (memq 'net namespaces) - loopback-network?) - (set-network-interface-up "lo") + (safe-clone + (namespaces->bit-mask namespaces) + (lambda () + (call-with-clean-exit + (lambda () + (close-port parent) + ;; Wait for parent to set things up. + (match (read child) + ('ready + (purify-environment) + (when (and (memq 'mnt namespaces) + (not (string=? root "/"))) + (catch #t + (lambda () + (mount-file-systems root mounts + #:mount-/proc? (memq 'pid namespaces) + #:mount-/sys? (memq 'net + namespaces) + #:populate-file-system + (lambda () + (populate-file-system) + (when (and (memq 'net namespaces) + loopback-network?) + (set-network-interface-up "lo") - ;; When isolated from the - ;; network, provide a minimal - ;; /etc/hosts to resolve - ;; "localhost". - (mkdir-p "/etc") - (call-with-output-file "/etc/hosts" - (lambda (port) - (display "127.0.0.1 localhost\n" port) - (chmod port #o444))))) - #:writable-root? - (or writable-root? - (not (memq 'mnt namespaces))))) - (lambda args - ;; Forward the exception to the parent process. - ;; FIXME: SRFI-35 conditions and non-trivial objects - ;; cannot be 'read' so they shouldn't be written as is. - (write args child) - (primitive-exit 3)))) + ;; When isolated from the + ;; network, provide a minimal + ;; /etc/hosts to resolve + ;; "localhost". + (mkdir-p "/etc") + (call-with-output-file "/etc/hosts" + (lambda (port) + (display "127.0.0.1 localhost\n" port) + (chmod port #o444))))) + #:writable-root? + (or writable-root? + (not (memq 'mnt namespaces))))) + (lambda args + ;; Forward the exception to the parent process. + ;; FIXME: SRFI-35 conditions and non-trivial objects + ;; cannot be 'read' so they shouldn't be written as is. + (write args child) + (primitive-exit 3)))) - (when (and lock-mounts? - (memq 'mnt namespaces) - (memq 'user namespaces)) - ;; Create a new mount namespace owned by a new user - ;; namespace to "lock" together previous mounts, such that - ;; they cannot be unmounted or remounted separately--see - ;; mount_namespaces(7). - ;; - ;; Note: at this point, the process is single-threaded (no - ;; GC mark threads, no finalization thread, etc.) which is - ;; why unshare(CLONE_NEWUSER) can be used. - (let ((uid (getuid)) (gid (getgid))) - (unshare (logior CLONE_NEWUSER CLONE_NEWNS)) - (gc-enable) - (when (file-exists? "/proc/self") - (initialize-user-namespace (getpid) - host-uids - #:host-uid uid - #:host-gid gid - #:guest-uid guest-uid - #:guest-gid guest-gid)))) + (when (and lock-mounts? + (memq 'mnt namespaces) + (memq 'user namespaces)) + ;; Create a new mount namespace owned by a new user + ;; namespace to "lock" together previous mounts, such that + ;; they cannot be unmounted or remounted separately--see + ;; mount_namespaces(7). + (let ((uid (getuid)) (gid (getgid))) + (unshare (logior CLONE_NEWUSER CLONE_NEWNS)) + (when (file-exists? "/proc/self") + (initialize-user-namespace (getpid) + host-uids + #:host-uid uid + #:host-gid gid + #:guest-uid guest-uid + #:guest-gid guest-gid)))) - ;; TODO: Manage capabilities. - (write 'ready child) - (close-port child) - (thunk)) - (_ ;parent died or something - (primitive-exit 2)))))) - (pid - (close-port child) - (when (memq 'user namespaces) - (initialize-user-namespace pid host-uids - #:guest-uid guest-uid - #:guest-gid guest-gid)) - ;; TODO: Initialize cgroups. - (write 'ready parent) - (newline parent) + ;; TODO: Manage capabilities. + (write 'ready child) + (close-port child) + (thunk)) + (_ ;parent died or something + (primitive-exit 2)))))) + (lambda (pid) + (close-port child) + (when (memq 'user namespaces) + (initialize-user-namespace pid host-uids + #:guest-uid guest-uid + #:guest-gid guest-gid)) + ;; TODO: Initialize cgroups. + (write 'ready parent) + (newline parent) - ;; Check whether the child process' setup phase succeeded. - (let ((message (read parent))) - (close-port parent) - (match message - ('ready ;success - pid) - (((? symbol? key) args ...) ;exception - (apply throw key args)) - (_ ;unexpected termination - #f))))))))) + ;; Check whether the child process' setup phase succeeded. + (let ((message (read parent))) + (close-port parent) + (match message + ('ready ;success + pid) + (((? symbol? key) args ...) ;exception + (apply throw key args)) + (_ ;unexpected termination + #f)))))))) ;; FIXME: This is copied from (guix utils), which we cannot use because it ;; would pull (guix config) and all. diff --git a/guix/build/syscalls.scm b/guix/build/syscalls.scm index 3106e4e3d6d..d40b1ae5d93 100644 --- a/guix/build/syscalls.scm +++ b/guix/build/syscalls.scm @@ -150,6 +150,7 @@ CLONE_THREAD CLONE_VM clone + safe-clone unshare setns get-user-ns @@ -1170,17 +1171,45 @@ caller lacks root privileges." Turning finalization off shuts down the finalization thread as a side effect." (->bool ((force proc) (if enabled? 1 0)))))) -(define-syntax-rule (without-automatic-finalization exp) - "Turn off automatic finalization within the dynamic extent of EXP." +(define-syntax-rule (without-automatic-finalization body ...) + "Turn off automatic finalization within the dynamic extent of BODY. This is +useful to ensure there is no finalization thread." (let ((enabled? #t)) (dynamic-wind (lambda () (set! enabled? (%set-automatic-finalization-enabled?! #f))) (lambda () - exp) + body ...) (lambda () (%set-automatic-finalization-enabled?! enabled?))))) +(define-syntax-rule (without-garbage-collection body ...) + "Turn off garbage collection within the dynamic extent of BODY. This is useful +to avoid the creation new garbage collection thread. Note that pre-existing +GC marker threads are only disabled, not terminated." + (dynamic-wind + (lambda () + (gc-disable)) + (lambda () + body ...) + (lambda () + (gc-enable)))) + +(define-syntax-rule (without-threads body ...) + "Ensure the Guile finalizer thread is stopped and that garbage collection does +not run. Note that pre-existing GC marker threads are only disabled, not +terminated. This also leaves the signal handling thread to be disabled by +another means, since there is no Guile API to do so." + ;; Note: the three kind of threads that Guile can spawn are the finalization + ;; thread, the signal thread, or the GC marker threads. + (without-automatic-finalization + (without-garbage-collection body ...))) + +(define (ensure-signal-delivery-thread) + "Ensure the signal delivery thread is spawned and its state set + to 'RUNNING'. This is valid as of the implementation as of Guile 3.0.9." + (sigaction SIGUSR1)) ;could be any signal + ;; The libc interface to sys_clone is not useful for Scheme programs, so the ;; low-level system call is wrapped instead. The 'syscall' function is ;; declared in as a variadic function; in practice, it expects 6 @@ -1223,6 +1252,24 @@ are shared between the parent and child processes." (list err)) ret))))) +(define (safe-clone flags child parent) + "This is a raw clone syscall wrapper that ensures no Guile thread will be +spawned during execution of the child. `clone' is called with FLAGS. CHILD +is a thunk to run in the child process. PARENT is procedure that accepts the +child PID as argument. This is useful in many contexts, such as when calling +`unshare' or async-unsafe procedures in the child when the parent process +memory (CLONE_VM) or threads (CLONE_THREAD) are shared with it." + ;; TODO: Contribute `clone' to Guile, and handle these complications there, + ;; similarly to how it's handled for scm_fork in posix.c. + + ;; XXX: This is a hack: as of Guile 3.0.9, by starting the signal delivery + ;; thread in the parent, its state will be known as RUNNING, and the child + ;; won't attempt to start it itself. + (ensure-signal-delivery-thread) + (match (clone flags) + (0 (without-threads (child))) + (pid (parent pid)))) + (define (thread-count) "Return the complete thread count of the current process. Unlike `all-threads', this also counts the Guile signal delivery, and finalizer diff --git a/tests/syscalls.scm b/tests/syscalls.scm index 879c3e4f254..a0483e68f08 100644 --- a/tests/syscalls.scm +++ b/tests/syscalls.scm @@ -3,6 +3,7 @@ ;;; Copyright © 2015 David Thompson ;;; Copyright © 2020 Simon South ;;; Copyright © 2020 Mathieu Othacehe +;;; Copyright © 2025 Maxim Cournoyer ;;; ;;; This file is part of GNU Guix. ;;; @@ -29,7 +30,8 @@ #:use-module (srfi srfi-71) #:use-module (system foreign) #:use-module ((ice-9 ftw) #:select (scandir)) - #:use-module (ice-9 match)) + #:use-module (ice-9 match) + #:use-module (ice-9 threads)) ;; Test the (guix build syscalls) module, although there's not much that can ;; actually be tested without being root. @@ -158,6 +160,38 @@ (lambda args (system-error-errno args)))) +(define child-thunk + (lambda () + (gc) ;spawn GC threads + (primitive-exit + (catch 'system-error + (lambda () + (unshare CLONE_THREAD) + 0) ;no error + (lambda args + (system-error-errno args)))))) + +(define parent-proc + (lambda (pid) + (match (waitpid pid) + ((_ . status) + (status:exit-val status))))) + +(unless perform-container-tests? + (test-skip 1)) +(test-equal "clone and unshare triggers EINVAL" + EINVAL + (match (clone (logior CLONE_NEWUSER SIGCHLD)) + (0 (child-thunk)) + (pid (parent-proc pid)))) + +(unless perform-container-tests? + (test-skip 1)) +(test-equal "safe-clone and unshare succeeds" + 0 + (safe-clone (logior CLONE_NEWUSER SIGCHLD) + child-thunk parent-proc)) + (unless perform-container-tests? (test-skip 1)) (test-assert "setns"