build/syscalls: Introduce new safe-clone and use it.

* guix/build/syscalls.scm (without-automatic-finalization): Accept multiple
expressions.
(without-garbage-collection): New syntax.
(without-threads): Likewise.
(ensure-signal-delivery-thread, safe-clone): New procedures.
* tests/syscalls.scm: ("clone and unshare triggers EINVAL")
("safe-clone and unshare succeeds"): New tests.
* gnu/build/linux-container.scm (run-container): Adjust to use 'safe-clone'.

Relates-to: #1169
Change-Id: I044c11a899e24e547a7aed97f30c8e7250ab5363
This commit is contained in:
Maxim Cournoyer 2025-10-17 23:12:27 +09:00
parent 3966f76297
commit 1eccea7ffb
No known key found for this signature in database
GPG key ID: 1260E46482E63562
3 changed files with 168 additions and 94 deletions

View file

@ -263,100 +263,93 @@ that host UIDs (respectively GIDs) map to in the namespace."
;; child process blocks until the parent writes to it.
(match (socketpair PF_UNIX (logior SOCK_CLOEXEC SOCK_STREAM) 0)
((child . parent)
(let ((flags (namespaces->bit-mask namespaces)))
(match (clone flags)
(0
;; Inhibit thread creation until after the unshare call.
(gc-disable)
(call-with-clean-exit
(lambda ()
(close-port parent)
;; Wait for parent to set things up.
(match (read child)
('ready
(purify-environment)
(when (and (memq 'mnt namespaces)
(not (string=? root "/")))
(catch #t
(lambda ()
(mount-file-systems root mounts
#:mount-/proc? (memq 'pid namespaces)
#:mount-/sys? (memq 'net
namespaces)
#:populate-file-system
(lambda ()
(populate-file-system)
(when (and (memq 'net namespaces)
loopback-network?)
(set-network-interface-up "lo")
(safe-clone
(namespaces->bit-mask namespaces)
(lambda ()
(call-with-clean-exit
(lambda ()
(close-port parent)
;; Wait for parent to set things up.
(match (read child)
('ready
(purify-environment)
(when (and (memq 'mnt namespaces)
(not (string=? root "/")))
(catch #t
(lambda ()
(mount-file-systems root mounts
#:mount-/proc? (memq 'pid namespaces)
#:mount-/sys? (memq 'net
namespaces)
#:populate-file-system
(lambda ()
(populate-file-system)
(when (and (memq 'net namespaces)
loopback-network?)
(set-network-interface-up "lo")
;; When isolated from the
;; network, provide a minimal
;; /etc/hosts to resolve
;; "localhost".
(mkdir-p "/etc")
(call-with-output-file "/etc/hosts"
(lambda (port)
(display "127.0.0.1 localhost\n" port)
(chmod port #o444)))))
#:writable-root?
(or writable-root?
(not (memq 'mnt namespaces)))))
(lambda args
;; Forward the exception to the parent process.
;; FIXME: SRFI-35 conditions and non-trivial objects
;; cannot be 'read' so they shouldn't be written as is.
(write args child)
(primitive-exit 3))))
;; When isolated from the
;; network, provide a minimal
;; /etc/hosts to resolve
;; "localhost".
(mkdir-p "/etc")
(call-with-output-file "/etc/hosts"
(lambda (port)
(display "127.0.0.1 localhost\n" port)
(chmod port #o444)))))
#:writable-root?
(or writable-root?
(not (memq 'mnt namespaces)))))
(lambda args
;; Forward the exception to the parent process.
;; FIXME: SRFI-35 conditions and non-trivial objects
;; cannot be 'read' so they shouldn't be written as is.
(write args child)
(primitive-exit 3))))
(when (and lock-mounts?
(memq 'mnt namespaces)
(memq 'user namespaces))
;; Create a new mount namespace owned by a new user
;; namespace to "lock" together previous mounts, such that
;; they cannot be unmounted or remounted separately--see
;; mount_namespaces(7).
;;
;; Note: at this point, the process is single-threaded (no
;; GC mark threads, no finalization thread, etc.) which is
;; why unshare(CLONE_NEWUSER) can be used.
(let ((uid (getuid)) (gid (getgid)))
(unshare (logior CLONE_NEWUSER CLONE_NEWNS))
(gc-enable)
(when (file-exists? "/proc/self")
(initialize-user-namespace (getpid)
host-uids
#:host-uid uid
#:host-gid gid
#:guest-uid guest-uid
#:guest-gid guest-gid))))
(when (and lock-mounts?
(memq 'mnt namespaces)
(memq 'user namespaces))
;; Create a new mount namespace owned by a new user
;; namespace to "lock" together previous mounts, such that
;; they cannot be unmounted or remounted separately--see
;; mount_namespaces(7).
(let ((uid (getuid)) (gid (getgid)))
(unshare (logior CLONE_NEWUSER CLONE_NEWNS))
(when (file-exists? "/proc/self")
(initialize-user-namespace (getpid)
host-uids
#:host-uid uid
#:host-gid gid
#:guest-uid guest-uid
#:guest-gid guest-gid))))
;; TODO: Manage capabilities.
(write 'ready child)
(close-port child)
(thunk))
(_ ;parent died or something
(primitive-exit 2))))))
(pid
(close-port child)
(when (memq 'user namespaces)
(initialize-user-namespace pid host-uids
#:guest-uid guest-uid
#:guest-gid guest-gid))
;; TODO: Initialize cgroups.
(write 'ready parent)
(newline parent)
;; TODO: Manage capabilities.
(write 'ready child)
(close-port child)
(thunk))
(_ ;parent died or something
(primitive-exit 2))))))
(lambda (pid)
(close-port child)
(when (memq 'user namespaces)
(initialize-user-namespace pid host-uids
#:guest-uid guest-uid
#:guest-gid guest-gid))
;; TODO: Initialize cgroups.
(write 'ready parent)
(newline parent)
;; Check whether the child process' setup phase succeeded.
(let ((message (read parent)))
(close-port parent)
(match message
('ready ;success
pid)
(((? symbol? key) args ...) ;exception
(apply throw key args))
(_ ;unexpected termination
#f)))))))))
;; Check whether the child process' setup phase succeeded.
(let ((message (read parent)))
(close-port parent)
(match message
('ready ;success
pid)
(((? symbol? key) args ...) ;exception
(apply throw key args))
(_ ;unexpected termination
#f))))))))
;; FIXME: This is copied from (guix utils), which we cannot use because it
;; would pull (guix config) and all.

View file

@ -150,6 +150,7 @@
CLONE_THREAD
CLONE_VM
clone
safe-clone
unshare
setns
get-user-ns
@ -1170,17 +1171,45 @@ caller lacks root privileges."
Turning finalization off shuts down the finalization thread as a side effect."
(->bool ((force proc) (if enabled? 1 0))))))
(define-syntax-rule (without-automatic-finalization exp)
"Turn off automatic finalization within the dynamic extent of EXP."
(define-syntax-rule (without-automatic-finalization body ...)
"Turn off automatic finalization within the dynamic extent of BODY. This is
useful to ensure there is no finalization thread."
(let ((enabled? #t))
(dynamic-wind
(lambda ()
(set! enabled? (%set-automatic-finalization-enabled?! #f)))
(lambda ()
exp)
body ...)
(lambda ()
(%set-automatic-finalization-enabled?! enabled?)))))
(define-syntax-rule (without-garbage-collection body ...)
"Turn off garbage collection within the dynamic extent of BODY. This is useful
to avoid the creation new garbage collection thread. Note that pre-existing
GC marker threads are only disabled, not terminated."
(dynamic-wind
(lambda ()
(gc-disable))
(lambda ()
body ...)
(lambda ()
(gc-enable))))
(define-syntax-rule (without-threads body ...)
"Ensure the Guile finalizer thread is stopped and that garbage collection does
not run. Note that pre-existing GC marker threads are only disabled, not
terminated. This also leaves the signal handling thread to be disabled by
another means, since there is no Guile API to do so."
;; Note: the three kind of threads that Guile can spawn are the finalization
;; thread, the signal thread, or the GC marker threads.
(without-automatic-finalization
(without-garbage-collection body ...)))
(define (ensure-signal-delivery-thread)
"Ensure the signal delivery thread is spawned and its state set
to 'RUNNING'. This is valid as of the implementation as of Guile 3.0.9."
(sigaction SIGUSR1)) ;could be any signal
;; The libc interface to sys_clone is not useful for Scheme programs, so the
;; low-level system call is wrapped instead. The 'syscall' function is
;; declared in <unistd.h> as a variadic function; in practice, it expects 6
@ -1223,6 +1252,24 @@ are shared between the parent and child processes."
(list err))
ret)))))
(define (safe-clone flags child parent)
"This is a raw clone syscall wrapper that ensures no Guile thread will be
spawned during execution of the child. `clone' is called with FLAGS. CHILD
is a thunk to run in the child process. PARENT is procedure that accepts the
child PID as argument. This is useful in many contexts, such as when calling
`unshare' or async-unsafe procedures in the child when the parent process
memory (CLONE_VM) or threads (CLONE_THREAD) are shared with it."
;; TODO: Contribute `clone' to Guile, and handle these complications there,
;; similarly to how it's handled for scm_fork in posix.c.
;; XXX: This is a hack: as of Guile 3.0.9, by starting the signal delivery
;; thread in the parent, its state will be known as RUNNING, and the child
;; won't attempt to start it itself.
(ensure-signal-delivery-thread)
(match (clone flags)
(0 (without-threads (child)))
(pid (parent pid))))
(define (thread-count)
"Return the complete thread count of the current process. Unlike
`all-threads', this also counts the Guile signal delivery, and finalizer

View file

@ -3,6 +3,7 @@
;;; Copyright © 2015 David Thompson <davet@gnu.org>
;;; Copyright © 2020 Simon South <simon@simonsouth.net>
;;; Copyright © 2020 Mathieu Othacehe <m.othacehe@gmail.com>
;;; Copyright © 2025 Maxim Cournoyer <maxim@guixotic.coop>
;;;
;;; This file is part of GNU Guix.
;;;
@ -29,7 +30,8 @@
#:use-module (srfi srfi-71)
#:use-module (system foreign)
#:use-module ((ice-9 ftw) #:select (scandir))
#:use-module (ice-9 match))
#:use-module (ice-9 match)
#:use-module (ice-9 threads))
;; Test the (guix build syscalls) module, although there's not much that can
;; actually be tested without being root.
@ -158,6 +160,38 @@
(lambda args
(system-error-errno args))))
(define child-thunk
(lambda ()
(gc) ;spawn GC threads
(primitive-exit
(catch 'system-error
(lambda ()
(unshare CLONE_THREAD)
0) ;no error
(lambda args
(system-error-errno args))))))
(define parent-proc
(lambda (pid)
(match (waitpid pid)
((_ . status)
(status:exit-val status)))))
(unless perform-container-tests?
(test-skip 1))
(test-equal "clone and unshare triggers EINVAL"
EINVAL
(match (clone (logior CLONE_NEWUSER SIGCHLD))
(0 (child-thunk))
(pid (parent-proc pid))))
(unless perform-container-tests?
(test-skip 1))
(test-equal "safe-clone and unshare succeeds"
0
(safe-clone (logior CLONE_NEWUSER SIGCHLD)
child-thunk parent-proc))
(unless perform-container-tests?
(test-skip 1))
(test-assert "setns"