mirror of
https://codeberg.org/guix/guix.git
synced 2026-01-25 03:55:08 -06:00
build/syscalls: Introduce new safe-clone and use it.
* guix/build/syscalls.scm (without-automatic-finalization): Accept multiple
expressions.
(without-garbage-collection): New syntax.
(without-threads): Likewise.
(ensure-signal-delivery-thread, safe-clone): New procedures.
* tests/syscalls.scm: ("clone and unshare triggers EINVAL")
("safe-clone and unshare succeeds"): New tests.
* gnu/build/linux-container.scm (run-container): Adjust to use 'safe-clone'.
Relates-to: #1169
Change-Id: I044c11a899e24e547a7aed97f30c8e7250ab5363
This commit is contained in:
parent
3966f76297
commit
1eccea7ffb
3 changed files with 168 additions and 94 deletions
|
|
@ -263,100 +263,93 @@ that host UIDs (respectively GIDs) map to in the namespace."
|
|||
;; child process blocks until the parent writes to it.
|
||||
(match (socketpair PF_UNIX (logior SOCK_CLOEXEC SOCK_STREAM) 0)
|
||||
((child . parent)
|
||||
(let ((flags (namespaces->bit-mask namespaces)))
|
||||
(match (clone flags)
|
||||
(0
|
||||
;; Inhibit thread creation until after the unshare call.
|
||||
(gc-disable)
|
||||
(call-with-clean-exit
|
||||
(lambda ()
|
||||
(close-port parent)
|
||||
;; Wait for parent to set things up.
|
||||
(match (read child)
|
||||
('ready
|
||||
(purify-environment)
|
||||
(when (and (memq 'mnt namespaces)
|
||||
(not (string=? root "/")))
|
||||
(catch #t
|
||||
(lambda ()
|
||||
(mount-file-systems root mounts
|
||||
#:mount-/proc? (memq 'pid namespaces)
|
||||
#:mount-/sys? (memq 'net
|
||||
namespaces)
|
||||
#:populate-file-system
|
||||
(lambda ()
|
||||
(populate-file-system)
|
||||
(when (and (memq 'net namespaces)
|
||||
loopback-network?)
|
||||
(set-network-interface-up "lo")
|
||||
(safe-clone
|
||||
(namespaces->bit-mask namespaces)
|
||||
(lambda ()
|
||||
(call-with-clean-exit
|
||||
(lambda ()
|
||||
(close-port parent)
|
||||
;; Wait for parent to set things up.
|
||||
(match (read child)
|
||||
('ready
|
||||
(purify-environment)
|
||||
(when (and (memq 'mnt namespaces)
|
||||
(not (string=? root "/")))
|
||||
(catch #t
|
||||
(lambda ()
|
||||
(mount-file-systems root mounts
|
||||
#:mount-/proc? (memq 'pid namespaces)
|
||||
#:mount-/sys? (memq 'net
|
||||
namespaces)
|
||||
#:populate-file-system
|
||||
(lambda ()
|
||||
(populate-file-system)
|
||||
(when (and (memq 'net namespaces)
|
||||
loopback-network?)
|
||||
(set-network-interface-up "lo")
|
||||
|
||||
;; When isolated from the
|
||||
;; network, provide a minimal
|
||||
;; /etc/hosts to resolve
|
||||
;; "localhost".
|
||||
(mkdir-p "/etc")
|
||||
(call-with-output-file "/etc/hosts"
|
||||
(lambda (port)
|
||||
(display "127.0.0.1 localhost\n" port)
|
||||
(chmod port #o444)))))
|
||||
#:writable-root?
|
||||
(or writable-root?
|
||||
(not (memq 'mnt namespaces)))))
|
||||
(lambda args
|
||||
;; Forward the exception to the parent process.
|
||||
;; FIXME: SRFI-35 conditions and non-trivial objects
|
||||
;; cannot be 'read' so they shouldn't be written as is.
|
||||
(write args child)
|
||||
(primitive-exit 3))))
|
||||
;; When isolated from the
|
||||
;; network, provide a minimal
|
||||
;; /etc/hosts to resolve
|
||||
;; "localhost".
|
||||
(mkdir-p "/etc")
|
||||
(call-with-output-file "/etc/hosts"
|
||||
(lambda (port)
|
||||
(display "127.0.0.1 localhost\n" port)
|
||||
(chmod port #o444)))))
|
||||
#:writable-root?
|
||||
(or writable-root?
|
||||
(not (memq 'mnt namespaces)))))
|
||||
(lambda args
|
||||
;; Forward the exception to the parent process.
|
||||
;; FIXME: SRFI-35 conditions and non-trivial objects
|
||||
;; cannot be 'read' so they shouldn't be written as is.
|
||||
(write args child)
|
||||
(primitive-exit 3))))
|
||||
|
||||
(when (and lock-mounts?
|
||||
(memq 'mnt namespaces)
|
||||
(memq 'user namespaces))
|
||||
;; Create a new mount namespace owned by a new user
|
||||
;; namespace to "lock" together previous mounts, such that
|
||||
;; they cannot be unmounted or remounted separately--see
|
||||
;; mount_namespaces(7).
|
||||
;;
|
||||
;; Note: at this point, the process is single-threaded (no
|
||||
;; GC mark threads, no finalization thread, etc.) which is
|
||||
;; why unshare(CLONE_NEWUSER) can be used.
|
||||
(let ((uid (getuid)) (gid (getgid)))
|
||||
(unshare (logior CLONE_NEWUSER CLONE_NEWNS))
|
||||
(gc-enable)
|
||||
(when (file-exists? "/proc/self")
|
||||
(initialize-user-namespace (getpid)
|
||||
host-uids
|
||||
#:host-uid uid
|
||||
#:host-gid gid
|
||||
#:guest-uid guest-uid
|
||||
#:guest-gid guest-gid))))
|
||||
(when (and lock-mounts?
|
||||
(memq 'mnt namespaces)
|
||||
(memq 'user namespaces))
|
||||
;; Create a new mount namespace owned by a new user
|
||||
;; namespace to "lock" together previous mounts, such that
|
||||
;; they cannot be unmounted or remounted separately--see
|
||||
;; mount_namespaces(7).
|
||||
(let ((uid (getuid)) (gid (getgid)))
|
||||
(unshare (logior CLONE_NEWUSER CLONE_NEWNS))
|
||||
(when (file-exists? "/proc/self")
|
||||
(initialize-user-namespace (getpid)
|
||||
host-uids
|
||||
#:host-uid uid
|
||||
#:host-gid gid
|
||||
#:guest-uid guest-uid
|
||||
#:guest-gid guest-gid))))
|
||||
|
||||
;; TODO: Manage capabilities.
|
||||
(write 'ready child)
|
||||
(close-port child)
|
||||
(thunk))
|
||||
(_ ;parent died or something
|
||||
(primitive-exit 2))))))
|
||||
(pid
|
||||
(close-port child)
|
||||
(when (memq 'user namespaces)
|
||||
(initialize-user-namespace pid host-uids
|
||||
#:guest-uid guest-uid
|
||||
#:guest-gid guest-gid))
|
||||
;; TODO: Initialize cgroups.
|
||||
(write 'ready parent)
|
||||
(newline parent)
|
||||
;; TODO: Manage capabilities.
|
||||
(write 'ready child)
|
||||
(close-port child)
|
||||
(thunk))
|
||||
(_ ;parent died or something
|
||||
(primitive-exit 2))))))
|
||||
(lambda (pid)
|
||||
(close-port child)
|
||||
(when (memq 'user namespaces)
|
||||
(initialize-user-namespace pid host-uids
|
||||
#:guest-uid guest-uid
|
||||
#:guest-gid guest-gid))
|
||||
;; TODO: Initialize cgroups.
|
||||
(write 'ready parent)
|
||||
(newline parent)
|
||||
|
||||
;; Check whether the child process' setup phase succeeded.
|
||||
(let ((message (read parent)))
|
||||
(close-port parent)
|
||||
(match message
|
||||
('ready ;success
|
||||
pid)
|
||||
(((? symbol? key) args ...) ;exception
|
||||
(apply throw key args))
|
||||
(_ ;unexpected termination
|
||||
#f)))))))))
|
||||
;; Check whether the child process' setup phase succeeded.
|
||||
(let ((message (read parent)))
|
||||
(close-port parent)
|
||||
(match message
|
||||
('ready ;success
|
||||
pid)
|
||||
(((? symbol? key) args ...) ;exception
|
||||
(apply throw key args))
|
||||
(_ ;unexpected termination
|
||||
#f))))))))
|
||||
|
||||
;; FIXME: This is copied from (guix utils), which we cannot use because it
|
||||
;; would pull (guix config) and all.
|
||||
|
|
|
|||
|
|
@ -150,6 +150,7 @@
|
|||
CLONE_THREAD
|
||||
CLONE_VM
|
||||
clone
|
||||
safe-clone
|
||||
unshare
|
||||
setns
|
||||
get-user-ns
|
||||
|
|
@ -1170,17 +1171,45 @@ caller lacks root privileges."
|
|||
Turning finalization off shuts down the finalization thread as a side effect."
|
||||
(->bool ((force proc) (if enabled? 1 0))))))
|
||||
|
||||
(define-syntax-rule (without-automatic-finalization exp)
|
||||
"Turn off automatic finalization within the dynamic extent of EXP."
|
||||
(define-syntax-rule (without-automatic-finalization body ...)
|
||||
"Turn off automatic finalization within the dynamic extent of BODY. This is
|
||||
useful to ensure there is no finalization thread."
|
||||
(let ((enabled? #t))
|
||||
(dynamic-wind
|
||||
(lambda ()
|
||||
(set! enabled? (%set-automatic-finalization-enabled?! #f)))
|
||||
(lambda ()
|
||||
exp)
|
||||
body ...)
|
||||
(lambda ()
|
||||
(%set-automatic-finalization-enabled?! enabled?)))))
|
||||
|
||||
(define-syntax-rule (without-garbage-collection body ...)
|
||||
"Turn off garbage collection within the dynamic extent of BODY. This is useful
|
||||
to avoid the creation new garbage collection thread. Note that pre-existing
|
||||
GC marker threads are only disabled, not terminated."
|
||||
(dynamic-wind
|
||||
(lambda ()
|
||||
(gc-disable))
|
||||
(lambda ()
|
||||
body ...)
|
||||
(lambda ()
|
||||
(gc-enable))))
|
||||
|
||||
(define-syntax-rule (without-threads body ...)
|
||||
"Ensure the Guile finalizer thread is stopped and that garbage collection does
|
||||
not run. Note that pre-existing GC marker threads are only disabled, not
|
||||
terminated. This also leaves the signal handling thread to be disabled by
|
||||
another means, since there is no Guile API to do so."
|
||||
;; Note: the three kind of threads that Guile can spawn are the finalization
|
||||
;; thread, the signal thread, or the GC marker threads.
|
||||
(without-automatic-finalization
|
||||
(without-garbage-collection body ...)))
|
||||
|
||||
(define (ensure-signal-delivery-thread)
|
||||
"Ensure the signal delivery thread is spawned and its state set
|
||||
to 'RUNNING'. This is valid as of the implementation as of Guile 3.0.9."
|
||||
(sigaction SIGUSR1)) ;could be any signal
|
||||
|
||||
;; The libc interface to sys_clone is not useful for Scheme programs, so the
|
||||
;; low-level system call is wrapped instead. The 'syscall' function is
|
||||
;; declared in <unistd.h> as a variadic function; in practice, it expects 6
|
||||
|
|
@ -1223,6 +1252,24 @@ are shared between the parent and child processes."
|
|||
(list err))
|
||||
ret)))))
|
||||
|
||||
(define (safe-clone flags child parent)
|
||||
"This is a raw clone syscall wrapper that ensures no Guile thread will be
|
||||
spawned during execution of the child. `clone' is called with FLAGS. CHILD
|
||||
is a thunk to run in the child process. PARENT is procedure that accepts the
|
||||
child PID as argument. This is useful in many contexts, such as when calling
|
||||
`unshare' or async-unsafe procedures in the child when the parent process
|
||||
memory (CLONE_VM) or threads (CLONE_THREAD) are shared with it."
|
||||
;; TODO: Contribute `clone' to Guile, and handle these complications there,
|
||||
;; similarly to how it's handled for scm_fork in posix.c.
|
||||
|
||||
;; XXX: This is a hack: as of Guile 3.0.9, by starting the signal delivery
|
||||
;; thread in the parent, its state will be known as RUNNING, and the child
|
||||
;; won't attempt to start it itself.
|
||||
(ensure-signal-delivery-thread)
|
||||
(match (clone flags)
|
||||
(0 (without-threads (child)))
|
||||
(pid (parent pid))))
|
||||
|
||||
(define (thread-count)
|
||||
"Return the complete thread count of the current process. Unlike
|
||||
`all-threads', this also counts the Guile signal delivery, and finalizer
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
;;; Copyright © 2015 David Thompson <davet@gnu.org>
|
||||
;;; Copyright © 2020 Simon South <simon@simonsouth.net>
|
||||
;;; Copyright © 2020 Mathieu Othacehe <m.othacehe@gmail.com>
|
||||
;;; Copyright © 2025 Maxim Cournoyer <maxim@guixotic.coop>
|
||||
;;;
|
||||
;;; This file is part of GNU Guix.
|
||||
;;;
|
||||
|
|
@ -29,7 +30,8 @@
|
|||
#:use-module (srfi srfi-71)
|
||||
#:use-module (system foreign)
|
||||
#:use-module ((ice-9 ftw) #:select (scandir))
|
||||
#:use-module (ice-9 match))
|
||||
#:use-module (ice-9 match)
|
||||
#:use-module (ice-9 threads))
|
||||
|
||||
;; Test the (guix build syscalls) module, although there's not much that can
|
||||
;; actually be tested without being root.
|
||||
|
|
@ -158,6 +160,38 @@
|
|||
(lambda args
|
||||
(system-error-errno args))))
|
||||
|
||||
(define child-thunk
|
||||
(lambda ()
|
||||
(gc) ;spawn GC threads
|
||||
(primitive-exit
|
||||
(catch 'system-error
|
||||
(lambda ()
|
||||
(unshare CLONE_THREAD)
|
||||
0) ;no error
|
||||
(lambda args
|
||||
(system-error-errno args))))))
|
||||
|
||||
(define parent-proc
|
||||
(lambda (pid)
|
||||
(match (waitpid pid)
|
||||
((_ . status)
|
||||
(status:exit-val status)))))
|
||||
|
||||
(unless perform-container-tests?
|
||||
(test-skip 1))
|
||||
(test-equal "clone and unshare triggers EINVAL"
|
||||
EINVAL
|
||||
(match (clone (logior CLONE_NEWUSER SIGCHLD))
|
||||
(0 (child-thunk))
|
||||
(pid (parent-proc pid))))
|
||||
|
||||
(unless perform-container-tests?
|
||||
(test-skip 1))
|
||||
(test-equal "safe-clone and unshare succeeds"
|
||||
0
|
||||
(safe-clone (logior CLONE_NEWUSER SIGCHLD)
|
||||
child-thunk parent-proc))
|
||||
|
||||
(unless perform-container-tests?
|
||||
(test-skip 1))
|
||||
(test-assert "setns"
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue