Reinstate "linux-container: Remove #:lock-mounts? and related code."

This reverts commit e0e64be8de.
This commit is contained in:
Maxim Cournoyer 2025-10-16 16:36:43 +09:00
parent d0144544ff
commit 10edda5aa5
No known key found for this signature in database
GPG key ID: 1260E46482E63562
3 changed files with 107 additions and 41 deletions

View file

@ -190,7 +190,10 @@ for the process."
(remount-read-only "/"))))
(define* (initialize-user-namespace pid host-uids
#:key (guest-uid 0) (guest-gid 0))
#:key
(host-uid (getuid))
(host-gid (getgid))
(guest-uid 0) (guest-gid 0))
"Configure the user namespace for PID. HOST-UIDS specifies the number of
host user identifiers to map into the user namespace. GUEST-UID and GUEST-GID
specify the first UID (respectively GID) that host UIDs (respectively GIDs)
@ -201,24 +204,21 @@ map to in the namespace."
(define (scope file)
(string-append proc-dir file))
(let ((uid (getuid))
(gid (getgid)))
;; Only root can write to the gid map without first disabling the
;; setgroups syscall.
(unless (and (zero? uid) (zero? gid))
(call-with-output-file (scope "/setgroups")
(lambda (port)
(display "deny" port))))
;; Map the user/group that created the container to the root user
;; within the container.
(call-with-output-file (scope "/uid_map")
;; Only root can write to the gid map without first disabling the
;; setgroups syscall.
(unless (and (zero? host-uid) (zero? host-gid))
(call-with-output-file (scope "/setgroups")
(lambda (port)
(format port "~d ~d ~d" guest-uid uid host-uids)))
(call-with-output-file (scope "/gid_map")
(lambda (port)
(format port "~d ~d ~d" guest-gid gid host-uids)))))
(display "deny" port))))
;; Map the user/group that created the container to the root user
;; within the container.
(call-with-output-file (scope "/uid_map")
(lambda (port)
(format port "~d ~d ~d" guest-uid host-uid host-uids)))
(call-with-output-file (scope "/gid_map")
(lambda (port)
(format port "~d ~d ~d" guest-gid host-gid host-uids))))
(define (namespaces->bit-mask namespaces)
"Return the number suitable for the 'flags' argument of 'clone' that
@ -239,12 +239,14 @@ corresponds to the symbols in NAMESPACES."
#:key (guest-uid 0) (guest-gid 0)
(populate-file-system (const #t))
(loopback-network? #t)
(lock-mounts? #t)
writable-root?)
"Run THUNK in a new container process and return its PID. ROOT specifies
the root directory for the container. MOUNTS is a list of <file-system>
objects that specify file systems to mount inside the container. NAMESPACES
is a list of symbols that correspond to the possible Linux namespaces: mnt,
ipc, uts, user, and net.
ipc, uts, user, and net. When LOCK-MOUNTS? is true, arrange so that none of
MOUNTS can be unmounted or remounted individually from within THUNK.
When LOOPBACK-NETWORK? is true and 'net is amount NAMESPACES, set up the
loopback device (\"lo\") and a minimal /etc/hosts.
@ -304,6 +306,28 @@ that host UIDs (respectively GIDs) map to in the namespace."
;; cannot be 'read' so they shouldn't be written as is.
(write args child)
(primitive-exit 3))))
(when (and lock-mounts?
(memq 'mnt namespaces)
(memq 'user namespaces))
;; Create a new mount namespace owned by a new user
;; namespace to "lock" together previous mounts, such that
;; they cannot be unmounted or remounted separately--see
;; mount_namespaces(7).
;;
;; Note: at this point, the process is single-threaded (no
;; GC mark threads, no finalization thread, etc.) which is
;; why unshare(CLONE_NEWUSER) can be used.
(let ((uid (getuid)) (gid (getgid)))
(unshare (logior CLONE_NEWUSER CLONE_NEWNS))
(when (file-exists? "/proc/self")
(initialize-user-namespace (getpid)
host-uids
#:host-uid uid
#:host-gid gid
#:guest-uid guest-uid
#:guest-gid guest-gid))))
;; TODO: Manage capabilities.
(write 'ready child)
(close-port child)
@ -376,6 +400,7 @@ if there are no child processes left."
(define* (call-with-container mounts thunk #:key (namespaces %namespaces)
(host-uids 1) (guest-uid 0) (guest-gid 0)
(lock-mounts? #t)
(relayed-signals (list SIGINT SIGTERM))
(child-is-pid1? #t)
(populate-file-system (const #t))
@ -460,6 +485,7 @@ load path must be adjusted as needed."
(call-with-temporary-directory
(lambda (root)
(let ((pid (run-container root mounts namespaces host-uids thunk*
#:lock-mounts? lock-mounts?
#:guest-uid guest-uid
#:guest-gid guest-gid
#:populate-file-system populate-file-system
@ -480,24 +506,35 @@ return the exit status, an integer as returned by 'waitpid'."
(0
(call-with-clean-exit
(lambda ()
(for-each (lambda (ns)
(let ((source (namespace-file (getpid) ns))
(target (namespace-file pid ns)))
;; Joining the namespace that the process already
;; belongs to would throw an error so avoid that.
;; XXX: This /proc interface leads to TOCTTOU.
(unless (string=? (readlink source) (readlink target))
(call-with-input-file source
(lambda (current-ns-port)
(call-with-input-file target
(lambda (new-ns-port)
(setns (fileno new-ns-port) 0))))))))
;; It's important that the user namespace is joined first,
;; so that the user will have the privileges to join the
;; other namespaces. Furthermore, it's important that the
;; mount namespace is joined last, otherwise the /proc mount
;; point would no longer be accessible.
'("user" "ipc" "uts" "net" "pid" "mnt"))
;; First, determine the user namespace that owns the pid namespace and
;; join that user namespace (the assumption is that it also owns all
;; the other namespaces). It's important that the user namespace is
;; joined first, so that the user will have the privileges to join the
;; other namespaces.
(let* ((pid-ns (open-fdes (namespace-file pid "pid")
(logior O_CLOEXEC O_RDONLY)))
(user-ns (get-user-ns pid-ns)))
(close-fdes pid-ns)
(unless (equal? (stat user-ns)
(stat (namespace-file (getpid) "user")))
(setns user-ns 0))
(close-fdes user-ns)
;; Then join all the remaining namespaces.
(for-each (lambda (ns)
(let ((source (namespace-file (getpid) ns))
(target (namespace-file pid ns)))
;; Joining the namespace that the process already
;; belongs to would throw an error so avoid that.
;; XXX: This /proc interface leads to TOCTTOU.
(unless (string=? (readlink source) (readlink target))
(call-with-input-file target
(lambda (new-ns-port)
(setns (fileno new-ns-port) 0))))))
;; It's important that the mount namespace is joined last,
;; otherwise the /proc mount point would no longer be
;; accessible.
'("ipc" "uts" "net" "pid" "mnt")))
(purify-environment)
(chdir "/")

View file

@ -317,6 +317,10 @@ Run the container with the given options."))
#:namespaces (if #$shared-network?
(delq 'net %namespaces)
%namespaces)
;; XXX: Work around <https://issues.guix.gnu.org/78356>.
#:lock-mounts? #f
#:writable-root? #t
#:process-spawned-hook explain)))))

View file

@ -1,6 +1,6 @@
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2015 David Thompson <davet@gnu.org>
;;; Copyright © 2016, 2017, 2019, 2023 Ludovic Courtès <ludo@gnu.org>
;;; Copyright © 2016-2017, 2019, 2023, 2025 Ludovic Courtès <ludo@gnu.org>
;;;
;;; This file is part of GNU Guix.
;;;
@ -110,6 +110,26 @@
(assert-exit (file-exists? "/testing")))
#:namespaces '(user mnt))))
(skip-if-unsupported)
(test-equal "call-with-container, mnt namespace, locked mounts"
EINVAL
;; umount(2) fails with EINVAL when targeting a mount point that is
;; "locked".
(status:exit-val
(call-with-container (list (file-system
(device "none")
(mount-point "/testing")
(type "tmpfs")
(check? #f)))
(lambda ()
(primitive-exit (catch 'system-error
(lambda ()
(umount "/testing")
0)
(lambda args
(system-error-errno args)))))
#:namespaces '(user mnt))))
(skip-if-unsupported)
(test-equal "call-with-container, mnt namespace, wrong bind mount"
`(system-error ,ENOENT)
@ -169,7 +189,8 @@
#:namespaces '(user mnt))))
(skip-if-unsupported)
(test-assert "container-excursion"
(test-equal "container-excursion"
0
(call-with-temporary-directory
(lambda (root)
;; Two pipes: One for the container to signal that the test can begin,
@ -193,7 +214,11 @@
(readlink (string-append "/proc/" pid "/ns/" ns)))
'("user" "ipc" "uts" "net" "pid" "mnt"))))
(let* ((pid (run-container root '() %namespaces 1 container))
(let* ((pid (run-container root '() %namespaces 1 container
;; Do not lock mounts so the user namespace
;; appears to be the same seen from inside
;; and from outside.
#:lock-mounts? #f))
(container-namespaces (namespaces pid))
(result
(begin
@ -213,7 +238,7 @@
(write 'done end-out)
(close end-out)
(waitpid pid)
(zero? result)))))))
result))))))
(skip-if-unsupported)
(test-equal "container-excursion, same namespaces"