Skip to content

Commit fa91da8

Browse files
committed
Split userns code to its own struct
1 parent c436b1b commit fa91da8

File tree

2 files changed

+82
-50
lines changed

2 files changed

+82
-50
lines changed

src/runc/container.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ impl Container {
150150
MountAttrFlags::empty(),
151151
)?;
152152

153-
ns.enter(|| -> Result<_> {
153+
ns.with(|| -> Result<_> {
154154
// Don't interfere with us setting the desired mode!
155155
rustix::process::umask(Mode::empty());
156156

@@ -242,7 +242,7 @@ impl Container {
242242
(major, minor): (u32, u32),
243243
) -> Result<()> {
244244
let ns = crate::util::namespace::MntNamespace::of_pid(self.pid)?;
245-
ns.enter(|| {
245+
ns.with(|| {
246246
if let Some(parent) = node.parent() {
247247
let _ = std::fs::create_dir_all(parent);
248248
}
@@ -264,7 +264,7 @@ impl Container {
264264
}
265265

266266
pub async fn symlink(&self, source: &Path, link: &Path) -> Result<()> {
267-
crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| {
267+
crate::util::namespace::MntNamespace::of_pid(self.pid)?.with(|| {
268268
if let Some(parent) = link.parent() {
269269
let _ = std::fs::create_dir_all(parent);
270270
}
@@ -276,7 +276,7 @@ impl Container {
276276
}
277277

278278
pub async fn rm(&self, node: &Path) -> Result<()> {
279-
crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| {
279+
crate::util::namespace::MntNamespace::of_pid(self.pid)?.with(|| {
280280
let _ = std::fs::remove_file(node);
281281
})
282282
}

src/util/namespace.rs

Lines changed: 78 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use std::fs::File;
2+
use std::ops::Deref;
23
use std::os::fd::AsFd;
34
use std::path::Path;
45

@@ -38,23 +39,17 @@ impl IdMap {
3839
}
3940
}
4041

41-
pub struct MntNamespace {
42-
mnt_fd: File,
42+
pub struct UserNamespace {
4343
uid_map: IdMap,
4444
gid_map: IdMap,
4545
}
4646

47-
impl MntNamespace {
47+
impl UserNamespace {
4848
/// Read the user namespace ID mappings (`uid_map` and `gid_map`) of a process.
49-
pub fn of_pid(pid: Pid) -> Result<MntNamespace> {
50-
let mnt_fd = File::open(format!("/proc/{}/ns/mnt", pid.as_raw_nonzero()))?;
49+
pub fn of_pid(pid: Pid) -> Result<Self> {
5150
let uid_map = IdMap::read(format!("/proc/{}/uid_map", pid.as_raw_nonzero()).as_ref())?;
5251
let gid_map = IdMap::read(format!("/proc/{}/gid_map", pid.as_raw_nonzero()).as_ref())?;
53-
Ok(MntNamespace {
54-
mnt_fd,
55-
uid_map,
56-
gid_map,
57-
})
52+
Ok(Self { uid_map, gid_map })
5853
}
5954

6055
/// Check if we're in a user namespace.
@@ -72,48 +67,85 @@ impl MntNamespace {
7267
Ok(self.gid_map.translate(gid).context("GID overflows")?)
7368
}
7469

70+
/// "Enter" the user namespace.
71+
///
72+
/// This operation is not reversible.
73+
///
74+
/// This does not actually enter the user namespace, but rather just switches to become the root
75+
/// user inside the namespace.
76+
///
77+
/// Entering the user namespace turns out to be problematic.
78+
/// The reason seems to be this line [1]:
79+
/// which means `CAP_MKNOD` capability of the *init* namespace is needed.
80+
/// However task's associated security context is all relative to its current
81+
/// user namespace [2], so once you enter a user namespace there's no way of getting
82+
/// back `CAP_MKNOD` of the init namespace anymore.
83+
/// (Yes this means that even if CAP_MKNOD is granted to the container, you cannot
84+
/// create device nodes within it.)
85+
///
86+
/// [1]: https://elixir.bootlin.com/linux/v6.11.1/source/fs/namei.c#L4073
87+
/// [2]: https://elixir.bootlin.com/linux/v6.11.1/source/include/linux/cred.h#L111
88+
pub fn enter(&self) -> Result<()> {
89+
// By default `setuid` will drop capabilities when transitioning from root
90+
// to a non-root user. This bit prevents that so our code still has its superpowers.
91+
rustix::thread::set_capabilities_secure_bits(CapabilitiesSecureBits::NO_SETUID_FIXUP)?;
92+
93+
rustix::thread::set_thread_uid(Uid::from_raw(self.uid(0)?))?;
94+
rustix::thread::set_thread_gid(Gid::from_raw(self.gid(0)?))?;
95+
Ok(())
96+
}
97+
}
98+
99+
pub struct MntNamespace {
100+
mnt_fd: File,
101+
user_ns: UserNamespace,
102+
}
103+
104+
impl Deref for MntNamespace {
105+
type Target = UserNamespace;
106+
107+
fn deref(&self) -> &UserNamespace {
108+
&self.user_ns
109+
}
110+
}
111+
112+
impl MntNamespace {
113+
/// Open the mount namespace of a process.
114+
pub fn of_pid(pid: Pid) -> Result<MntNamespace> {
115+
let mnt_fd = File::open(format!("/proc/{}/ns/mnt", pid.as_raw_nonzero()))?;
116+
let user_ns = UserNamespace::of_pid(pid)?;
117+
Ok(MntNamespace { mnt_fd, user_ns })
118+
}
119+
75120
/// Enter the mount namespace.
76-
pub fn enter<T: Send, F: FnOnce() -> T + Send>(&self, f: F) -> Result<T> {
121+
///
122+
/// This operation is not reversible.
123+
pub fn enter(&self) -> Result<()> {
124+
// Unshare FS for this specific thread so we can switch to another namespace.
125+
// Not doing this will cause EINVAL when switching to namespaces.
126+
rustix::thread::unshare(UnshareFlags::FS)?;
127+
128+
// Switch this particular thread to the container's mount namespace.
129+
rustix::thread::move_into_link_name_space(
130+
self.mnt_fd.as_fd(),
131+
Some(LinkNameSpaceType::Mount),
132+
)?;
133+
134+
// If a user namespace is used, we must act like the root user *inside*
135+
// the namespace to be able to create files properly (otherwise EOVERFLOW
136+
// will be returned when creating file).
137+
self.user_ns.enter()?;
138+
Ok(())
139+
}
140+
141+
/// Execute inside the mount namespace.
142+
pub fn with<T: Send, F: FnOnce() -> T + Send>(&self, f: F) -> Result<T> {
77143
// To avoid messing with rest of the process, we do everything in a new thread.
78144
// Use scoped thread to avoid 'static bound (we need to access fd).
79145
std::thread::scope(|scope| {
80146
scope
81147
.spawn(|| -> Result<T> {
82-
// Unshare FS for this specific thread so we can switch to another namespace.
83-
// Not doing this will cause EINVAL when switching to namespaces.
84-
rustix::thread::unshare(UnshareFlags::FS)?;
85-
86-
// Switch this particular thread to the container's mount namespace.
87-
rustix::thread::move_into_link_name_space(
88-
self.mnt_fd.as_fd(),
89-
Some(LinkNameSpaceType::Mount),
90-
)?;
91-
92-
// If user namespace is used, we must act like the root user *inside*
93-
// namespace to be able to create files properly (otherwise EOVERFLOW
94-
// will be returned when creating file).
95-
//
96-
// Entering the user namespace turns out to be problematic.
97-
// The reason seems to be this line [1]:
98-
// which means `CAP_MKNOD` capability of the *init* namespace is needed.
99-
// However task's associated security context is all relative to its current
100-
// user namespace [2], so once you enter a user namespace there's no way of getting
101-
// back `CAP_MKNOD` of the init namespace anymore.
102-
// (Yes this means that even if CAP_MKNOD is granted to the container, you cannot
103-
// create device nodes within it.)
104-
//
105-
// [1]: https://elixir.bootlin.com/linux/v6.11.1/source/fs/namei.c#L4073
106-
// [2]: https://elixir.bootlin.com/linux/v6.11.1/source/include/linux/cred.h#L111
107-
108-
// By default `setuid` will drop capabilities when transitioning from root
109-
// to non-root user. This bit prevents it so our code still have superpower.
110-
rustix::thread::set_capabilities_secure_bits(
111-
CapabilitiesSecureBits::NO_SETUID_FIXUP,
112-
)?;
113-
114-
rustix::thread::set_thread_uid(Uid::from_raw(self.uid(0)?))?;
115-
rustix::thread::set_thread_gid(Gid::from_raw(self.gid(0)?))?;
116-
148+
self.enter()?;
117149
Ok(f())
118150
})
119151
.join()

0 commit comments

Comments
 (0)