metalos/metalctl/src/switch_root.rs (193 lines of code) (raw):

/* * Copyright (c) Facebook, Inc. and its affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ use std::path::Path; use anyhow::{anyhow, Context, Result}; use slog::{debug, o, Logger}; use structopt::StructOpt; use systemd::{FilePath, Systemd}; use crate::mount::{mount, Opts as MountOpts}; #[derive(StructOpt)] pub struct Opts { snapshot: Option<String>, } /// Prior to invoking `systemctl switch-root`, some setup work is required. /// Mainly, we need to fiddle with mounts so that /sysroot is the rw snapshot for /// the current bootid. This is necessary so that the newly invoked systemd has /// the correct root mount point. pub async fn switch_root(log: Logger, opts: Opts) -> Result<()> { let (device, options) = find_rootdisk_device().context("failed to find /rootdisk device")?; let mut options: Vec<_> = options.split(',').map(|s| s.to_string()).collect(); let mut log = log.new(o!()); if let Some(snapshot) = opts.snapshot { log = log.new(o!("snapshot" => snapshot.clone())); options = replace_subvol(options, &snapshot) .context("failed to replace subvolume in mount options")?; } std::fs::create_dir("/sysroot").context("failed to mkdir /sysroot")?; debug!( log, "mounting subvolume on {} with options {:?}", device, options ); mount( log.clone(), MountOpts { bind: false, source: device.clone(), target: "/sysroot".into(), fstype: Some("btrfs".into()), options: options.clone(), }, ) .with_context(|| format!("failed to mount '{}' on /sysroot {:?}", device, options))?; let sd = Systemd::connect(log.clone()).await?; // systemctl daemon-reload is necessary after mounting the // to-switch-root-into snapshot at /sysroot, since systemd will // automatically reload some unit configuration from /sysroot when running // inside the initrd, and this behavior is necessary to pass the correct // state of units into the new systemd in the root fs. debug!(log, "requesting systemd reload"); sd.reload() .await .context("failed to reload systemd units (systemctl daemon-reload)")?; debug!(log, "switch-rooting into /sysroot"); // ask systemd to switch-root to the new root fs sd.switch_root(FilePath::new("/sysroot"), FilePath::new("")) .await .context("failed to trigger switch-root (systemctl switch-root /syroot)") } fn replace_subvol<S: AsRef<str>, T: AsRef<str>>(options: Vec<S>, new: T) -> Result<Vec<String>> { let mut out = Vec::new(); for opt in options.into_iter() { if opt.as_ref().starts_with("subvolid=") { continue; } let new_op = match opt.as_ref().strip_prefix("subvol=") { Some(subvol) => { // the subvolume that we are switch-rooting into is guaranteed // to be nested under whatever subvolume is already mounted at // /run/fs/control. So we want to strip off the /run/fs/control // so that we can get the path relative to the top of the volume let new = match Path::new(new.as_ref()).strip_prefix(metalos_paths::control()) { Ok(subvol) => subvol, Err(_) => { return Err(anyhow!( "Found subvolume ({}) option but it didn't start with {}", new.as_ref(), metalos_paths::control().display(), )); } }; format!( "subvol={}/{}", subvol, new.to_str() .with_context(|| format!("new subvol {:?} was not utf-8", new))? ) } None => opt.as_ref().into(), }; out.push(new_op); } Ok(out) } fn find_rootdisk_device() -> Result<(String, String)> { let mounts = std::fs::read_to_string("/proc/mounts").context("failed to read /proc/mounts")?; let (dev, opts) = parse_rootdisk_device(mounts)?; // attempt to resolve any symlinks or otherwise non-canonical paths let dev = std::fs::canonicalize(&dev) .map(|path| path.to_string_lossy().into()) .unwrap_or(dev); Ok((dev, opts)) } /// Parse /proc/mounts output to find the device which is mounted at /run/fs/control fn parse_rootdisk_device(mounts: String) -> Result<(String, String)> { let (mut dev, opts): (String, String) = mounts .lines() .filter_map(|l| { let fields: Vec<_> = l.split_whitespace().collect(); if Path::new(fields[1]) == metalos_paths::control() { Some((fields[0].into(), fields[3].into())) } else { None } }) .next() .ok_or_else(|| anyhow!("{} not in mounts", metalos_paths::control().display()))?; // /proc/mounts escapes characters with octal if dev.contains('\\') { let mut octal_chars: Option<String> = None; dev = dev.chars().fold("".to_string(), |mut s, ch| { if let Some(ref mut oc) = octal_chars { oc.push(ch); if oc.len() == 3 { let escaped = u32::from_str_radix(&oc, 8) .with_context(|| format!("'{}' is not a valid octal number", &oc)) .unwrap(); let escaped = char::from_u32(escaped) .with_context(|| format!("0o{} is not a valid character", escaped)) .unwrap(); s.push(escaped); octal_chars = None; } } else if ch == '\\' { octal_chars = Some(String::new()); } else { s.push(ch); } s }); } Ok((dev, opts)) } #[cfg(test)] mod tests { use super::{parse_rootdisk_device, replace_subvol}; use anyhow::Result; #[test] fn rootdisk_device() -> Result<()> { let input = r#"rootfs / rootfs rw 0 0 proc /proc proc rw,nosuid,nodev,noexec,relatime 0 0 sysfs /sys sysfs rw,nosuid,nodev,noexec,relatime 0 0 devtmpfs /dev devtmpfs rw,nosuid,size=4096k,nr_inodes=65536,mode=755 0 0 securityfs /sys/kernel/security securityfs rw,nosuid,nodev,noexec,relatime 0 0 tmpfs /dev/shm tmpfs rw,nosuid,nodev 0 0 devpts /dev/pts devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=000 0 0 tmpfs /run tmpfs rw,nosuid,nodev,size=806188k,nr_inodes=819200,mode=755 0 0 cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot 0 0 pstore /sys/fs/pstore pstore rw,nosuid,nodev,noexec,relatime 0 0 bpf /sys/fs/bpf bpf rw,nosuid,nodev,noexec,relatime,mode=700 0 0 fs0 /data/users/vmagro/fbsource 9p ro,dirsync,relatime,loose,access=client,trans=virtio 0 0 fs2 /data/users/vmagro/scratch/dataZusersZvmagroZfbsource/buck-image-out 9p ro,dirsync,relatime,loose,access=client,trans=virtio 0 0 fs1 /mnt/gvfs 9p ro,dirsync,relatime,loose,access=client,trans=virtio 0 0 usr-local-fbcode /usr/local/fbcode 9p ro,dirsync,relatime,loose,access=client,trans=virtio 0 0 /dev/vdc /vmtest btrfs ro,relatime,space_cache,subvolid=256,subvol=/volume 0 0 /dev/vda /run/fs/control btrfs rw,relatime,space_cache,subvolid=256,subvol=/volume 0 0"#.to_string(); assert_eq!( parse_rootdisk_device(input)?, ( "/dev/vda".into(), "rw,relatime,space_cache,subvolid=256,subvol=/volume".into() ) ); let input = r#"rootfs / rootfs rw 0 0 /dev/vdc /vmtest btrfs ro,relatime,space_cache,subvolid=256,subvol=/volume 0 0 /dev/disk/by-label/\134x2f /run/fs/control btrfs rw,relatime,space_cache,subvolid=256,subvol=/volume 0 0"#.to_string(); assert_eq!( parse_rootdisk_device(input)?, ( r"/dev/disk/by-label/\x2f".into(), "rw,relatime,space_cache,subvolid=256,subvol=/volume".into() ) ); Ok(()) } #[test] fn subvol_replacements() { assert_eq!( replace_subvol( vec![ "rw", "relatime", "space_cache", "subvolid=256", "subvol=volume" ], "/run/fs/control/run/boot/0:bootid", ) .expect("Failed to call replace_subvol"), vec![ "rw", "relatime", "space_cache", "subvol=volume/run/boot/0:bootid", ], ) } }