antlir/loopback.py (235 lines of code) (raw):
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
This is a poor man's port of set_up_volume.sh to allow `package.new` to
emit btrfs loopbacks.
"""
import os
import subprocess
import sys
from typing import Iterable, Optional
from .bzl.loopback_opts import loopback_opts_t
from .common import get_logger, kernel_version, run_stdout_to_err
from .fs_utils import Path, temp_dir
from .unshare import Unshare, nsenter_as_root, nsenter_as_user
# Module-wide logger used by every class and function in this file; it is
# obtained from the `get_logger` helper imported from `.common`.
log = get_logger()
# Binary size units, expressed in bytes.
KiB = 1 << 10
MiB = 1 << 20

# Smallest image that `mkfs.btrfs` agrees to format.  With anything smaller
# it fails with:
#   ERROR: minimum size for each btrfs device is 114294784
MIN_CREATE_BYTES = MiB * 109

# For reference: the smallest size to which btrfs will GROW a tiny
# filesystem.  For lower values, `btrfs resize` prints:
#   ERROR: unable to resize '_foo/volume': Invalid argument
# MIN_GROW_BYTES = 175 * MiB
#
# Lower bound used when shrinking: when a filesystem's `min-dev-size` is
# small, a `btrfs resize` to less than this limit fails to shrink with
# `Invalid argument`.
MIN_SHRINK_BYTES = MiB * 256
class LoopbackVolume:
    """
    Context manager that loop-mounts a filesystem image.

    On entry, a temporary directory is created, a `volume` subdirectory is
    made inside it, and `image_path` is mounted there as `fs_type`.  On
    exit, the mount point is unmounted and the temporary-directory context
    is exited.

    Privileged commands are built with `nsenter_as_root` /
    `nsenter_as_user`, which receive the (optional) `unshare` object.
    """

    def __init__(
        self,
        unshare: Optional[Unshare],
        image_path: Path,
        fs_type: str,
        mount_options: Optional[Iterable[str]] = None,
        loopback_opts: Optional[loopback_opts_t] = None,
    ) -> None:
        """
        Record the mount parameters; nothing is mounted until `__enter__`.

        - `unshare`: passed through to `nsenter_as_root` / `nsenter_as_user`
          for every namespaced command; may be `None`.
        - `image_path`: the image file to mount (stored as an absolute path).
        - `fs_type`: passed to `mount -t`.
        - `mount_options`: extra options appended to the built-in
          `loop,discard,nobarrier` set; `None` or empty means "no extras".
        - `loopback_opts`: stored for use by subclasses.
        """
        self._unshare = unshare
        self._temp_dir_ctx = temp_dir()
        self._image_path = Path(image_path).abspath()
        self._fs_type = fs_type
        self._mount_dir: Optional[Path] = None
        # Snapshot the options into a list right away.  An `Iterable` may be
        # a one-shot iterator, which is always truthy even when empty, and
        # an empty one must not add a dangling "," to the `-o` argument
        # that `mount()` builds.
        self._mount_options = list(mount_options or ()) or None
        self._temp_dir: Optional[Path] = None
        self._loopback_opts: Optional[loopback_opts_t] = loopback_opts

    def __enter__(self) -> "LoopbackVolume":
        """
        Create the temporary mount point and mount the image on it.

        If anything after entering the temporary directory fails,
        `__exit__` is invoked with the active exception so that partial
        setup is undone, and the exception is re-raised.
        """
        self._temp_dir = self._temp_dir_ctx.__enter__().abspath()
        try:
            self._mount_dir = self._temp_dir / b"volume"
            os.mkdir(self._mount_dir)
            # pyre-fixme[16]: `LoopbackVolume` has no attribute `_loop_dev`.
            self._loop_dev = self.mount()
        except BaseException:  # pragma: nocover
            self.__exit__(*sys.exc_info())
            raise
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        "This only suppresses exceptions if TemporaryDirectory.__exit__ does."
        if self._mount_dir:
            # If this throws, we won't be able to clean up `_mount_dir`, so
            # let the error fly.  If the loopback is inside an Unshare
            # object, the mount itself will eventually get cleaned up, but
            # we don't have ownership to trigger Unshare cleanup, and in any
            # case, that kind of clean-up is asynchronous, and would be
            # tricky to await properly.
            #
            # NB: It's possible to use tmpfs and namespaces to guarantee
            # cleanup, but it's just an empty directory in `/tmp`, so it's
            # really not worth the complexity.
            self.unmount_if_mounted()
        return self._temp_dir_ctx.__exit__(exc_type, exc_val, exc_tb)

    def mount(self) -> Path:
        """
        Mount the image at `self._mount_dir` and return its loop device.

        The device is whatever `findmnt --output SOURCE` reports for the
        mount point, with the trailing newline stripped.  After mounting,
        two `losetup` tweaks are attempted on that device; a failure of
        either is only logged.  The `mount` command itself runs with
        `check=True`.
        """
        mount_opts = "loop,discard,nobarrier"
        if self._mount_options:
            mount_opts += ",{}".format(",".join(self._mount_options))
        log.info(
            f"Mounting {self._fs_type} {self._image_path} at {self._mount_dir} "
            f"with {mount_opts}"
        )
        # Explicitly set filesystem type to detect shenanigans.
        run_stdout_to_err(
            nsenter_as_root(
                self._unshare,
                "mount",
                "-t",
                self._fs_type,
                "-o",
                mount_opts,
                self._image_path,
                # pyre-fixme[6]: Expected `List[Variable[typing.AnyStr <: [str,
                #  bytes]]]` for 8th param but got `Optional[Path]`.
                self._mount_dir,
            ),
            check=True,
        )
        loop_dev = subprocess.check_output(
            nsenter_as_user(
                self._unshare,
                "findmnt",
                "--noheadings",
                "--output",
                "SOURCE",
                # pyre-fixme[6]: Expected `List[Variable[typing.AnyStr <: [str,
                #  bytes]]]` for 6th param but got `Optional[Path]`.
                self._mount_dir,
            )
        ).rstrip(b"\n")
        # This increases the chances that --direct-io=on will succeed, since one
        # of the common failure modes is that the loopback's sector size is NOT
        # a multiple of the sector size of the underlying device (the devices
        # we've seen in production have sector sizes of 512, 1024, or 4096).
        #
        # NB: both `losetup` calls below go through plain `sudo`, not through
        # `nsenter_as_root`.
        if (
            run_stdout_to_err(
                ["sudo", "losetup", "--sector-size=4096", loop_dev]
            ).returncode
            != 0
        ):  # pragma: nocover
            log.error(
                f"Failed to set --sector-size=4096 for {loop_dev}, setting "
                "direct IO is more likely to fail."
            )
        # This helps perf and avoids doubling our usage of buffer cache.
        # Also, when the image is on tmpfs, setting direct IO fails.
        if (
            run_stdout_to_err(
                ["sudo", "losetup", "--direct-io=on", loop_dev]
            ).returncode
            != 0
        ):  # pragma: nocover
            log.error(
                f"Could not enable --direct-io for {loop_dev}, expect worse "
                "performance."
            )
        # pyre-fixme[7]: Expected `Path` but got `bytes`.
        return loop_dev

    def unmount_if_mounted(self) -> None:
        """
        Run `umount` on the mount directory, if one has been set.

        The command is run without `check=True`, so a failure (for example
        because nothing was mounted) is not raised.
        """
        if self._mount_dir:
            # Nothing might have been mounted, ignore exit code
            run_stdout_to_err(
                nsenter_as_root(self._unshare, "umount", self._mount_dir)
            )

    def dir(self) -> Path:
        """
        Return the mount-point directory.

        This is `None` until `__enter__` has created the directory.
        """
        # pyre-fixme[7]: Expected `Path` but got `Optional[Path]`.
        return self._mount_dir
def btrfs_compress_mount_opts() -> str:
    """
    Return the btrfs `compress=` mount option appropriate for this kernel.

    zstd level 19 is requested unless `kernel_version()` reports a kernel
    older than 5.1, in which case plain `compress=zstd` is used.
    """
    # kernel versions pre-5.1 did not support compression level tuning
    if kernel_version() < (5, 1):
        return "compress=zstd"
    return "compress=zstd:19"
class BtrfsLoopbackVolume(LoopbackVolume):
    """
    A `LoopbackVolume` whose image is freshly formatted as btrfs.

    Entering the context first creates (or resizes) the image file and runs
    `mkfs.btrfs` on it, and only then mounts it via the base class.  The
    mount always uses the option returned by `btrfs_compress_mount_opts()`.
    """

    def __init__(self, size_bytes: int, **kwargs) -> None:
        """
        - `size_bytes`: requested image size; must be at least
          `MIN_CREATE_BYTES`.
        - `**kwargs`: forwarded to `LoopbackVolume.__init__`.  `fs_type` and
          `mount_options` are supplied here and must not be passed.

        Raises `AttributeError` if `size_bytes` is below `MIN_CREATE_BYTES`.
        """
        if size_bytes < MIN_CREATE_BYTES:
            raise AttributeError(
                f"A btrfs loopback must be at least {MIN_CREATE_BYTES} bytes. "
                f"requested size: {size_bytes}"
            )
        self._size_bytes = size_bytes
        super().__init__(
            mount_options=[btrfs_compress_mount_opts()],
            fs_type="btrfs",
            **kwargs,
        )

    def __enter__(self) -> "BtrfsLoopbackVolume":
        """
        Format the image, then mount it through the base class.

        If formatting fails, `__exit__` is called with the active exception
        and the exception is re-raised.
        """
        try:
            self._format()
        except BaseException:  # pragma: nocover
            self.__exit__(*sys.exc_info())
            raise
        # The base `__enter__` returns `self`; returning `self` explicitly
        # keeps the declared return type accurate.
        super().__enter__()
        return self

    def _create_or_resize_image_file(self, size_bytes: int) -> int:
        """
        Set the image file's size with `truncate -s` and return the size
        actually used, which may have been rounded up (see below).

        If this is resizing an existing loopback that is mounted, then
        be sure to call `btrfs filesystem resize` and `losetup --set-capacity`
        in the appropriate order.
        """
        # Avoid an old kernel bug that is fixed since 4.16:
        #   btrfs soft lockup: `losetup --set-capacity /dev/loopN`
        #   wrongly sets block size to 1024 when backing file size is 4096-odd.
        #
        # Future: maybe we shouldn't hardcode 4096, but instead query:
        #   blockdev --getbsz /dev/loopSOMETHING
        if kernel_version() < (4, 16):
            block_size = 4096
            # Round up to the next multiple of `block_size`; a size that is
            # already aligned is left unchanged.
            rounded = (
                size_bytes
                + (block_size - (size_bytes % block_size)) % block_size
            )
            if size_bytes != rounded:
                log.warning(
                    f"Rounded image size {size_bytes} up to {rounded} to avoid "
                    "kernel bug.",
                )
                size_bytes = rounded
        run_stdout_to_err(
            ["truncate", "-s", str(size_bytes), self._image_path], check=True
        )
        return size_bytes

    def receive(self, send: int) -> subprocess.CompletedProcess:
        """
        Receive a btrfs sendstream from the `send` fd

        Runs `btrfs receive` on the mounted volume directory with `send` as
        stdin and stderr captured.  The command is run without `check=True`,
        so callers must inspect the returned process themselves.
        """
        return run_stdout_to_err(
            # Use the module-level helper like every other command in this
            # file does: `self._unshare` is Optional, and calling a method
            # on it directly would fail with `AttributeError` when it is
            # `None`.
            nsenter_as_root(
                self._unshare,
                "btrfs",
                "receive",
                self.dir(),
            ),
            stdin=send,
            stderr=subprocess.PIPE,
        )

    def _format(self) -> None:
        """
        Format the loopback image with a btrfs filesystem of size
        `self._size_bytes`

        `self._size_bytes` is updated to the size actually given to the
        image file.  A `--label` is passed to `mkfs.btrfs` only when
        `loopback_opts` is set and carries a non-empty `label`.
        """
        log.info(
            f"Formatting btrfs {self._size_bytes}-byte FS at {self._image_path}"
        )
        self._size_bytes = self._create_or_resize_image_file(self._size_bytes)
        maybe_label = (
            ["--label", self._loopback_opts.label]
            if self._loopback_opts and self._loopback_opts.label
            else []
        )
        # Note that this can fail with 'cannot check mount status' if the
        # host is in a bad state:
        #  - a file backing a loop device got deleted, or
        #  - multiple filesystems with the same UUID got mounted as a loop
        #    device, breaking the metadata for the affected loop device (this
        #    latter issue is a kernel bug).
        # We don't check for this error case since there's nothing we can do to
        # remediate it.
        # The default profile for btrfs filesystem is the DUP. The man page
        # says:
        # > The mkfs utility will let the user create a filesystem with profiles
        # > that write the logical blocks to 2 physical locations.
        # Switching to the SINGLE profile (below) saves a lot of space (30-40%
        # as reported by `btrfs inspect-internal min-dev-size`), and loses some
        # redundancy on rotational hard drives. Long history of using
        # `-m single` never showed any issues with such lesser redundancy.
        run_stdout_to_err(
            [
                "mkfs.btrfs",
                "--metadata",
                "single",
                *maybe_label,
                self._image_path,
            ],
            check=True,
        )

    def minimize_size(self) -> int:
        """
        Minimizes the loopback as much as possible by inspecting
        the btrfs internals and resizing the filesystem explicitly.
        Returns the new size of the loopback in bytes.

        The target size is the value reported by
        `btrfs inspect-internal min-dev-size`, clamped to be no smaller
        than `MIN_SHRINK_BYTES`.  If that target is not below the current
        size, nothing is changed and the current size is returned.
        """
        # Output is expected to start with "<number> bytes ...".
        min_size_out = subprocess.check_output(
            nsenter_as_root(
                self._unshare,
                "btrfs",
                "inspect-internal",
                "min-dev-size",
                # pyre-fixme[6]: Expected `List[Variable[typing.AnyStr <: [str,
                #  bytes]]]` for 5th param but got `Optional[Path]`.
                self._mount_dir,
            )
        ).split(b" ")
        assert min_size_out[1] == b"bytes"
        maybe_min_size_bytes = int(min_size_out[0])
        # Btrfs filesystems cannot be resized below a certain limit, so if we
        # have a smaller fs than the limit, we just use the limit.
        min_size_bytes = max(maybe_min_size_bytes, MIN_SHRINK_BYTES)
        if min_size_bytes >= self._size_bytes:
            log.info(
                f"Nothing to do: the minimum resize limit {min_size_bytes} "
                "is no less than the current filesystem size of "
                f"{self._size_bytes} bytes."
            )
            return self._size_bytes
        log.info(
            f"Shrinking {self._image_path} to the btrfs minimum: "
            f"{min_size_bytes} bytes."
        )
        # Order matters: shrink the filesystem first, then the backing file,
        # then tell the loop device about its new capacity.
        run_stdout_to_err(
            nsenter_as_root(
                self._unshare,
                "btrfs",
                "filesystem",
                "resize",
                str(min_size_bytes),
                # pyre-fixme[6]: Expected `List[Variable[typing.AnyStr <: [str,
                #  bytes]]]` for 6th param but got `Optional[Path]`.
                self._mount_dir,
            ),
            check=True,
        )
        # Ask the mount for the filesystem size it now reports, and size the
        # image file from that rather than from the value we requested.
        fs_bytes = int(
            subprocess.check_output(
                nsenter_as_user(
                    self._unshare,
                    "findmnt",
                    "--bytes",
                    "--noheadings",
                    "--output",
                    "SIZE",
                    # pyre-fixme[6]: Expected `List[Variable[typing.AnyStr <: [str,
                    #  bytes]]]` for 7th param but got `Optional[Path]`.
                    self._mount_dir,
                )
            )
        )
        self._create_or_resize_image_file(fs_bytes)
        run_stdout_to_err(
            # pyre-fixme[16]: `BtrfsLoopbackVolume` has no attribute
            #  `_loop_dev`.
            ["sudo", "losetup", "--set-capacity", self._loop_dev],
            check=True,
        )
        # Sanity check: the size reported by the mount must be exactly the
        # size we asked btrfs to shrink to.
        assert min_size_bytes == fs_bytes
        self._size_bytes = min_size_bytes
        return self._size_bytes

    def get_size(self) -> int:
        """Return the currently recorded size of the loopback, in bytes."""
        return self._size_bytes