pkg/raid/raid.go (204 lines of code) (raw):
// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raid
import (
"errors"
"fmt"
"os"
"regexp"
"slices"
"strings"
"k8s.io/klog/v2"
"github.com/GoogleCloudPlatform/csi-node-cache/pkg/util"
)
const (
mdadmCmd = "/bin/mdadm"
mdstatFile = "/proc/mdstat"
)
var (
mdstatInactive = regexp.MustCompile(`^([^ ]+) : inactive ([a-zA-Z0-9]+)`)
)
type RaidArray interface {
Init() error
Device() string
Stop() error
}
type mirrorArray struct {
target string
primary string
replicas []string
}
var _ RaidArray = &mirrorArray{}
type stripedArray struct {
target string
devices []string
}
func NewMirrorArray(target, primary string, replicas ...string) RaidArray {
return &mirrorArray{target: target, primary: primary, replicas: replicas}
}
func (m *mirrorArray) Device() string {
return m.target
}
func (m *mirrorArray) Init() error {
if err := validateDevice(m.primary); err != nil {
return err
}
for _, dev := range m.replicas {
if err := validateDevice(dev); err != nil {
return err
}
}
if err := stopAllInactive(); err != nil {
return err
}
primaryIsRaid, err := isExistingRaidVolume(m.target, m.primary)
if err != nil {
return fmt.Errorf("Error when checking if %s is already a raid disk: %w", m.primary, err)
}
if primaryIsRaid {
return assembleExistingMirror(m.target, m.primary, m.replicas...)
}
for _, repl := range m.replicas {
replIsRaid, err := isExistingRaidVolume(m.target, repl)
if err != nil {
return fmt.Errorf("Error when checking if replica %s is aleady a raid disk: %s", repl, err)
}
if replIsRaid {
return assembleExistingMirror(m.target, repl, slices.Concat([]string{m.primary}, m.replicas)...)
}
}
return createNewMirror(m.target, slices.Concat([]string{m.primary}, m.replicas)...)
}
func (m *mirrorArray) Stop() error {
return stopRaidDevice(m.Device())
}
func NewStripedArray(target string, devices ...string) RaidArray {
return &stripedArray{target: target, devices: devices}
}
func (s *stripedArray) Device() string {
return s.target
}
func (s *stripedArray) Init() error {
if err := isRaidDevice(s.target); err == nil {
return nil
}
for _, dev := range s.devices {
if err := validateDevice(dev); err != nil {
return err
}
}
if err := stopAllInactive(); err != nil {
return err
}
for _, dev := range s.devices {
isRaid, err := isExistingRaidVolume(s.target, dev)
if err != nil {
return fmt.Errorf("Error when checking if devicce %s is already a raid disk: %s", dev, err)
}
if isRaid {
return assembleExistingStriped(s.target, s.devices...)
}
}
return createNewStriped(s.target, s.devices...)
}
func (s *stripedArray) Stop() error {
return stopRaidDevice(s.Device())
}
func createNewMirror(target string, devices ...string) error {
output, err := runMdadm(slices.Concat([]string{"--create", target, "--level", "1", "--run", "--raid-devices", fmt.Sprintf("%d", len(devices))}, devices)...)
if err != nil {
return fmt.Errorf("Mirror raid creation for %s={%v} failed (%w): %s", target, devices, err, output)
}
return nil
}
func assembleExistingMirror(target, existing string, devices ...string) error {
for _, d := range devices {
if d != existing {
_ = wipeDevice(d) // Ignore any error, if there's a problem it will fail in the assemble
}
}
output, err := runMdadm("--assemble", target, existing, "--run")
if err != nil {
return fmt.Errorf("Could not bootstrap assemble from %s (%w): %s", existing, err, output)
}
output, err = runMdadm(slices.Concat([]string{"--add", target}, devices)...)
if err != nil {
_, _ = runMdadm("--stop", target) // Try to clean up as best we can
return fmt.Errorf("Could not add other devices to existing primary %s/%v (%w): %s", existing, devices, err, output)
}
return nil
}
func createNewStriped(target string, devices ...string) error {
// Force is needed if the number of devices is 1.
output, err := runMdadm(slices.Concat([]string{"--create", target, "--force", "--level", "0", "--run", "--raid-devices", fmt.Sprintf("%d", len(devices))}, devices)...)
if err != nil {
return fmt.Errorf("Striped raid creation for %s={%v} failed (%w): %s", target, devices, err, output)
}
return nil
}
func assembleExistingStriped(target string, devices ...string) error {
output, err := runMdadm(slices.Concat([]string{"--assemble", target}, devices, []string{"--run"})...)
if err != nil {
return fmt.Errorf("Existing assemble failed on %v (%w): %s", devices, err, output)
}
return nil
}
func stopAllInactive() error {
statBytes, err := os.ReadFile(mdstatFile)
if err != nil {
return fmt.Errorf("Cannot open %s for stopping inactive: %w", mdstatFile, err)
}
inactive_devices := getInactiveDevices(string(statBytes))
for _, device := range inactive_devices {
klog.Infof("Stopping inactive device %s", device)
err := stopRaidDevice(device)
if err != nil {
klog.Warningf("Could not stop inactive device %s, continuing anyway: %v", device, err)
}
}
return nil
}
func stopRaidDevice(device string) error {
if output, err := runMdadm("--stop", device); err != nil {
return fmt.Errorf("Could not stop %s (%v): %s", device, err, output)
}
return nil
}
func getInactiveDevices(mdstats string) []string {
stats := strings.Split(mdstats, "\n")
devices := []string{}
for _, line := range stats {
matches := mdstatInactive.FindStringSubmatch(line)
if len(matches) != 3 {
continue
}
devices = append(devices, fmt.Sprintf("/dev/%s", matches[1]))
}
return devices
}
func wipeDevice(device string) error {
if _, err := os.Stat(device); errors.Is(err, os.ErrNotExist) {
return fmt.Errorf("Device %s to be wiped does not exist", device)
}
_, _ = runMdadm("--zero-superblock", device)
// There's nothing to recover on errors. If the device was not already an array element, the command will fail.
return nil
}
func isRaidDevice(device string) error {
_, err := runMdadm("--detail", device)
return err // Maybe there's more information to extract from the output?
}
func validateDevice(device string) error {
info, err := os.Stat(device)
if err != nil {
return fmt.Errorf("Could not stat device %s raid: %w", device, err)
}
if info.Mode()&os.ModeDevice == 0 {
return fmt.Errorf("Expected %s to be a device", device)
}
return nil
}
func isExistingRaidVolume(target, device string) (bool, error) {
_, err := runMdadm("--examine", device)
return err == nil, nil
}
func runMdadm(args ...string) (string, error) {
output, err := util.RunCommand(mdadmCmd, args...)
return string(output), err
}