# cookbooks/fb_storage/libraries/storage.rb
# vim: syntax=ruby:expandtab:shiftwidth=2:softtabstop=2:tabstop=2
#
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
module FB
# The storage class takes the user-specified config, and provides useful
# interfaces to it. It maps it to real devices, and also generates fstab
# configs from it. There are also some util class methods
class Storage
# Directory where external automation drops one file per hot-swapped disk
# (consumed by disks_from_automation)
REPLACED_DISKS_DIR = '/var/chef/hotswap_replaced_disks'.freeze
# Touch-file forcing a converge of all storage on the next run
CONVERGE_ALL_FILE = '/var/chef/storage_force_converge_all'.freeze
# Touch-file forcing an erase of all storage on the next run
ERASE_ALL_FILE = '/var/chef/storage_force_erase_all'.freeze
# Marker that a forced erase-all has already been performed
ALREADY_ERASED_ALL_FILE = '/var/chef/.storage_already_erased_all'.freeze
# Persistent cache of disk ordering from previous runs (read by
# load_previous_disk_order, written by write_out_disk_order)
PREVIOUS_DISK_ORDER = '/etc/.chef_disk_order'.freeze
# Touch-file forcing a re-write of a custom disk order (see
# _handle_custom_device_order_method)
FORCE_WRITE_CUSTOM_DISK_ORDER =
  '/var/chef/storage_force_write_custom_disk_order'.freeze
# udev's stable-id symlink farm, used to map devices to persistent ids
DEV_ID_DIR = '/dev/disk/by-id'.freeze
# 'size' from sysfs always assumes 512 byte blocks
SECTOR_SIZE = 512
# Helper function for hybrid XFS users. Given the index (into
# `eligible_devices`) of the device to be used for metadata, and the number
# of filesystems we expect to create, it will return the size of each
# metadata partition.
def self.hybrid_xfs_md_part_size(node, md_idx, num_fses,
                                 sectors_reserved = 0)
  usable_sectors = self.hybrid_md_idx_size(node, md_idx) - sectors_reserved
  # Round each partition's share down to a 2048-sector boundary so the
  # partitions stay page-aligned.
  aligned_sectors = (usable_sectors / num_fses) / 2048 * 2048
  # Convert sectors to bytes, then divide by 1MB to get MB per partition
  (aligned_sectors * SECTOR_SIZE) / (1024 * 1024)
end
# Size in 512-byte sectors (per sysfs) of the md_idx-th sorted eligible
# device, as reported by ohai's block_device data.
def self.hybrid_md_idx_size(node, md_idx)
  md_dev = self.sorted_devices(node, FB::Fstab.get_in_maint_disks)[md_idx]
  node['block_device'][md_dev]['size'].to_i
end
# True when the mountpoint `mp` is currently backed by a whole, bare
# device (no partition suffix); false when it doesn't exist yet or is
# backed by a partition.
def self.mountpoint_uses_whole_device(node, mp)
  mp_data = node.filesystem_data['by_mountpoint'][mp]
  # if the mountpoint doesn't exist, we're new and can build it
  # on a partition
  return false unless mp_data
  dev = mp_data['devices'][0]
  # Partition suffix is 'p<N>' on nvme/ether-style devices and a bare
  # '<N>' on everything else; if the backing device is a partition we
  # report false so it stays a partition.
  partition_re = dev.match(/(nvme|ether)/) ? /p\d+$/ : /\d+$/
  return false if dev.match(partition_re)
  # if we're not new AND we didn't find a partition, then we can keep
  # this on a whole device
  true
end
# List of devices the storage API shouldn't touch, so the ones holding
# / and /boot as touching those could result in tears
def self.devices_to_skip(node)
  # Legacy. We should probably fail hard here
  return [] unless node.device_of_mount('/')
  root_dev = self.root_device_name(node)
  return [] unless root_dev
  skip = Set.new([root_dev, self.boot_device_name(node)])
  # When / or /boot lives on an md RAID device, every member disk of
  # that array must be skipped as well.
  skip.dup.each do |dev|
    next unless dev&.start_with?('md')
    Dir.glob("/sys/block/#{dev}/slaves/*").each do |slave|
      skip << ::File.basename(slave)
    end
  end
  skip
end
# List of devices eligible for managing by the fb_storage storage system,
# i.e., non-root & boot devices
def self.eligible_devices(node)
  skip_list = self.devices_to_skip(node)
  # Legacy. We should probably fail hard here
  return [] if skip_list.length.zero?
  # Stripping digits turns 'md0' into 'md', 'dm-3' into 'dm-', etc., so
  # we can filter whole device families by their base name.
  ignored_families = ['ram', 'loop', 'dm-', 'sr', 'md']
  node['block_device'].to_hash.keys.reject do |dev|
    ignored_families.include?(dev.delete('0-9')) || skip_list.include?(dev)
  end
end
# Short device name (e.g. 'sda', 'md0') backing the root filesystem.
def self.root_device_name(node)
  self.get_device_name(node, '/')
end
# Short device name backing /boot, or nil when /boot is not a separate
# mountpoint.
def self.boot_device_name(node)
  return nil unless Pathname.new('/boot').mountpoint?
  self.get_device_name(node, '/boot')
end
# Return the short device name of the physical device for a given mount,
# i.e. 'sda', not to be confused with '/dev/sda' or '/dev/sda3'
def self.get_device_name(node, mount_point)
  # This could be a bare device (/dev/md0) or a partition (/dev/sda1)
  # or a symbolic link since 5.12 (/dev/mapper/transient)
  device_or_partition = node.device_of_mount(mount_point)
  if File.symlink?(device_or_partition)
    # resolve e.g. /dev/mapper/* links to the real /dev node
    device_or_partition = File.realpath(device_or_partition)
  end
  device_or_partition_base = File.basename(device_or_partition)
  # If ohai knows this name as a block device, it's already a bare
  # device; otherwise strip the partition suffix to find its parent.
  if node['block_device'][device_or_partition_base]
    return device_or_partition_base
  else
    root_dev = device_name_from_partition(device_or_partition)
    return File.basename(root_dev)
  end
end
# Give a device path and a partition number, return the proper
# partition's device path
def self.partition_device_name(device, partnum)
  # An empty partnum means "the whole device" (see build_mapping's
  # whole_device handling): return the device path untouched. Previously
  # a digit-terminated device like '/dev/nvme0n1' would have grown a
  # bogus trailing 'p' ('/dev/nvme0n1p') in that case.
  return device if partnum.to_s.empty?
  # Devices whose name ends in a digit (nvme0n1, md0, nbd0, ...) need a
  # 'p' separator before the partition number.
  prefix = /[0-9]$/.match(device) ? 'p' : ''
  "#{device}#{prefix}#{partnum}"
end
# Given a device including a partition, return just the device without
# the partition. i.e.
# /dev/sda1 -> /dev/sda
# /dev/md0p0 -> /dev/md0
# /dev/nvme0n1p0 -> /dev/nvme0n1
#
# In reality we can just check for the RE /[0-9]+p[0-9]+$/ to know if
# we need to drop a pX or an X...
#
# HOWEVER, since you can make a filesystem on a whole device (we generally
# frown upon it, but you never know what you'll run into), this method can
# be called with a device path that actually isn't a partition. In such
# cases that can give you the wrong behavior. This is why
# https://github.com/facebook/chef-cookbooks/commit/22d564a3be86a5258c4a404da997bfc3901a3fe2
# was needed.
#
# So, for devices that we *know* would require such
# a thing, we also force them to use that regex, so if someone erroneously
# passes in `/dev/md0`, we give them back `/dev/md0`.
def self.device_name_from_partition(partition)
  # Devices that use a 'pN' partition suffix: anything already shaped
  # like '...<digits>p<digits>', plus families whose base names end in a
  # digit (nvme, etherd, md, nbd). Everything else uses a bare numeric
  # suffix. For the digit-terminated families this also guarantees a
  # whole-device path like '/dev/md0' passes through unchanged.
  uses_p_suffix =
    partition =~ /[0-9]+p[0-9]+$/ || partition =~ %r{/(nvme|etherd|md|nbd)}
  suffix_re = uses_p_suffix ? /p[0-9]+$/ : /[0-9]+$/
  partition.sub(suffix_re, '')
end
# External automation can pass us disks to rebuild for hot-swap. In order
# to ensure atomicity, we have one file per device, named by that device.
def self.disks_from_automation
  dir = FB::Storage::REPLACED_DISKS_DIR
  result = []
  if ::File.directory?(dir)
    # one file per replaced device, named after the device itself
    Dir.entries(dir).each do |entry|
      next if ['.', '..'].include?(entry)
      result << "/dev/#{entry}"
    end
  end
  unless result.empty?
    Chef::Log.info(
      "fb_storage: Disks automation requested converging of: #{result}",
    )
  end
  result
end
# Split a kernel device name into its [family, instance] parts, e.g.
# 'sdb' -> ['sd', 'b'], 'nvme0n1' -> ['nvme', '0n1']. Fails on any name
# that doesn't match a known device family.
def self.block_device_split(dev)
  # device-family prefix => pattern capturing (family, instance)
  patterns = {
    'sd' => /^(sd)([a-z]+)$/,
    'fio' => /^(fio)([a-z]+)$/,
    'nvme' => /^(nvme)(\d+n\d+)$/,
    'nbd' => /^(nbd)(\d+)$/,
    'vd' => /^(vd)([a-z]+)$/,
  }
  _family, pattern = patterns.find { |prefix, _| dev.start_with?(prefix) }
  m = pattern&.match(dev)
  unless m
    fail "fb_storage: Cannot parse #{dev} for sorting"
  end
  [m[1], m[2]]
end
# Compare two strings by length first, then lexicographically as a
# tie-break - so 'b' sorts before 'aa'.
def self.length_alpha(a, b)
  by_length = a.length <=> b.length
  by_length.zero? ? a <=> b : by_length
end
# Comparator preferring SCSI-attached disks: a disk on the SCSI bus sorts
# before one that isn't; two SCSI disks sort by bus address; returns 0
# when neither is on the SCSI bus.
def self.scsi_device_sort(a, b, disk_to_scsi_mapping)
  Chef::Log.debug(
    "fb_storage: scsi_device_sort: Sorting #{a} and #{b}",
  )
  slot_a = disk_to_scsi_mapping[a]
  slot_b = disk_to_scsi_mapping[b]
  if slot_a && slot_b
    Chef::Log.debug(
      "fb_storage: #{a} and #{b} are both on the SCSI bus sorting by address",
    )
    return sort_scsi_slots(slot_a, slot_b)
  end
  if slot_a
    Chef::Log.debug(
      "fb_storage: #{a} is on SCSI bus, #{b} is not, #{a} sorts first",
    )
    return -1
  end
  if slot_b
    Chef::Log.debug(
      "fb_storage: #{a} is not on SCSI bus, #{b} is, #{b} sorts first",
    )
    return 1
  end
  0
end
# Comparator for two device paths ('/dev/sda', ...). Sorts by device
# family first (sd/fio/nvme/...); within the same family nvme sorts
# numerically on its 'XnY' instance, everything else sorts by SCSI bus
# address when both are SCSI-attached, falling back to length-then-alpha
# on the instance name.
def self.block_device_sort(a, b, disk_to_scsi_mapping)
  (atype, ainstance) = block_device_split(File.basename(a))
  (btype, binstance) = block_device_split(File.basename(b))
  if atype == btype
    if atype == 'nvme' && btype == 'nvme'
      Chef::Log.debug(
        'fb_storage: Special nvme sorting',
      )
      # nvme is 0n1 or 1n3 or whatever, split in the n
      # and each part can sort as an integer.
      ainstance = ainstance.split('n').map(&:to_i)
      binstance = binstance.split('n').map(&:to_i)
      return ainstance <=> binstance
    end
    Chef::Log.debug(
      'fb_storage: Types are the same, sorting by SCSI',
    )
    r = scsi_device_sort(a, b, disk_to_scsi_mapping)
    # 0 is they both weren't SCSI disks (or they're both in the same
    # SCSI slot, but that is not possible :))
    if r.zero?
      Chef::Log.debug(
        'fb_storage: SCSI sort failed, sorting by name',
      )
      return length_alpha(ainstance, binstance)
    else
      return r
    end
  else
    Chef::Log.debug(
      'fb_storage: Types not same sorting by type',
    )
    length_alpha(atype, btype)
  end
end
# sorts shelves themselves
def self.sort_shelves(a, b)
  left = File.basename(a)
  right = File.basename(b)
  # 'sgN' names compare by their numeric suffix
  if left.start_with?('sg')
    return left.gsub('sg', '').to_i <=> right.gsub('sg', '').to_i
  end
  # 'H:B:T:L'-style addresses compare field-by-field, numerically
  numeric = ->(addr) { addr.split(':').map(&:to_i) }
  numeric.call(left) <=> numeric.call(right)
end
# sorts disks in disk shelves
# Order two shelf-disk records: same shelf -> compare disk slot number,
# otherwise defer to the shelf ordering.
def self.sort_disk_shelves(a, b)
  return a['disk'] <=> b['disk'] if a['shelf'] == b['shelf']
  sort_shelves(a['shelf'], b['shelf'])
end
# Compare two SCSI addresses ('H:C:T:L') numerically, field by field.
def self.sort_scsi_slots(a, b)
  as_ints = ->(slot) { slot.split(':').map(&:to_i) }
  as_ints.call(a) <=> as_ints.call(b)
end
# returns nil if no previous file
def self.load_previous_disk_order(include_version = false)
  # size? returns nil if the file does not exist or is 0 bytes.
  #
  # We don't want to fail if someone TOUCHES the file, but we do want to
  # fail if the file is non-0-bytes and we still fail to parse it
  version = disks = nil
  if File.size?(PREVIOUS_DISK_ORDER)
    f = JSON.parse(File.read(PREVIOUS_DISK_ORDER))
    # v1 of the file was just an array of disks
    case f
    when Array
      version = 1
      disks = f.empty? ? nil : f
    when Hash
      # v2 is a hash: {'version' => 2, 'disks' => [<by-id names>, ...]}
      unless f['version'] == 2
        fail 'fb_storage: Unknown format of persistent-order cache file!'
      end
      version = f['version']
      disklist = []
      f['disks'].each do |id|
        # If we have a corrupted file, ignore it, and re-generate it later
        if id.nil?
          return nil
        end
        # translate the stable by-id name back into a kernel name ('sdb')
        sysfile = "#{DEV_ID_DIR}/#{id}"
        if File.exist?(sysfile)
          disklist << File.basename(File.readlink(sysfile))
        else
          Chef::Log.warn(
            "fb_storage: Unable to translate #{id} into" +
            ' drive path - probably replaced disk.',
          )
          # just put the id in there, we won't match it as a disk
          # and know the disk has been removed
          disklist << id
        end
      end
      disks = disklist
    end
    if include_version
      return { 'version' => version, 'disks' => disks }
    else
      return disks
    end
  end
  nil
end
# disks parameter is a list of the block devices, i.e. ['sdb', 'sdc', ...]
# translates a list of disks into a list of global ids, returns a
# versioned hash
def self.gen_persistent_disk_data(disks)
  # id_map maps a device ('sdc') to a global id
  # ('scsi-3600605b00c0c2d9020b8d13611e63d52')
  id_map = {}
  # Dir.foreach instead of Dir.open(...).each: the latter left the Dir
  # handle open until GC
  Dir.foreach(DEV_ID_DIR) do |entry|
    next if %w{. ..}.include?(entry)
    p = "#{DEV_ID_DIR}/#{entry}"
    id_map[File.basename(File.readlink(p))] = entry
  end
  id_list = disks.map do |disk|
    id = id_map[disk]
    if id.nil?
      msg = "fb_storage: Can't convert #{disk} to an id"
      if FB::Fstab.get_in_maint_disks.include?(disk)
        Chef::Log.warn(
          "#{msg}, but it's in maintenance, so using #{disk}",
        )
        # fall back to the kernel name; we know it won't match a disk
        id = disk
      else
        # reuse msg instead of duplicating the string
        fail msg
      end
    end
    id
  end
  {
    'version' => 2,
    'disks' => id_list,
  }
end
# Persist the disk ordering cache. Version 1 stores kernel names as-is;
# version 2 (the default) stores stable /dev/disk/by-id names.
def self.write_out_disk_order(disks, version = 2)
  data =
    if version == 1
      disks
    elsif version == 2
      gen_persistent_disk_data(disks)
    else
      fail 'fb_storage: Unknown persistent disk format ' +
           "specified: #{version}"
    end
  File.open(PREVIOUS_DISK_ORDER, 'w') do |fd| # ~FB030
    Chef::Log.debug('fb_storage: Writing out disk order')
    fd.write(JSON.generate(data))
  end
end
# Version of the on-disk persistent-order cache (1 or 2), or nil when no
# usable cache file exists.
def self.persistent_data_file_version
  data = load_previous_disk_order(true)
  data && data['version']
end
# we assume someone has checked with `persistent_data_file_version`
# before calling this needlessly
def self.convert_persistent_data_file
  disks = load_previous_disk_order
  # just to be safe
  return unless disks
  return if disks.empty?
  # re-write the v1 list in the v2 (by-id) format
  write_out_disk_order(disks, 2)
end
# Some hosts cannot use /dev/by-id because some of their devices
# have no information for mapping (FIO), so we never convert those
def self.can_use_dev_id?(node)
  # VMs and hosts with devices that have no by-id mapping (FIO, nbd,
  # AOE) can't rely on /dev/disk/by-id
  return false if node.virtual?
  return false unless File.directory?(FB::Storage::DEV_ID_DIR)
  node['block_device'].keys.none? do |dev|
    dev.start_with?('fio', 'nbd', 'ether')
  end
end
# now we have both the previous ordering and the new ordering. We want
# to keep the previous ordering, but allow disks to have been
# replaced. Let's say this is our old mapping
# sdb, sdc, sdd, sde
# and then sdc goes away and sdf gets added. what we really want to do
# is slot 'f' where 'c' was. Our SCSI slot ordering above should always
# do the right thing and return f in c's slot - but to be extra safe,
# we always use the existing order to ensure that even if we had some
# ordering bug before, we don't change the ordering now.
#
# So we do this by walking the previous list and noting which slots are
# now invalid. Using the example above, that means we'd now have a list
# of [1] (the 1 slot in the array is sdc which is not in the new list).
#
# Then we drop all disks on the old list from the new list. That would
# leave us with only ['sdf']. These two lists should be the same length.
#
# Then we slot in each element in the second list to the nth element on
# our previous ordering based on the first list.
#
# This function is only called when the prev set of devices and the new
# set of devices are not the same
def self.calculate_updated_order(prev, devs)
  Chef::Log.debug(
    'fb_storage: Attempting to merge old and new config',
  )
  Chef::Log.debug("fb_storage: previous list: #{prev}")
  Chef::Log.debug("fb_storage: current devs: #{devs}")
  # indexes in the previous ordering whose disk has disappeared
  vacated_slots = prev.each_index.select { |i| !devs.include?(prev[i]) }
  # disks present now that weren't in the previous ordering
  added_disks = devs.reject { |disk| prev.include?(disk) }
  if added_disks.size != vacated_slots.size
    fail 'fb_storage: Could not map disks to previous ' +
         "ordering: new disks: #{added_disks}, avail slots: " +
         "#{vacated_slots}."
  end
  if added_disks.size.zero?
    fail 'fb_storage: Found no difference between old' +
         ' and new disks, but somehow didn\'t think they were the same' +
         ' before. Bailing out because I\'m very scared.'
  end
  # Each replacement disk takes over the slot its predecessor vacated
  merged = prev.dup
  vacated_slots.zip(added_disks).each do |slot, disk|
    merged[slot] = disk
  end
  Chef::Log.info(
    "fb_storage: Previous disk mapping: #{prev}",
  )
  Chef::Log.info(
    "fb_storage: New disk mapping: #{merged}",
  )
  merged
end
# If the user supplied a custom device-ordering lambda
# ('_clowntown_device_order_method'), invoke it and persist its result -
# but only on firstboot, or when the force-write touch-file exists. The
# touch-file is removed even if the user's method raises.
def self._handle_custom_device_order_method(node)
  if node['fb_storage']['_clowntown_device_order_method'] &&
     (node.firstboot_tier? || File.exist?(FORCE_WRITE_CUSTOM_DISK_ORDER))
    begin
      order = node['fb_storage'][
        '_clowntown_device_order_method'].call(node)
      write_out_disk_order(order)
    ensure
      if File.exist?(FORCE_WRITE_CUSTOM_DISK_ORDER)
        File.delete(FORCE_WRITE_CUSTOM_DISK_ORDER)
      end
    end
  end
end
# We need to (consistently) map what's on the box to what's in the config
# This is the "meat" of what the `storage` API in fb_storage does since
# it provides a "generic" config.
def self.sorted_devices(node, maintenance_disks)
  # memoized on the node for the duration of the run
  if node['fb_storage']['_ordered_disks']
    return node['fb_storage']['_ordered_disks']
  end
  self._handle_custom_device_order_method(node)
  prev = load_previous_disk_order
  # '/dev/<disk>' -> {'shelf' => ..., 'disk' => slot} for JBOD drives
  disk_to_slot_mapping = {}
  if node['fb'] && node['fb']['fbjbod'] &&
     !node['fb']['fbjbod']['shelves'].keys.length.zero?
    shelves = node['fb']['fbjbod']['shelves'].keys.sort
    shelves.each do |shelf|
      node['fb']['fbjbod']['shelves'][shelf].
        each_with_index do |drive, drive_index|
        disk_to_slot_mapping[drive] = {
          'disk' => drive_index,
          'shelf' => shelf,
        }
      end
    end
  end
  # '<disk>' -> SCSI address for drives on the SCSI bus
  disk_to_scsi_mapping = {}
  node['scsi']&.each do |id, info|
    disk_to_scsi_mapping[info['device']] = id
  end
  unsorted_devs = Set.new(
    FB::Storage.eligible_devices(node) +
    maintenance_disks.map { |x| ::File.basename(x) },
  )
  # We might have been running an older version which did not account
  # for RAID or /boot being on another disk, so ensure we skip those
  devices_to_skip = self.devices_to_skip(node)
  prev&.keep_if do |disk|
    if devices_to_skip.include?(disk)
      Chef::Log.warn('fb_storage: previous ordering includes now skipped ' +
                     " disk: #{disk}")
      false
    else
      true
    end
  end
  # If the set of disks have not changed since last time, use the old
  # order.
  if prev && unsorted_devs == Set.new(prev)
    Chef::Log.debug(
      'fb_storage: Using previous disk ordering from cache',
    )
    node.default['fb_storage']['_ordered_disks'] = prev
    return prev
  end
  devs = unsorted_devs.to_a.sort do |a, b|
    fa = "/dev/#{a}"
    fb = "/dev/#{b}"
    # first and foremost sort drives in a JBOD after ones not in a
    # shelf
    if !disk_to_slot_mapping[fa] && disk_to_slot_mapping[fb]
      Chef::Log.debug(
        "fb_storage: #{a} is not jbod, #{b} is, #{a} sorts first",
      )
      -1
    elsif disk_to_slot_mapping[fa] && !disk_to_slot_mapping[fb]
      # same...
      Chef::Log.debug(
        "fb_storage: #{a} is jbod, #{b} is not, #{b} sorts first",
      )
      1
    elsif disk_to_slot_mapping[fa] && disk_to_slot_mapping[fb]
      # if they're both in a sled, sort by slot number
      Chef::Log.debug(
        "fb_storage: #{a} and #{b} are both jbod " +
        'sorting by slot number',
      )
      sort_disk_shelves(disk_to_slot_mapping[fa],
                        disk_to_slot_mapping[fb])
    else
      # both devices are not on fbjob so we can sort them
      # using our normal sorting algorithm, which will sort by type
      # first, then scsibus if applicable within that, then name
      Chef::Log.debug(
        "fb_storage: #{a} and #{b} are neither jbod " +
        'sorting by length, alphanumeric',
      )
      block_device_sort(fa, fb, disk_to_scsi_mapping)
    end
  end
  # merge with the previous ordering so replacement disks take over the
  # slot of the disk they replaced
  if prev
    devs = calculate_updated_order(prev, devs)
  end
  # persist the (possibly new) ordering, preferring the v2 by-id format
  if prev != devs && !devs.empty?
    if can_use_dev_id?(node)
      version = 2
    else
      version = 1
    end
    write_out_disk_order(devs, version)
  end
  node.default['fb_storage']['_ordered_disks'] = devs
  devs
end
# Pair each sorted physical device with its entry in the user's device
# config (by index), and expand array configs (including hybrid-xfs
# pseudo-arrays) into member lists. Returns {:disks => ..., :arrays => ...}.
def self.build_mapping(node, maintenance_disks)
  devs = sorted_devices(node, maintenance_disks)
  # We need to dup this to a real array not the ImmutableArray we get back
  # because we'll make modifications to this copy
  config = node['fb_storage']['devices'].to_a
  num_requested = config.count
  # the config must describe exactly as many devices as we found
  if devs.count > num_requested
    fail "fb_storage: #{num_requested} requested devices, " +
         "which is fewer than available devices #{devs.count} (#{devs}). " +
         'Probably something is wrong. Bailing out!'
  elsif devs.count < num_requested
    fail "fb_storage: Requested #{num_requested} disks but " +
         "only #{devs.count} available. Bailing out!"
  end
  # if we have any storage arrays, prep our datastructure first so
  # when we go through devices we can further fill this out
  #
  # Note since we treat hybrid XFS filesystems like arrays, we will
  # allocate md numbers to them, so if you mix-and-match the two you may
  # not get md numbers starting at 0, but that's not actually a problem.
  desired_arrays = {}
  node['fb_storage']['arrays']&.each_with_index do |cfg, idx|
    desired_arrays["/dev/md#{idx}"] = cfg.to_hash
    desired_arrays["/dev/md#{idx}"]['members'] = []
  end
  # If / or /boot is mounted in a software RAID array, make sure to skip it
  { '/' => self.root_device_name(node),
    '/boot' => self.boot_device_name(node) }.each do |fs, dev|
    next unless dev && dev.start_with?('md')
    dev_path = "/dev/#{dev}"
    if desired_arrays[dev_path]
      unless desired_arrays[dev_path]['_skip']
        fail "fb_storage: Asked to configure #{dev} but that is `#{fs}`!"
      end
    else
      desired_arrays[dev_path] = { 'members' => [], '_skip' => true }
    end
  end
  desired_disks = {}
  devs.each_with_index do |device, index|
    # AOE devices are a bit special. They come up as "etherd!e1.1" but
    # that maps to "/dev/etherd/e1.1"
    dpath = FB::Storage.device_path_from_name(device)
    Chef::Log.debug(
      "fb_storage: Processing #{dpath}(#{device}): " +
      (config[index]).to_s,
    )
    desired_disks[dpath] = config[index]
    next if config[index]['_skip']
    # record partitions flagged as array/hybrid-xfs members on the
    # corresponding array entry
    config[index]['partitions'].each_with_index do |part, pindex|
      pdevice = partition_device_name(
        dpath,
        config[index]['whole_device'] ? '' : pindex + 1,
      )
      if part['_swraid_array']
        array_num = part['_swraid_array']
        desired_arrays["/dev/md#{array_num}"]['members'] << pdevice
      elsif part['_swraid_array_journal']
        array_num = part['_swraid_array_journal']
        desired_arrays["/dev/md#{array_num}"]['journal'] = pdevice
      elsif part['_xfs_rt_data']
        array_num = part['_xfs_rt_data']
        desired_arrays["/dev/md#{array_num}"]['members'] << pdevice
        desired_disks[dpath]['partitions'][pindex]['part_name'] ||=
          desired_arrays["/dev/md#{array_num}"]['mount_point']
      elsif part['_xfs_rt_metadata']
        array_num = part['_xfs_rt_metadata']
        desired_arrays["/dev/md#{array_num}"]['journal'] = pdevice
        desired_disks[dpath]['partitions'][pindex]['part_name'] ||=
          "md:#{desired_arrays["/dev/md#{array_num}"]['mount_point']}"
      elsif part['_xfs_rt_rescue']
        array_num = part['_xfs_rt_rescue']
        # we don't do anything with rescue devices, it's the moral
        # equivalent of _no_mkfs and _no_mount - it's simply
        # saved space for a human to `dd` stuff to ...
        # but we'll go ahead and track it anyway
        desired_arrays["/dev/md#{array_num}"]['rescue'] = pdevice
        desired_disks[dpath]['partitions'][pindex]['part_name'] ||=
          'md_rescue:' +
          desired_arrays["/dev/md#{array_num}"]['mount_point']
      end
    end
  end
  data = { :disks => desired_disks, :arrays => desired_arrays }
  Chef::Log.debug(
    'fb_storage: Disk mapping: ' +
    JSON.pretty_generate(data),
  )
  return data
end
# All desired partitions for a given device
def self.partition_names(device, conf)
  # log message previously read 'parition_names'
  Chef::Log.debug("partition_names: #{device} #{conf}")
  # one device path per configured partition; partition numbers are
  # 1-indexed
  conf['partitions'].each_index.map do |i|
    FB::Storage.partition_device_name(device, i + 1)
  end
end
# Take a path like '/dev/nvme0n1p2' or '/dev/sdb1' or '/dev/etherd/e1.1'
# and return a device name that would show up in node['block_devices']
# which is mostly just `basename` except in the AOE case...
def self.device_name_from_path(path)
  # AOE paths look like '/dev/etherd/e1.1' but ohai names them
  # 'etherd!e1.1'; normalize before taking the basename
  normalized = path.gsub('etherd/', 'etherd!')
  File.basename(normalized)
end
# Take names like 'nvme0n1pe' or 'sdb1' or 'etherd!e1.1' and return
# a path. This is mostly just pre-pending /dev/ except in the case of
# AOE where we need to change '!' into another level of directory
def self.device_path_from_name(name)
  # AOE names use '!' where the path has a directory separator
  '/dev/' + name.gsub('!', '/')
end
# Read-only views of the desired disk config, the automation-requested
# hot-swap disks, and the desired array config (all built in initialize)
attr_reader :config, :hotswap_disks, :arrays
# Snapshot both the desired state (from the fb_storage config) and the
# observed state (from ohai) for this node.
def initialize(node)
  @maintenance_disks = FB::Fstab.get_in_maint_disks
  # disks_from_automation and build_mapping are class methods, so they
  # need an explicit receiver from instance scope; build_mapping also
  # requires the maintenance-disk list (compare its signature above).
  @hotswap_disks = FB::Storage.disks_from_automation
  mapping = FB::Storage.build_mapping(node, @maintenance_disks)
  @config = mapping[:disks]
  @arrays = mapping[:arrays]
  # we don't want these changing as we converge...
  @existing = node.filesystem_data.to_hash
  @existing_arrays = node['mdadm'] ? node['mdadm'].to_hash : {}
end
# Collect every device, partition, and array this config manages,
# skipping anything marked '_skip'. Used when (re)building all storage.
#
# Returns a hash with :devices, :partitions (arrays included, since
# arrays must be formatted after assembly), and :arrays.
def all_storage
  devices = []
  partitions = []
  @config.each do |device, conf|
    next if conf['_skip']
    devices << device
    if conf['whole_device']
      partitions << device
    else
      # partition_names is a class method - call it with its receiver
      # from instance scope
      partitions += FB::Storage.partition_names(device, conf)
    end
  end
  valid_arrays = @arrays.reject { |_array, conf| conf['_skip'] }.keys
  {
    :devices => devices,
    # when rebuilding all storage, we need to format the arrays
    # after we build them
    :partitions => partitions + valid_arrays,
    :arrays => valid_arrays,
  }
end
# Query blkid for the PARTLABEL of the given partition device. Raises
# (via ShellOut#error!) if blkid exits non-zero.
# NOTE(review): `part` is interpolated into a shell command line; callers
# only pass device paths derived from our own config, but confirm this
# can never carry untrusted input.
def self.get_actual_part_name(part)
  s = Mixlib::ShellOut.new(
    "blkid -o value -s PARTLABEL #{part}",
  ).run_command
  s.error!
  retval = s.stdout.strip
  # we return nil on empty string because part_name in the config is nil
  # when not set
  retval.empty? ? nil : retval
end
# For a hybrid-xfs metadata partition, find the array whose journal is
# this partition and return that array's label; nil when no array claims
# the partition.
def get_expected_label_for_hybrid_md_part(part)
  owner = @arrays.values.find { |array| array['journal'] == part }
  owner && owner['label']
end
# Return a list of devices and partitions that are out of spec.
# Note: this doesn't take into account what we are or are not allowed
# to touch - it's just what doesn't match the desired state
def out_of_spec
  # memoized: _out_of_spec is expensive and its inputs (@existing,
  # @existing_arrays) were captured at initialize time anyway
  @out_of_spec ||= _out_of_spec
end
# Compare the desired state (@config/@arrays) against the observed state
# (@existing/@existing_arrays) and return everything that differs.
# Callers should use the memoized out_of_spec wrapper.
#
# Returns a hash of sorted, de-duplicated lists keyed by
# :missing_partitions, :missing_filesystems, :mismatched_partitions,
# :mismatched_filesystems, :missing_arrays, :mismatched_arrays,
# :extra_arrays, plus :incomplete_arrays (a hash of array -> missing
# members).
def _out_of_spec
  # a list of devices which have no partition table
  missing_partitions = []
  # a list of devices & partitions which are missing a filesystem
  missing_filesystems = []
  # a list of devices where some parts found when whole_device, or wrong
  # number of parts found. Partition type is not considered.
  mismatched_partitions = []
  # a list of devices & partitions where the observed filesystem type
  # doesn't match the configured
  mismatched_filesystems = []
  # a list of arrays that are missing
  missing_arrays = []
  # arrays that exist but have the wrong members
  mismatched_arrays = []
  # arrays missing members
  incomplete_arrays = {}
  # arrays that we don't have configured
  extra_arrays = []
  @arrays.each do |device, conf|
    short_device = File.basename(device)
    next if conf['_skip'] || conf['raid_level'] == 'hybrid_xfs'
    unless @existing_arrays.include?(short_device)
      Chef::Log.debug(
        "fb_storage: Array #{device} missing",
      )
      missing_arrays << device
      mismatched_filesystems << device
      next
    end
    existing_array = @existing_arrays[short_device]
    existing_device = @existing['by_device'][device]
    existing_members_set = Set.new(
      existing_array['members'].map { |x| "/dev/#{x}" },
    )
    if existing_array['journal']
      existing_members_set += Set.new(
        ["/dev/#{existing_array['journal']}"],
      )
    end
    desired_members_set = Set.new(conf['members'])
    if conf['journal']
      desired_members_set += Set.new([conf['journal']])
    end
    if existing_array['level'] != conf['raid_level']
      Chef::Log.warn(
        "fb_storage: Array #{device} has incorrect raid_level" +
        " #{existing_array['level']} vs #{conf['raid_level']}",
      )
      mismatched_arrays << device
      # and this will require us to nuke the array, so we'll need to
      # make a filesystem too
      mismatched_filesystems << device
      next
    elsif existing_members_set != desired_members_set
      # If the members are not the same, there's two options... we're
      # simply missing members (maybe a disk is in repair)...
      if existing_members_set < desired_members_set
        # Except if it's RAID0, we can't fix that...
        if [existing_array['level'], conf['raid_level']].include?(0)
          Chef::Log.warn(
            "fb_storage: Array #{device} is missing members, " +
            'but is RAID0 or should be RAID0 so treating it as a ' +
            "mismatched array. Existing: #{existing_members_set.to_a} " +
            "vs Desired: #{desired_members_set.to_a}",
          )
          mismatched_arrays << device
          # and this will require us to nuke the array, so we'll need to
          # make a filesystem too
          mismatched_filesystems << device
          next
        else
          missing_members_set = desired_members_set - existing_members_set
          # if the disks are in maintenance, there's nothing to do.
          unless missing_members_set <= Set.new(@maintenance_disks)
            Chef::Log.info(
              "fb_storage: Array #{device} is missing " +
              "members: #{missing_members_set.to_a}",
            )
            incomplete_arrays[device] = missing_members_set.to_a
          end
        end
      # or it's entirely made of members we don't expect. In this case,
      # we treat it like a full rebuild
      else
        Chef::Log.warn(
          "fb_storage: Array #{device} has incorrect members" +
          " #{existing_members_set.to_a} vs #{desired_members_set.to_a}",
        )
        mismatched_arrays << device
        # and this will require us to nuke the array, so we'll need to
        # make a filesystem too
        mismatched_filesystems << device
        next
      end
    end
    if !existing_device || !existing_device['fs_type']
      Chef::Log.warn(
        "fb_storage: Array #{device} has no FS",
      )
      missing_filesystems << device
    # We have an existing device *and* it has an FS... compare it
    elsif existing_device['fs_type'] != conf['type']
      current_fs = existing_device ? existing_device['fs_type'] : '(none)'
      Chef::Log.warn(
        "fb_storage: Array #{device} has incorrect FS" +
        " #{current_fs} vs #{conf['type']}",
      )
      mismatched_filesystems << device
    end
  end
  # Find arrays we don't expect
  @existing_arrays.each_key do |shortarray|
    array = "/dev/#{shortarray}"
    next if @arrays[array]
    Chef::Log.info("fb_storage: Extraneous array: #{array}")
    extra_arrays << array
  end
  # now walk our devices config to see what needs convergance
  @config.each do |device, conf|
    if @maintenance_disks.include?(device)
      Chef::Log.info(
        "fb_storage: Skipping check of #{device} because it " +
        'is marked as "in_maintenance"',
      )
      next
    end
    if conf['_skip']
      Chef::Log.info(
        "fb_storage: Skipping check of #{device} because it " +
        'is marked as "skip" in config.',
      )
      next
    end
    devparts = @existing['by_device'].to_hash.keys.select do |x|
      FB::Storage.device_name_from_partition(x) == device && x != device
    end
    # sort the partitions numerically by partition number
    devparts.sort_by! do |part|
      part.match('\d+$')[0].to_i
    end
    Chef::Log.debug(
      "fb_storage: partitions of #{device} are: #{devparts}",
    )
    dev_info = @existing['by_device'][device]
    if conf['whole_device']
      # there are two ways a whole-device partition can be represented:
      # one is no partitions are reported... but another is as a "loop"
      # partition type with a single "pseudopartition". In this case
      # the data for the partition in ohai will be an empty hash and the
      # data for the device will have a filesystem.
      has_whole_disk_fs = devparts.count == 1 &&
        @existing['by_device'][devparts[0]].empty? &&
        dev_info['fs_type']
      if devparts.empty? || has_whole_disk_fs
        expected_label = conf['partitions'][0]['label']
        if dev_info.nil? || dev_info.empty?
          Chef::Log.debug(
            "fb_storage: Entire device #{device} needs " +
            'filesystem',
          )
          missing_filesystems << device
        elsif dev_info['fs_type'] != conf['partitions'][0]['type']
          Chef::Log.debug(
            "fb_storage: Entire device #{device} has " +
            'incorrect filesystem',
          )
          mismatched_filesystems << device
        elsif expected_label && dev_info['label'] != expected_label
          mismatched_filesystems << device
          Chef::Log.debug(
            "fb_storage: Entire device #{device} has " +
            "incorrect FS label. Expected #{expected_label}, found " +
            "#{dev_info['label']}.",
          )
        end
      else
        Chef::Log.debug(
          "fb_storage: Device #{device} has partitions " +
          ' but we want a whole-device filesystem',
        )
        # we have a real partition table
        mismatched_partitions << device
        mismatched_filesystems << device
      end
      next
    end
    # if there are no partitions *and* this isn't formatted
    # it's just a missing partition table
    if devparts.empty? && !(dev_info && dev_info['fs_type'])
      Chef::Log.info(
        "fb_storage: #{device} has no partitions and isn't " +
        'formatted',
      )
      missing_partitions << device
      # partition_names is a class method - it needs its receiver from
      # instance scope (compare the receivered call sites above)
      missing_filesystems += FB::Storage.partition_names(device, conf)
      next
    end
    # OK, we're here we have partitions and expect to have partitions
    # ... but are they RIGHT?
    if devparts.count != conf['partitions'].count
      Chef::Log.warn(
        "fb_storage: #{device} has the wrong number of " +
        "partitions (#{devparts.count} vs #{conf['partitions'].count})",
      )
      mismatched_partitions << device
      mismatched_filesystems += FB::Storage.partition_names(device, conf)
      next
    end
    devparts.each_with_index do |part, index|
      Chef::Log.debug(
        "fb_storage: Considering partition #{part}",
      )
      # We skip member devices of mdraid arrays.
      if conf['partitions'][index]['_swraid_array'] ||
         conf['partitions'][index]['_swraid_array_journal']
        Chef::Log.debug('fb_storage: skipping swraid partition')
        next
      end
      # We need to validate that member devices of hybrid "arrays" have the
      # correct partition labels since the rtxfs helpers depend on this.
      if conf['partitions'][index]['_xfs_rt_data'] ||
         conf['partitions'][index]['_xfs_rt_rescue'] ||
         conf['partitions'][index]['_xfs_rt_metadata']
        expected_part_name = conf['partitions'][index]['part_name']
        actual_part_name = FB::Storage.get_actual_part_name(part)
        if actual_part_name != expected_part_name
          Chef::Log.warn("fb_storage: Partition #{part} expected to " +
                         "have partlabel '#{expected_part_name}', actual " +
                         "is '#{actual_part_name}'.")
          mismatched_partitions << device
        end
      end
      # We skip further validation for hybrid real-time devices since these
      # will not have a filesystem type, label, etc.
      if conf['partitions'][index]['_xfs_rt_data'] ||
         conf['partitions'][index]['_xfs_rt_rescue']
        next
      end
      partinfo = @existing['by_device'][part]
      expected_fs = conf['partitions'][index]['type']
      expected_label = conf['partitions'][index]['label']
      if !expected_label && conf['partitions'][index]['_xfs_rt_metadata']
        # we have to figure out the label that this device corresponds to
        # in the array config
        expected_label = self.get_expected_label_for_hybrid_md_part(part)
      end
      if conf['partitions'][index]['_xfs_rt_metadata']
        expected_fs = 'xfs'
      end
      if !partinfo || !partinfo['fs_type']
        Chef::Log.warn(
          "fb_storage: Partition #{part} has no filesystem",
        )
        missing_filesystems << part
      elsif partinfo['fs_type'] != expected_fs
        Chef::Log.warn(
          "fb_storage: Partition #{part} has the wrong " +
          "filesystem (#{partinfo['fs_type']} vs #{expected_fs})",
        )
        mismatched_filesystems << part
      elsif expected_label && partinfo['label'] != expected_label
        Chef::Log.warn(
          "fb_storage: Partition #{part} has incorrect " +
          "label. Expected #{expected_label}, found " +
          "#{partinfo['label']}.",
        )
        mismatched_filesystems << part
      end
    end
  end
  # Special case for hybrid_xfs
  # For these arrays we don't go through the normal @array loop, because
  # they won't show up in node['mdadm'] - however, we do want to walk
  # the missing FSes on members of hybrid_xfs members, and then push
  # that up to the "array" level so we actually go ahead and create
  # the filesystem
  #
  # NOTE(review): these loops append to the array being iterated, so the
  # appended '/dev/mdN' keys are visited too; they never match a member
  # or journal, so the loop terminates - confirm this is intentional.
  missing_filesystems.each do |fs|
    mma = @arrays.select do |_array, config|
      config['raid_level'] == 'hybrid_xfs' &&
        (config['members'].include?(fs) ||
         config['journal'] == fs)
    end
    missing_filesystems += mma.keys
  end
  mismatched_filesystems.each do |fs|
    mma = @arrays.select do |_array, config|
      config['raid_level'] == 'hybrid_xfs' &&
        (config['members'].include?(fs) ||
         config['journal'] == fs)
    end
    mismatched_filesystems += mma.keys
  end
  # Done walking @config, put it all together
  {
    :mismatched_partitions => mismatched_partitions.sort.uniq,
    :mismatched_filesystems => mismatched_filesystems.sort.uniq,
    :missing_partitions => missing_partitions.sort.uniq,
    :missing_filesystems => missing_filesystems.sort.uniq,
    :missing_arrays => missing_arrays.sort.uniq,
    :mismatched_arrays => mismatched_arrays.sort.uniq,
    :extra_arrays => extra_arrays.sort.uniq,
    # this one is a hash...
    :incomplete_arrays => incomplete_arrays,
  }
end
# Maps the storage config to an fb_fstab config
def gen_fb_fstab(node)
use_labels = node['fb_storage']['fstab_use_labels']
fstab = {}
fstab_fields =
%w{type mount_point opts pass enable_remount allow_mount_failure}
if node['fb_storage']['hybrid_xfs_use_helper']
node.default['fb_fstab']['type_normalization_map']['rtxfs'] = 'xfs'
node.default['fb_fstab']['ignorable_opts'] << /^rtdev=.*/
end
@config.each do |device, devconf|
next if devconf['_skip']
if devconf['whole_device']
partconf = devconf['partitions'][0]
if partconf['_swraid_array'] || partconf['_no_mount'] ||
partconf['_swraid_array_journal']
next
end
name = "storage_#{device}_whole"
fstab[name] = {
'device' => use_labels ? "LABEL=#{devconf['label']}" : device,
}
fstab_fields.each do |field|
fstab[name][field] = devconf['partitions'][0][field]
end
next
end
# rubocop:disable Lint/ShadowingOuterLocalVariable
devconf['partitions'].each_with_index do |partconf, index|
# If we are a member of a SW raid array, or we are a member
# of a hybrid-xfs FS or we've been asked not to mount, then we skip
# generating the fstab entry.
if partconf['_no_mount'] ||
partconf['_swraid_array'] || partconf['_swraid_array_journal'] ||
partconf['_xfs_rt_data'] || partconf['_xfs_rt_rescue'] ||
partconf['_xfs_rt_metadata']
next
end
partnum = index + 1
partition = FB::Storage.partition_device_name(
device, partnum
)
name = "storage_#{partition}"
fstab[name] = {
'device' => use_labels ? "LABEL=#{partconf['label']}" : partition,
}
fstab_fields.each do |field|
fstab[name][field] = partconf[field]
end
end
# rubocop:enable Lint/ShadowingOuterLocalVariable
end
@arrays.each do |array, arrayconf|
next if arrayconf['_skip'] || arrayconf['_no_mount']
name = "storage_#{array}"
if use_labels
device = "LABEL=#{arrayconf['label']}"
elsif arrayconf['raid_level'] == 'hybrid_xfs'
device = arrayconf['journal']
else
device = array
end
fstab[name] = {
'device' => device,
}
fstab_fields.each do |field|
fstab[name][field] = arrayconf[field]
end
if arrayconf['raid_level'] == 'hybrid_xfs'
if node['fb_storage']['hybrid_xfs_use_helper']
fstab[name]['type'] = 'rtxfs'
else
# point the XFS filesystem to it's data device (rtdev)
fstab[name]['opts'] << ",rtdev=#{arrayconf['members'].first}"
end
end
end
fstab
end
private
# we make an instance method that calls a class method for easier testing
# of this method without having to factor out `initialize`.
def build_mapping(node)
FB::Storage.build_mapping(node, @maintenance_disks)
end
def partition_names(device, conf)
FB::Storage.partition_names(device, conf)
end
def disks_from_automation
FB::Storage.disks_from_automation
end
end
end