# cookbooks/fb_storage/libraries/storage_handlers.rb
# vim: syntax=ruby:expandtab:shiftwidth=2:softtabstop=2:tabstop=2
#
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
module FB
class Storage
# Handlers take a given device and know how to partition/format them.
#
# The base class may not be used itself, but holds common code most
# classes will want to use. It also has a class method to build the right
# object for a given device.
class Handler
# Message raised by #initialize when the base class itself is instantiated.
NO_BASE_CLASS_MSG = (
'FB::Storage::Handler is not intended ' +
'to be instantiated directly, please use a subclass'
).freeze
# Path of the mdadm binary used by every software-RAID command in this file.
MDADM = '/sbin/mdadm'.freeze
# rubocop:disable Style/ClassVars
# Handler objects already built by get_handler, keyed by the device value
# that was passed in, so repeated lookups return the same object.
@@handler_cache = {}
# Return the handler object responsible for +device+, building and caching
# it on first use.
#
# The candidate classes are read from node['fb_storage']['_handlers'] and
# tried in order; the first one whose .match? accepts the device wins.
#
# device - device path; also used as the cache key.
# node   - the Chef node; node['block_device'] and
#          node['fb_storage']['_handlers'] are read from it.
#
# Returns an instance of the first matching handler class.
# Raises RuntimeError if a candidate does not descend from
# FB::Storage::Handler, or if no candidate matches.
def self.get_handler(device, node)
  return @@handler_cache[device] if @@handler_cache[device]

  devname = FB::Storage.device_name_from_path(device)
  if node['block_device'][devname]
    info = node['block_device'][devname].to_hash
  else
    Chef::Log.debug(
      "fb_storage: #{devname} is not in node['block_device']",
    )
    info = {}
  end
  node['fb_storage']['_handlers'].each do |handler|
    # `<` is true for any descendant, so a handler that inherits from another
    # handler is accepted too. It is false for the base class itself and nil
    # for unrelated classes, both of which are rejected here.
    unless handler < FB::Storage::Handler
      fail "fb_storage: handler #{handler.name} is not a subclass of " +
        'FB::Storage::Handler, aborting!'
    end
    next unless handler.match?(devname, info)

    Chef::Log.debug("fb_storage: Creating #{handler.name} handler")
    obj = handler.new(device, node)
    @@handler_cache[device] = obj
    return obj
  end
  fail "fb_storage: unknown handler for device #{devname}"
end
# rubocop:enable Style/ClassVars
# Base timeout handed to the mkfs command in format_partition (which doubles
# it for ext4); set by the initializers.
attr_accessor :mkfs_timeout
# Set up the state shared by all handlers.
#
# device - the device this handler operates on.
# node   - the Chef node.
#
# Raises RuntimeError (NO_BASE_CLASS_MSG) when called on the base class
# itself rather than on a subclass.
def initialize(device, node)
  fail NO_BASE_CLASS_MSG if instance_of?(FB::Storage::Handler)

  @device = device
  @node = node
  # filled in lazily by #existing_partitions
  @existing_partitions = nil
  self.mkfs_timeout = 900
end
# Does the handler work for this device?
# The base implementation has an empty body and therefore returns nil; the
# handler subclasses in this file each override it. get_handler calls it
# with the device name and that device's node['block_device'] entry
# (an empty Hash when there is none).
def self.match?(_devname, _info); end
# Called prior to partitioning
# No-op in the base class; FioHandler overrides it.
def prep_device; end
# Unmount everything on the device, pull it (and its partitions) out of any
# md arrays, then delete every existing partition with parted.
#
# A parted failure whose stderr mentions "unrecognised disk label" is
# tolerated, since there is then no partition table to remove from; any
# other parted failure is raised via ShellOut#error!.
def wipe_device
  Chef::Log.debug(
    "fb_storage: wipe_device called on #{@device}",
  )
  umount_all_partitions
  remove_all_partitions_from_all_arrays
  # The device path is data, not a pattern: escape it so characters such as
  # '.' or '+' are matched literally, and anchor it at the start so only the
  # leading device prefix (plus the optional 'p' separator) is stripped.
  device_prefix = /\A#{Regexp.escape(@device)}p?/
  existing_partitions.each do |part|
    Chef::Log.info("fb_storage: Deleting #{part}")
    pnum = part.sub(device_prefix, '')
    cmd = "/sbin/parted -s '#{@device}' rm #{pnum}"
    Chef::Log.debug("fb_storage: running: #{cmd}")
    s = Mixlib::ShellOut.new(cmd).run_command
    next unless s.error?

    # anything other than a missing partition table is a real failure
    s.error! unless s.stderr.include?('unrecognised disk label')
    Chef::Log.debug(
      'fb_storage: Allowing failed removal of ' +
      "#{pnum} from #{@device} as the partition table is " +
      'unrecognised.',
    )
  end
end
# Write a fresh GPT label to the device and, unless the config asks for the
# whole device, create the configured partitions.
#
# device_config - Hash; reads 'whole_device' and 'partitions'. Each
#                 partition entry may carry 'partition_start',
#                 'partition_end', '_swraid_array', '_swraid_array_journal',
#                 'part_name' and 'part_type'.
#
# All mkpart/set/name sub-commands are passed to a single parted run; any
# sfdisk --part-type commands run afterwards, once the first partition's
# device node exists.
# Raises if a command fails or the first partition never appears.
def partition_device(device_config)
  Chef::Log.info("fb_storage: Writing gpt table to #{@device}")
  mklabel = Mixlib::ShellOut.new("/sbin/parted -s '#{@device}' mklabel gpt")
  mklabel.run_command.error!
  return if device_config['whole_device']

  parted_args = []
  type_cmds = []
  device_config['partitions'].each.with_index(1) do |spec, number|
    mkpart =
      if spec['partition_start']
        "mkpart primary #{spec['partition_start']} " \
          "#{spec['partition_end']} -a optimal " \
          "set #{number} boot off"
      else
        'mkpart primary 0% 100% set 1 boot off'
      end
    parted_args << mkpart
    in_array = spec['_swraid_array'] || spec['_swraid_array_journal']
    parted_args << "set #{number} raid on" if in_array
    if spec['part_name']
      parted_args << "name #{number} '\"#{spec['part_name']}\"'"
    end
    next unless spec['part_type']

    type_cmds << "sfdisk #{@device} #{number} --part-type #{spec['part_type']}"
  end

  Chef::Log.info("fb_storage: Partitioning #{@device}")
  parted = "/sbin/parted -s '#{@device}' #{parted_args.join(' ')}"
  Chef::Log.debug("fb_storage: Running #{parted}")
  Mixlib::ShellOut.new(parted).run_command.error!

  # wait until a partition shows up: check, then sleep a second, at most
  # five sleeps before giving up
  first_part = partition_device_name(1)
  Chef::Log.debug("fb_storage: Polling for #{first_part} to exist")
  seconds_left = 5
  loop do
    break if File.exist?(first_part)

    if seconds_left.zero?
      fail "fb_storage: Made partitions, but partition #{first_part} " \
        'never showed up. :('
    end
    Chef::Log.info("fb_storage: Waiting for #{first_part} to show up...")
    sleep(1)
    seconds_left -= 1
  end

  type_cmds.each do |type_cmd|
    Chef::Log.debug("fb_storage: Running #{type_cmd}")
    Mixlib::ShellOut.new(type_cmd).run_command.error!
  end
end
# Called after partitioning
# No-op in the base class; present as a hook for subclasses to override.
def condition_device; end
# Called before formatting
# No-op in the base class; present as a hook for subclasses to override.
def prep_partition(_partition); end
# Make a filesystem on +partition+.
#
# partition - path of the partition (or md device) to format.
# config    - Hash; reads 'type', 'label', 'raid_level' and, for
#             'hybrid_xfs', 'journal', 'members' and 'extsize'.
#
# Format options come from default_format_options unless
# node['fb_storage']['format_options'] is set: a String replaces them for
# every filesystem type, a Hash is looked up by filesystem type.
#
# Raises if the filesystem type has no mkfs command, if 'format_options' is
# neither a String nor a Hash, or if mkfs fails or times out.
def format_partition(partition, config)
  # if the whole drive is being converged, this would already have
  # been unmounted, but we could just be converging this partition
  umount_by_partition(partition)
  unless File.basename(partition).start_with?('md')
    Chef::Log.debug(
      'fb_storage: Removing from any relevant arrays',
    )
    remove_device_from_any_arrays(partition)
  end

  fstype = config['type']
  cmd = mkfs_cmd(fstype)
  timeout = self.mkfs_timeout
  unless cmd
    fail "fb_storage: unknown fstype #{config['type']} for " +
      " #{partition}"
  end

  format_options = default_format_options(fstype)
  override = @node['fb_storage']['format_options']
  if override
    case override
    when String
      format_options = override
    when Hash
      format_options = override[fstype]
    else
      fail "fb_storage: Not sure what to do with 'format_options': " +
        override.to_s
    end
  end
  # Always work on a private String: the value may be nil (no entry for this
  # type) or may belong to the node's attribute data, and the hybrid_xfs
  # branch below appends to it in place.
  format_options = format_options.to_s.dup

  device = partition
  if config['raid_level'] == 'hybrid_xfs'
    # If we're hybrid XFS we need to format the metadata device
    # (not the fake 'md' device we have), and also pass in the
    # data device (rtdev)
    device = config['journal']
    extsize = config['extsize'] || 262144 # 256KiB, XFS default
    format_options << ' -d rtinherit=1 -r rtdev=' +
                      "#{config['members'].first},extsize=#{extsize}"
    # Realtime is not compatible with reflinks.
    # Default for CentOS 8 is crc=1, so let's switch it off here.
    format_options << ' -m crc=0 -m reflink=0'
  end

  label = config['label']
  # XFS doesn't allow labels longer than 12 chars
  label = label[0..11] if fstype == 'xfs'
  timeout *= 2 if fstype == 'ext4'

  cmd = "#{cmd} #{format_options} -L \"#{label}\" #{device}"
  Chef::Log.info(
    "fb_storage: Making filesystem on #{device}",
  )
  Chef::Log.debug("fb_storage: Running #{cmd}")

  # In order to make a filesystem on a new md device in a sane
  # amount of time you need to stop the resync first. But that's
  # dangerous if we crash, so we use a begin-ensure here to make sure
  # that we will revert the change before we throw the exception
  limit_file = '/proc/sys/dev/raid/speed_limit_max'
  need_to_quiesce_md = @type == :md &&
                       config['raid_level'] != 'hybrid_xfs' &&
                       File.exist?(limit_file)
  # Stays nil until the current limit has actually been read, so the ensure
  # block never writes back a value it does not have.
  saved_limit = nil
  begin
    if need_to_quiesce_md
      Chef::Log.info(
        'fb_storage: Stopping md resyncing while creating ' +
        'filesystem',
      )
      saved_limit = File.read(limit_file)
      File.write(limit_file, "0\n") # ~FB030
    end
    mkfs = Mixlib::ShellOut.new(cmd, :timeout => timeout)
    mkfs.run_command.error!
  ensure
    if saved_limit
      Chef::Log.info(
        'fb_storage: Resuming md resyncing after creating ' +
        'filesystem',
      )
      File.write(limit_file, saved_limit) # ~FB030
    end
  end
end
# Called after formatting
# No-op in the base class; present as a hook for subclasses to override.
def condition_partition(_partition); end
# Erase md metadata from +device+, then zero its first 100MB.
# Does nothing when the device path does not exist.
#
# A failing mdadm call raises; the dd result is deliberately not checked.
def nuke_raid_header(device)
  return unless File.exist?(device)

  # Nuke the metadata...
  zero_superblock = "#{MDADM} --zero-superblock --force #{device}"
  Chef::Log.debug("fb_storage: Running #{zero_superblock}")
  Mixlib::ShellOut.new(zero_superblock).run_command.error!

  # But it turns out that's not enough... if you don't also
  # nuke the FS header, it'll get auto-mounted if we build an array
  # later, which we're almost certainly about to do
  #
  # ... but don't fail if that dd failed, it may have been
  # smaller than 100MB :)
  zero_start = "dd if=/dev/zero of=#{device} bs=1024k count=100"
  Chef::Log.debug("fb_storage: Running #{zero_start}")
  Mixlib::ShellOut.new(zero_start).run_command
end
# Find the md array that lists +device+ as a member, journal or spare.
#
# Membership is decided by comparing the device's basename against the
# entries of node['mdadm'].
#
# Returns "/dev/<array>" for the first array containing the device, or nil
# when node['mdadm'] is unset or no array contains it.
def array_device_is_in(device)
  arrays = @node['mdadm']
  return nil unless arrays

  owner = arrays.find do |array, info|
    Chef::Log.debug(
      "fb_storage: Determining if #{device} is in " + array,
    )
    short_dev = ::File.basename(device)
    candidates = info['members'].dup
    candidates << info['journal'] if info['journal']
    candidates += info['spares'] if info['spares']
    found = candidates.include?(short_dev)
    verdict = found ? 'is in' : 'is NOT in'
    Chef::Log.debug("fb_storage: #{device} #{verdict} #{array}")
    found
  end
  owner ? "/dev/#{owner.first}" : nil
end
# in a separate method so we can mock it in tests
# Simply forwards +time+ to sleep.
def _sleep(time)
sleep(time)
end
# Take +device+ out of whichever md array currently contains it.
#
# The device is first marked faulty and then removed. If marking it faulty
# reports "Device or resource busy" and
# node['fb_storage']['stop_and_zero_mdadm_for_format'] is set, the array is
# stopped and the device's RAID header wiped instead.
#
# Returns nil. Does nothing when the device is in no array or the array's
# device node no longer exists.
# Raises (via ShellOut#error!) on any mdadm failure other than a busy device
# during removal, or when the device is still busy after all retries.
def remove_device_from_any_arrays(device)
  array = array_device_is_in(device)
  unless array
    Chef::Log.debug(
      "fb_storage: #{device} not found in any arrays",
    )
    return
  end
  unless File.exist?(array)
    Chef::Log.debug(
      "fb_storage: Skipping removing #{device} from " +
      "#{array} because #{array} no longer exists",
    )
    return
  end
  Chef::Log.info(
    "fb_storage: Removing #{device} from #{array}",
  )
  busy_msg = 'Device or resource busy'

  # first we set it faulty
  s = Mixlib::ShellOut.new(
    "#{MDADM} #{array} --fail #{device}",
  ).run_command
  # we need to stop the array and zero the superblock if the error is
  # device or resource busy (and if we are allowed to do so)
  if s.stderr.include?(busy_msg) &&
     @node['fb_storage']['stop_and_zero_mdadm_for_format']
    Chef::Log.info("fb_storage: Stopping array #{array}...")
    stop_array(array)
    Chef::Log.info(
      "fb_storage: Zeroing superblock for #{device}...",
    )
    nuke_raid_header(device)
    # we return early from the method here, zeroing superblock removes
    # the device from the array
    return
  end
  s.error!

  # Now, this can take a bit for the drive to quiesce, so we try to
  # remove a few times. It usually only takes ~1s, so we try after
  # a short sleep, and if that doesn't work we try every 10 seconds
  # for one minute
  _sleep(2)
  tries = 0
  max_tries = 6
  interval = 10
  loop do
    s = Mixlib::ShellOut.new(
      "#{MDADM} #{array} --remove #{device}",
    ).run_command
    # if it worked, break
    break unless s.error?

    # If it's any error other than the device being busy, fail. The message
    # is looked for on stderr, matching the --fail check above, and on
    # stdout as well so the previous behaviour is still covered.
    busy = [s.stderr, s.stdout].any? { |out| out.to_s.include?(busy_msg) }
    s.error! unless busy
    # Otherwise, if we've hit max_tries, bail out
    if tries == max_tries
      Chef::Log.error(
        "fb_storage: Failed to remove #{device} from " +
        "#{array} after #{max_tries} tries",
      )
      s.error!
    end
    Chef::Log.info(
      "fb_storage: #{device} still busy after setting it " +
      "faulty - sleeping #{interval} seconds and trying again to " +
      'remove it.',
    )
    # otherwise sleep for $interval seconds and try again
    _sleep(interval)
    tries += 1
  end
end
# Runs remove_device_from_any_arrays for each entry of +devices+.
# The body is a bare `each`, so for an Array the return value is +devices+
# itself: every device passed in, not only those that were actually found
# in an array. remove_all_partitions_from_all_arrays uses that return value.
def remove_from_arrays(devices)
devices.each do |device|
remove_device_from_any_arrays(device)
end
end
# When we're not an array, nuke anything holding us: the device and each of
# its existing partitions are passed to remove_from_arrays, and everything
# that call returns then has its RAID header wiped.
def remove_all_partitions_from_all_arrays
  targets = existing_partitions + [@device]
  Chef::Log.debug(
    "fb_storage: Removing all partitions from all arrays that contain " \
    "any of #{targets}",
  )
  remove_from_arrays(targets).each do |dev|
    nuke_raid_header(dev)
  end
end
# Stop the md array at +array+ with `mdadm -S`, raising if mdadm fails.
# When the array's device node is already gone this only logs at debug level.
def stop_array(array)
  unless File.exist?(array)
    return Chef::Log.debug(
      "fb_storage: Skipping request to stop #{array} because it no " \
      'longer exists',
    )
  end

  Chef::Log.info("fb_storage: Stopping array: #{array}")
  Mixlib::ShellOut.new("#{MDADM} -S #{array}").run_command.error!
end
# Unmount every existing partition of the device, then the device itself.
def umount_all_partitions
  existing_partitions.each { |partition| umount_by_partition(partition) }
  umount_device
end
# Unmount every mount point that the node's filesystem data records for
# +partition+. Does nothing when the partition has no entry or no 'mounts'.
def umount_by_partition(partition)
  # the 'mounts' check should be all that's necessary - unless we
  # partitioned this device in this run :)
  entry = @node.filesystem_data['by_device'][partition]
  mount_points = entry && entry['mounts']
  return unless mount_points

  mount_points.each { |mount_point| umount(mount_point) }
end
# Unmount every mount point that the node's filesystem data records for the
# whole device. Does nothing when the device has no entry or no 'mounts'.
def umount_device
  entry = @node.filesystem_data['by_device'][@device]
  mount_points = entry && entry['mounts']
  return unless mount_points

  mount_points.each { |mount_point| umount(mount_point) }
end
# Unmount the mount point +m+, raising if /bin/umount fails.
def umount(m)
  # we may call umount on the same thing more than once depending on
  # our path through the system, so check it's actually mounted.
  return unless Pathname.new(m).mountpoint?

  Chef::Log.info("fb_storage: Unmounting #{m}")
  Mixlib::ShellOut.new("/bin/umount #{m}").run_command.error!
end
# Device path of partition number +num+ on this handler's device, as computed
# by FB::Storage.partition_device_name.
def partition_device_name(num)
FB::Storage.partition_device_name(@device, num)
end
# Partitions of this device that appear in the node's filesystem data.
#
# A key counts as a partition only if it is the device path followed by a
# partition number, using a 'p' separator exactly when the device name ends
# in a digit (sda -> sda1, nvme0n1 -> nvme0n1p1). A bare prefix match is not
# enough: /dev/sdaa and /dev/sdaa1 start with "/dev/sda" but belong to a
# different disk.
#
# The result is computed once and memoized.
#
# Returns an Array of device path Strings.
def existing_partitions
  @existing_partitions ||= begin
    separator = @device =~ /\d\z/ ? 'p' : ''
    partition_of_device = /\A#{Regexp.escape(@device)}#{separator}\d+\z/
    @node.filesystem_data['by_device'].keys.select do |path|
      path =~ partition_of_device
    end
  end
end
# Base mkfs command line (with the force flag) for a filesystem type.
#
# Returns a new String on every call, or nil for a type that is not xfs,
# btrfs, ext4, ext3 or ext2.
def mkfs_cmd(type)
  if type == 'xfs'
    'mkfs -t xfs -f'
  elsif type == 'btrfs'
    'mkfs.btrfs -f'
  elsif %w{ext4 ext3 ext2}.include?(type)
    "mkfs -t #{type} -F"
  end
end
# Default mkfs options for a filesystem type.
#
# Returns a new String on every call (callers append to it), or nil for a
# type other than xfs, btrfs or ext4.
def default_format_options(type)
  if type == 'xfs'
    '-i size=2048'
  elsif type == 'btrfs'
    '-l 16K -n 16K'
  elsif type == 'ext4'
    ''
  end
end
# Handler for devices whose name starts with "fio".
class FioHandler < FB::Storage::Handler
  # Besides the base setup, works out the raw device path that the fio-*
  # tools are run against: the first "fio" in the path becomes "fct" and a
  # trailing letter a-j becomes the digit 0-9.
  def initialize(device, node)
    super
    @type = :fio
    raw = device.sub('fio', 'fct')
    raw[-1] = raw[-1].tr('[a-j]', '[0-9]')
    @raw_device = raw
  end

  # Claims any device whose name starts with "fio".
  def self.match?(devname, _info)
    devname.start_with?('fio')
  end

  # Runs fio-detach, fio-format -y and fio-attach, in that order, against
  # the raw device. Each step raises if its command fails.
  def prep_device
    # "format" isn't "make a filesystem" but some other magical
    # flash prep thing
    { 'detach' => nil,
      'format' => '-y',
      'attach' => nil }.each do |step, opts|
      tool = "/usr/bin/fio-#{step}"
      tool = "#{tool} #{opts}" if opts
      full_cmd = "#{tool} #{@raw_device}"
      Chef::Log.debug("fb_storage: Running #{full_cmd}")
      Mixlib::ShellOut.new(full_cmd).run_command.error!
    end
  end
end
# Catch-all handler: its match? accepts every device.
class JbodHandler < FB::Storage::Handler
def initialize(device, node)
super
@type = :jbod
# Current AOE drivers have a bug where big partitions can take
# more than 10 minutes to run mkfs
# NOTE(review): 900 is also the value the base initializer assigns, so
# this line currently restates the default rather than changing it.
self.mkfs_timeout = 900
end
# JBOD always matches as it'll work for every block device
def self.match?(_devname, _info)
true
end
end
# Handler for md (software RAID) devices.
class MdHandler < FB::Storage::Handler
  def initialize(device, node)
    super
    @type = :md
  end

  # Claims any device whose name starts with "md".
  def self.match?(devname, _info)
    devname.start_with?('md')
  end

  # Same as the base defaults, with -K appended for xfs.
  def default_format_options(type)
    opts = super(type)
    # there's no need to discard blocks on md devices
    opts << ' -K' if type == 'xfs'
    opts
  end

  # Create the array with mdadm and then ask udev to re-trigger the device.
  #
  # config - Hash; reads 'members', 'raid_level' and the optional
  #          'raid_stripe_size', 'journal' and 'create_options'.
  #
  # Raises if either command fails.
  def build(config)
    members = config['members']
    Chef::Log.info("fb_storage: Creating array: #{@device}")
    Chef::Log.debug("fb_storage: ... out of #{members}")
    # We set homehost to `any` since many of our hostnames won't fit
    # in the md superblock's name field, and that causes the hostname
    # not to match and the device to end up as md127.
    #
    # It's also worth noting that we don't use --name, which despite
    # earlier code's implication, is actually the field that holds
    # "$NAME:$INDEX" (related to homehost above), so we don't set it
    # statically and mdadm will set it to "any:$INDEX" when we specify
    # --homehost=any
    pieces = [
      "echo y | #{MDADM} --create #{@device} --force",
      "--homehost=any --raid-devices=#{members.length}",
      "--level=#{config['raid_level']}",
    ]
    if config['raid_stripe_size']
      pieces << "--chunk #{config['raid_stripe_size']}"
    end
    pieces << "--write-journal #{config['journal']}" if config['journal']
    pieces << config['create_options'].to_s if config['create_options']
    pieces << members.join(' ')
    Mixlib::ShellOut.new(pieces.join(' ')).run_command.error!
    Mixlib::ShellOut.new("udevadm trigger #{@device}").run_command.error!
  end

  # Unmount the array's filesystem and stop the array.
  def stop
    umount_by_partition(@device)
    stop_array(@device)
  end

  # Unmount each member listed in config['members'] and wipe its RAID header.
  def wipe_member_devices(config)
    config['members'].each do |member|
      Chef::Log.info("fb_storage: Wiping out #{member}")
      umount_by_partition(member)
      nuke_raid_header(member)
    end
  end
end
end
end
end