in raid5.c [7457:7821]
/*
 * raid5_run - validate the configuration of @mddev and start the array.
 *
 * Steps, in order: initialise the accounting bioset and writes_pending,
 * scan the member devices, verify that an interrupted reshape (if any) can
 * safely be continued, obtain the r5conf, count degraded/dirty devices,
 * optionally start the "reshape" sync thread, publish sysfs attributes and
 * queue limits, and finally initialise the journal/PPL log.
 *
 * Returns 0 on success.  On failure returns:
 *   -ENOMEM  acct bioset or writes_pending initialisation failed;
 *   -EINVAL  journal combined with bitmap, or a reshape that cannot be
 *            continued;
 *   PTR_ERR() of the value returned by setup_conf();
 *   -EIO     any failure after the conf has been attached ("abort" path).
 * Every failure path releases the accounting bioset.
 */
static int raid5_run(struct mddev *mddev)
{
struct r5conf *conf;
/* NOTE: working_disks is counted below but not otherwise read here. */
int working_disks = 0;
int dirty_parity_disks = 0;
struct md_rdev *rdev;
struct md_rdev *journal_dev = NULL;
sector_t reshape_offset = 0;
int i, ret = 0;
long long min_offset_diff = 0;
int first = 1;
/* From here on, every error exit must go through exit_acct_set. */
if (acct_bioset_init(mddev)) {
pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
return -ENOMEM;
}
if (mddev_init_writes_pending(mddev) < 0) {
ret = -ENOMEM;
goto exit_acct_set;
}
/* recovery_cp other than MaxSector is treated as "not clean". */
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
mdname(mddev));
/*
 * Scan the member devices: remember the journal device (the last one
 * seen with the Journal flag) and, over devices that occupy a raid slot,
 * track the extreme of (new_data_offset - data_offset) -- the smallest
 * value when reshaping backwards, the largest otherwise.
 */
rdev_for_each(rdev, mddev) {
long long diff;
if (test_bit(Journal, &rdev->flags)) {
journal_dev = rdev;
continue;
}
/* Devices without a raid slot do not take part in the offset check. */
if (rdev->raid_disk < 0)
continue;
diff = (rdev->new_data_offset - rdev->data_offset);
if (first) {
min_offset_diff = diff;
first = 0;
} else if (mddev->reshape_backwards &&
diff < min_offset_diff)
min_offset_diff = diff;
else if (!mddev->reshape_backwards &&
diff > min_offset_diff)
min_offset_diff = diff;
}
/*
 * A journal (MD_HAS_JOURNAL set, or a Journal device actually present)
 * is rejected together with a bitmap (internal offset or file).
 */
if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
(mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
mdname(mddev));
ret = -EINVAL;
goto exit_acct_set;
}
if (mddev->reshape_position != MaxSector) {
/* Check that we can continue the reshape.
 * Difficulties arise if the stripe we would write to
 * next is at or after the stripe we would read from next.
 * For a reshape that changes the number of devices, this
 * is only possible for a very short time, and mdadm makes
 * sure that time appears to have passed before assembling
 * the array. So we fail if that time hasn't passed.
 * For a reshape that keeps the number of devices the same
 * mdadm must be monitoring the reshape and keeping the
 * critical areas read-only and backed up. It will start
 * the array in read-only mode, so we check for that.
 */
sector_t here_new, here_old;
int old_disks;
int max_degraded = (mddev->level == 6 ? 2 : 1);
int chunk_sectors;
int new_data_disks;
if (journal_dev) {
pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
mdname(mddev));
ret = -EINVAL;
goto exit_acct_set;
}
if (mddev->new_level != mddev->level) {
pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
mdname(mddev));
ret = -EINVAL;
goto exit_acct_set;
}
/* Device count before the reshape: current count minus the delta. */
old_disks = mddev->raid_disks - mddev->delta_disks;
/* reshape_position must be on a new-stripe boundary, and one
 * further up in new geometry must map after here in old
 * geometry.
 * If the chunk sizes are different, then as we perform reshape
 * in units of the largest of the two, reshape_position needs
 * to be a multiple of the largest chunk size times new data disks.
 */
here_new = mddev->reshape_position;
chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
new_data_disks = mddev->raid_disks - max_degraded;
/*
 * sector_div() divides here_new in place (leaving the quotient) and
 * yields the remainder, so a non-zero result means misalignment.
 */
if (sector_div(here_new, chunk_sectors * new_data_disks)) {
pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
mdname(mddev));
ret = -EINVAL;
goto exit_acct_set;
}
/* Compared against each device's recovery_offset further down. */
reshape_offset = here_new * chunk_sectors;
/* here_new is the stripe we will write to */
here_old = mddev->reshape_position;
sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
/* here_old is the first stripe that we might need to read
 * from */
if (mddev->delta_disks == 0) {
/* We cannot be sure it is safe to start an in-place
 * reshape. It is only safe if user-space is monitoring
 * and taking constant backups.
 * mdadm always starts a situation like this in
 * readonly mode so it can take control before
 * allowing any writes. So just check for that.
 */
if (abs(min_offset_diff) >= mddev->chunk_sectors &&
abs(min_offset_diff) >= mddev->new_chunk_sectors)
/* not really in-place - so OK */;
else if (mddev->ro == 0) {
pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
mdname(mddev));
ret = -EINVAL;
goto exit_acct_set;
}
/*
 * Device count changes.  Backwards: refuse if the write position
 * shifted by min_offset_diff is at or before the read position.
 * Forwards: refuse if the write position is at or beyond the read
 * position shifted by -min_offset_diff.
 */
} else if (mddev->reshape_backwards
? (here_new * chunk_sectors + min_offset_diff <=
here_old * chunk_sectors)
: (here_new * chunk_sectors >=
here_old * chunk_sectors + (-min_offset_diff))) {
/* Reading from the same stripe as writing to - bad */
pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
mdname(mddev));
ret = -EINVAL;
goto exit_acct_set;
}
pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
/* OK, we should be able to continue; */
} else {
/* No reshape pending: the "new" geometry must equal the current one. */
BUG_ON(mddev->level != mddev->new_level);
BUG_ON(mddev->layout != mddev->new_layout);
BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
BUG_ON(mddev->delta_disks != 0);
}
/* Journal and PPL are mutually exclusive; the journal wins. */
if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
test_bit(MD_HAS_PPL, &mddev->flags)) {
pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
mdname(mddev));
clear_bit(MD_HAS_PPL, &mddev->flags);
clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
}
/*
 * Reuse a conf already stored in mddev->private (presumably left there
 * by a level takeover -- TODO confirm), otherwise build a fresh one.
 */
if (mddev->private == NULL)
conf = setup_conf(mddev);
else
conf = mddev->private;
if (IS_ERR(conf)) {
ret = PTR_ERR(conf);
goto exit_acct_set;
}
/*
 * Journal expected but no journal device found: force the array and its
 * gendisk read-only.  Journal present and array clean: mark the journal
 * clean as well.
 */
if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
if (!journal_dev) {
pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
mdname(mddev));
mddev->ro = 1;
set_disk_ro(mddev->gendisk, 1);
} else if (mddev->recovery_cp == MaxSector)
set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
}
conf->min_offset_diff = min_offset_diff;
/* Hand the thread over from the conf to the mddev and attach the conf. */
mddev->thread = conf->thread;
conf->thread = NULL;
mddev->private = conf;
/*
 * Walk the device slots, promoting lone replacements and counting
 * in-sync devices and out-of-sync devices that only hold parity.
 * NOTE(review): the loop condition uses conf->previous_raid_disks purely
 * as a truth value; a bound such as max(raid_disks, previous_raid_disks)
 * may have been intended -- confirm before changing.
 */
for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
i++) {
rdev = conf->disks[i].rdev;
if (!rdev && conf->disks[i].replacement) {
/* The replacement is all we have yet */
rdev = conf->disks[i].replacement;
conf->disks[i].replacement = NULL;
clear_bit(Replacement, &rdev->flags);
conf->disks[i].rdev = rdev;
}
if (!rdev)
continue;
if (conf->disks[i].replacement &&
conf->reshape_progress != MaxSector) {
/* replacements and reshape simply do not mix. */
pr_warn("md: cannot handle concurrent replacement and reshape.\n");
goto abort;
}
if (test_bit(In_sync, &rdev->flags)) {
working_disks++;
continue;
}
/* This disc is not fully in-sync. However if it
 * just stored parity (beyond the recovery_offset),
 * then we don't need to be concerned about the
 * array being dirty.
 * When reshape goes 'backwards', we never have
 * partially completed devices, so we only need
 * to worry about reshape going forwards.
 */
/* Hack because v0.91 doesn't store recovery_offset properly. */
if (mddev->major_version == 0 &&
mddev->minor_version > 90)
rdev->recovery_offset = reshape_offset;
if (rdev->recovery_offset < reshape_offset) {
/* We need to check old and new layout */
if (!only_parity(rdev->raid_disk,
conf->algorithm,
conf->raid_disks,
conf->max_degraded))
continue;
}
/* The previous layout is checked in every case. */
if (!only_parity(rdev->raid_disk,
conf->prev_algo,
conf->previous_raid_disks,
conf->max_degraded))
continue;
dirty_parity_disks++;
}
/*
 * 0 for a fully functional array, 1 or 2 for a degraded array.
 */
mddev->degraded = raid5_calc_degraded(conf);
if (has_failed(conf)) {
pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
mdname(mddev), mddev->degraded, conf->raid_disks);
goto abort;
}
/* device size must be a multiple of chunk size */
/*
 * The mask rounds down correctly only when chunk_sectors is a power of
 * two -- presumably enforced elsewhere; TODO confirm.
 */
mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1);
mddev->resync_max_sectors = mddev->dev_sectors;
/*
 * Dirty and degraded by more than the parity-only devices: allowed with
 * PPL or when ok_start_degraded is set, refused otherwise.
 */
if (mddev->degraded > dirty_parity_disks &&
mddev->recovery_cp != MaxSector) {
if (test_bit(MD_HAS_PPL, &mddev->flags))
pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
mdname(mddev));
else if (mddev->ok_start_degraded)
pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
mdname(mddev));
else {
pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
mdname(mddev));
goto abort;
}
}
pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
mdname(mddev), conf->level,
mddev->raid_disks-mddev->degraded, mddev->raid_disks,
mddev->new_layout);
print_raid5_conf(conf);
/*
 * A reshape is in progress: mark recovery as a running reshape (not a
 * sync or check) and register the "reshape" sync thread.
 */
if (conf->reshape_progress != MaxSector) {
conf->reshape_safe = conf->reshape_progress;
atomic_set(&conf->reshape_stripes, 0);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"reshape");
if (!mddev->sync_thread)
goto abort;
}
/* Ok, everything is just fine now */
/*
 * If the attribute group was queued for removal, cancel the removal;
 * otherwise create it when the kobject has a sysfs entry.  A creation
 * failure is only logged, it does not fail the start.
 */
if (mddev->to_remove == &raid5_attrs_group)
mddev->to_remove = NULL;
else if (mddev->kobj.sd &&
sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
pr_warn("raid5: failed to create sysfs attributes for %s\n",
mdname(mddev));
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
if (mddev->queue) {
int chunk_size;
/* 'stripe' starts as the number of pages in one full stripe of
 * data (data disks times pages per chunk, using the previous
 * device count) and is converted to bytes below for the discard
 * limits.  Read-ahead is not set directly in this block.
 * NOTE(review): this arithmetic is done in int; very large chunk
 * sizes could overflow -- confirm the permitted range.
 */
int data_disks = conf->previous_raid_disks - conf->max_degraded;
int stripe = data_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE);
/* chunk size in bytes (sectors << 9) becomes the minimum I/O size */
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
raid5_set_io_opt(conf);
mddev->queue->limits.raid_partial_stripes_expensive = 1;
/*
 * We can only discard a whole stripe. It doesn't make sense to
 * discard data disk but write parity disk
 */
stripe = stripe * PAGE_SIZE;
/* Presumably discard handling expects a power-of-two size -- TODO confirm. */
stripe = roundup_pow_of_two(stripe);
mddev->queue->limits.discard_alignment = stripe;
mddev->queue->limits.discard_granularity = stripe;
/* WRITE SAME and WRITE ZEROES are disabled on the array queue. */
blk_queue_max_write_same_sectors(mddev->queue, 0);
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
/*
 * Stack each member's limits twice: once at its current data offset
 * and once at its new (reshape) data offset.
 */
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
}
/*
 * zeroing is required, otherwise data
 * could be lost. Consider a scenario: discard a stripe
 * (the stripe could be inconsistent if
 * discard_zeroes_data is 0); write one disk of the
 * stripe (the stripe could be inconsistent again
 * depending on which disks are used to calculate
 * parity); the disk is broken; The stripe data of this
 * disk is lost.
 *
 * We only allow DISCARD if the sysadmin has confirmed that
 * only safe devices are in use by setting a module parameter.
 * A better idea might be to turn DISCARD into WRITE_ZEROES
 * requests, as that is required to be safe.
 */
/*
 * Besides the module parameter, the stacked limits must still allow a
 * whole-stripe discard (stripe is in bytes, hence >> 9 for sectors).
 */
if (devices_handle_discard_safely &&
mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
mddev->queue->limits.discard_granularity >= stripe)
blk_queue_flag_set(QUEUE_FLAG_DISCARD,
mddev->queue);
else
blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
mddev->queue);
blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
}
/* Last step: initialise the log (journal device and/or PPL). */
if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
goto abort;
return 0;
/*
 * abort: failure after the conf was attached to the mddev.  Stops the
 * thread, frees the conf and always reports -EIO.
 * NOTE(review): a sync_thread registered above is not unregistered here
 * (reachable when log_init() fails) -- confirm the caller cleans it up.
 */
abort:
md_unregister_thread(&mddev->thread);
print_raid5_conf(conf);
free_conf(conf);
mddev->private = NULL;
pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
ret = -EIO;
/* exit_acct_set: common exit for all failures; releases the acct bioset. */
exit_acct_set:
acct_bioset_exit(mddev);
return ret;
}