static int raid5_run()

in raid5.c [7457:7821]


/*
 * raid5_run - validate the array description in @mddev and start the array.
 *
 * In order, this routine:
 *  - initialises the accounting bioset and the writes-pending state;
 *  - walks the member devices to find the journal device (if any) and the
 *    extreme value of (new_data_offset - data_offset) in the direction of
 *    the reshape;
 *  - rejects an array that has both a journal and a bitmap;
 *  - if a reshape is recorded (reshape_position != MaxSector), checks that
 *    it can be resumed, otherwise asserts that old and new geometry agree;
 *  - obtains the r5conf (mddev->private if already set, else setup_conf())
 *    and moves its thread over to the mddev;
 *  - counts not-in-sync devices that only hold parity, computes
 *    mddev->degraded and refuses to start a failed array or (unless PPL or
 *    ok_start_degraded allow it) a dirty degraded one;
 *  - registers the "reshape" sync thread when a reshape is pending;
 *  - creates the sysfs attribute group, sets the array size and, when a
 *    request queue exists, its limits and discard settings;
 *  - initialises the journal/PPL log through log_init().
 *
 * Return: 0 on success.  -ENOMEM if the accounting bioset or the
 * writes-pending initialisation fails, -EINVAL for a configuration rejected
 * before the conf is obtained, PTR_ERR() of a failed setup_conf(), and -EIO
 * for every failure taken through the "abort" label.
 */
static int raid5_run(struct mddev *mddev)
{
	struct r5conf *conf;
	/* NOTE(review): working_disks is incremented below but never read here. */
	int working_disks = 0;
	int dirty_parity_disks = 0;
	struct md_rdev *rdev;
	struct md_rdev *journal_dev = NULL;
	sector_t reshape_offset = 0;
	int i, ret = 0;
	long long min_offset_diff = 0;
	int first = 1;

	if (acct_bioset_init(mddev)) {
		pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
		return -ENOMEM;
	}

	if (mddev_init_writes_pending(mddev) < 0) {
		ret = -ENOMEM;
		goto exit_acct_set;
	}

	if (mddev->recovery_cp != MaxSector)
		pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
			  mdname(mddev));

	/*
	 * Remember the journal device and, over all devices that have a slot
	 * in the array, track the offset difference: the smallest one when
	 * reshaping backwards, the largest one otherwise (so despite its name
	 * min_offset_diff is a maximum for a forward reshape).
	 */
	rdev_for_each(rdev, mddev) {
		long long diff;

		if (test_bit(Journal, &rdev->flags)) {
			journal_dev = rdev;
			continue;
		}
		if (rdev->raid_disk < 0)
			continue;
		diff = (rdev->new_data_offset - rdev->data_offset);
		if (first) {
			min_offset_diff = diff;
			first = 0;
		} else if (mddev->reshape_backwards &&
			 diff < min_offset_diff)
			min_offset_diff = diff;
		else if (!mddev->reshape_backwards &&
			 diff > min_offset_diff)
			min_offset_diff = diff;
	}

	/* A journal (flagged or actually present) excludes any kind of bitmap. */
	if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
	    (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
		pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
			  mdname(mddev));
		ret = -EINVAL;
		goto exit_acct_set;
	}

	if (mddev->reshape_position != MaxSector) {
		/* Check that we can continue the reshape.
		 * Difficulties arise if the stripe we would write to
		 * next is at or after the stripe we would read from next.
		 * For a reshape that changes the number of devices, this
		 * is only possible for a very short time, and mdadm makes
		 * sure that time appears to have passed before assembling
		 * the array.  So we fail if that time hasn't passed.
		 * For a reshape that keeps the number of devices the same
		 * mdadm must be monitoring the reshape and keeping the
		 * critical areas read-only and backed up.  It will start
		 * the array in read-only mode, so we check for that.
		 */
		sector_t here_new, here_old;
		int old_disks;
		int max_degraded = (mddev->level == 6 ? 2 : 1);
		int chunk_sectors;
		int new_data_disks;

		if (journal_dev) {
			pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
				mdname(mddev));
			ret = -EINVAL;
			goto exit_acct_set;
		}

		if (mddev->new_level != mddev->level) {
			pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
				mdname(mddev));
			ret = -EINVAL;
			goto exit_acct_set;
		}
		old_disks = mddev->raid_disks - mddev->delta_disks;
		/* reshape_position must be on a new-stripe boundary, and one
		 * further up in new geometry must map after here in old
		 * geometry.
		 * If the chunk sizes are different, then as we perform reshape
		 * in units of the largest of the two, reshape_position needs
		 * be a multiple of the largest chunk size times new data disks.
		 */
		here_new = mddev->reshape_position;
		chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
		new_data_disks = mddev->raid_disks - max_degraded;
		/*
		 * sector_div() divides here_new in place and its result is used
		 * as the remainder: non-zero means reshape_position is not a
		 * whole number of new-geometry stripes.
		 */
		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
			pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
				mdname(mddev));
			ret = -EINVAL;
			goto exit_acct_set;
		}
		/* Stripe index times chunk size; compared with recovery_offset below. */
		reshape_offset = here_new * chunk_sectors;
		/* here_new is the stripe we will write to */
		here_old = mddev->reshape_position;
		sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
		/* here_old is the first stripe that we might need to read
		 * from */
		if (mddev->delta_disks == 0) {
			/* We cannot be sure it is safe to start an in-place
			 * reshape.  It is only safe if user-space is monitoring
			 * and taking constant backups.
			 * mdadm always starts a situation like this in
			 * readonly mode so it can take control before
			 * allowing any writes.  So just check for that.
			 */
			if (abs(min_offset_diff) >= mddev->chunk_sectors &&
			    abs(min_offset_diff) >= mddev->new_chunk_sectors)
				/* not really in-place - so OK */;
			else if (mddev->ro == 0) {
				pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
					mdname(mddev));
				ret = -EINVAL;
				goto exit_acct_set;
			}
		} else if (mddev->reshape_backwards
		    ? (here_new * chunk_sectors + min_offset_diff <=
		       here_old * chunk_sectors)
		    : (here_new * chunk_sectors >=
		       here_old * chunk_sectors + (-min_offset_diff))) {
			/* Reading from the same stripe as writing to - bad */
			pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
				mdname(mddev));
			ret = -EINVAL;
			goto exit_acct_set;
		}
		pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
		/* OK, we should be able to continue; */
	} else {
		/* No reshape recorded: old and new geometry must be identical. */
		BUG_ON(mddev->level != mddev->new_level);
		BUG_ON(mddev->layout != mddev->new_layout);
		BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
		BUG_ON(mddev->delta_disks != 0);
	}

	/* Journal wins over PPL when both flags are set. */
	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
	    test_bit(MD_HAS_PPL, &mddev->flags)) {
		pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
			mdname(mddev));
		clear_bit(MD_HAS_PPL, &mddev->flags);
		clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
	}

	/* Reuse a conf that is already attached, otherwise build a new one. */
	if (mddev->private == NULL)
		conf = setup_conf(mddev);
	else
		conf = mddev->private;

	if (IS_ERR(conf)) {
		ret = PTR_ERR(conf);
		goto exit_acct_set;
	}

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		if (!journal_dev) {
			pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
				mdname(mddev));
			mddev->ro = 1;
			set_disk_ro(mddev->gendisk, 1);
		} else if (mddev->recovery_cp == MaxSector)
			set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
	}

	/*
	 * Move the thread from the conf to the mddev and attach the conf.
	 * From here on failures go through "abort", which frees the conf.
	 */
	conf->min_offset_diff = min_offset_diff;
	mddev->thread = conf->thread;
	conf->thread = NULL;
	mddev->private = conf;

	/*
	 * Visit the conf->raid_disks slots.  The second clause of the loop
	 * condition only tests previous_raid_disks for being non-zero; it is
	 * not an upper bound on i.
	 */
	for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
	     i++) {
		rdev = conf->disks[i].rdev;
		if (!rdev && conf->disks[i].replacement) {
			/* The replacement is all we have yet */
			rdev = conf->disks[i].replacement;
			conf->disks[i].replacement = NULL;
			clear_bit(Replacement, &rdev->flags);
			conf->disks[i].rdev = rdev;
		}
		if (!rdev)
			continue;
		if (conf->disks[i].replacement &&
		    conf->reshape_progress != MaxSector) {
			/* replacements and reshape simply do not mix. */
			pr_warn("md: cannot handle concurrent replacement and reshape.\n");
			goto abort;
		}
		if (test_bit(In_sync, &rdev->flags)) {
			working_disks++;
			continue;
		}
		/* This disc is not fully in-sync.  However if it
		 * just stored parity (beyond the recovery_offset),
		 * then we don't need to be concerned about the
		 * array being dirty.
		 * When reshape goes 'backwards', we never have
		 * partially completed devices, so we only need
		 * to worry about reshape going forwards.
		 */
		/* Hack because v0.91 doesn't store recovery_offset properly. */
		if (mddev->major_version == 0 &&
		    mddev->minor_version > 90)
			rdev->recovery_offset = reshape_offset;

		if (rdev->recovery_offset < reshape_offset) {
			/* We need to check old and new layout */
			if (!only_parity(rdev->raid_disk,
					 conf->algorithm,
					 conf->raid_disks,
					 conf->max_degraded))
				continue;
		}
		/* The old layout is always checked. */
		if (!only_parity(rdev->raid_disk,
				 conf->prev_algo,
				 conf->previous_raid_disks,
				 conf->max_degraded))
			continue;
		dirty_parity_disks++;
	}

	/*
	 * 0 for a fully functional array, 1 or 2 for a degraded array.
	 */
	mddev->degraded = raid5_calc_degraded(conf);

	if (has_failed(conf)) {
		pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
			mdname(mddev), mddev->degraded, conf->raid_disks);
		goto abort;
	}

	/* device size must be a multiple of chunk size */
	/*
	 * NOTE(review): the mask form rounds down correctly only if
	 * chunk_sectors is a power of two - confirm that is guaranteed here.
	 */
	mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1);
	mddev->resync_max_sectors = mddev->dev_sectors;

	/*
	 * Dirty and degraded by more than the parity-only devices counted
	 * above: start only with PPL or when ok_start_degraded is set.
	 */
	if (mddev->degraded > dirty_parity_disks &&
	    mddev->recovery_cp != MaxSector) {
		if (test_bit(MD_HAS_PPL, &mddev->flags))
			pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
				mdname(mddev));
		else if (mddev->ok_start_degraded)
			pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
				mdname(mddev));
		else {
			pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
				mdname(mddev));
			goto abort;
		}
	}

	pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
		mdname(mddev), conf->level,
		mddev->raid_disks-mddev->degraded, mddev->raid_disks,
		mddev->new_layout);

	print_raid5_conf(conf);

	/*
	 * A reshape is pending: mark the recovery state as a running reshape
	 * and register the sync thread that will carry it on.
	 */
	if (conf->reshape_progress != MaxSector) {
		conf->reshape_safe = conf->reshape_progress;
		atomic_set(&conf->reshape_stripes, 0);
		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
							"reshape");
		if (!mddev->sync_thread)
			goto abort;
	}

	/* Ok, everything is just fine now */
	/*
	 * If the attribute group was queued for removal, cancel the removal;
	 * otherwise create it.  A creation failure is only logged.
	 */
	if (mddev->to_remove == &raid5_attrs_group)
		mddev->to_remove = NULL;
	else if (mddev->kobj.sd &&
	    sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
		pr_warn("raid5: failed to create sysfs attributes for %s\n",
			mdname(mddev));
	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));

	if (mddev->queue) {
		int chunk_size;
		/*
		 * stripe starts as the number of pages in one full data
		 * stripe of the previous geometry (data disks * pages per
		 * chunk) and is converted to bytes further down.
		 * NOTE(review): the comment that used to sit here described
		 * read-ahead sizing, but no read-ahead value is set in this
		 * function.
		 */
		int data_disks = conf->previous_raid_disks - conf->max_degraded;
		int stripe = data_disks *
			((mddev->chunk_sectors << 9) / PAGE_SIZE);

		chunk_size = mddev->chunk_sectors << 9;
		blk_queue_io_min(mddev->queue, chunk_size);
		raid5_set_io_opt(conf);
		mddev->queue->limits.raid_partial_stripes_expensive = 1;
		/*
		 * We can only discard a whole stripe. It doesn't make sense to
		 * discard data disk but write parity disk
		 */
		stripe = stripe * PAGE_SIZE;
		stripe = roundup_pow_of_two(stripe);
		mddev->queue->limits.discard_alignment = stripe;
		mddev->queue->limits.discard_granularity = stripe;

		blk_queue_max_write_same_sectors(mddev->queue, 0);
		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);

		/* Stack each member's limits for both its old and new data offset. */
		rdev_for_each(rdev, mddev) {
			disk_stack_limits(mddev->gendisk, rdev->bdev,
					  rdev->data_offset << 9);
			disk_stack_limits(mddev->gendisk, rdev->bdev,
					  rdev->new_data_offset << 9);
		}

		/*
		 * zeroing is required, otherwise data
		 * could be lost. Consider a scenario: discard a stripe
		 * (the stripe could be inconsistent if
		 * discard_zeroes_data is 0); write one disk of the
		 * stripe (the stripe could be inconsistent again
		 * depending on which disks are used to calculate
		 * parity); the disk is broken; The stripe data of this
		 * disk is lost.
		 *
		 * We only allow DISCARD if the sysadmin has confirmed that
		 * only safe devices are in use by setting a module parameter.
		 * A better idea might be to turn DISCARD into WRITE_ZEROES
		 * requests, as that is required to be safe.
		 */
		if (devices_handle_discard_safely &&
		    mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
		    mddev->queue->limits.discard_granularity >= stripe)
			blk_queue_flag_set(QUEUE_FLAG_DISCARD,
						mddev->queue);
		else
			blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
						mddev->queue);

		blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
	}

	if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
		goto abort;

	return 0;
	/*
	 * Failure after the conf was attached: stop the array thread, dump and
	 * free the conf, and report -EIO whatever the original cause was.
	 * NOTE(review): this path does not unregister mddev->sync_thread or
	 * undo the MD_RECOVERY_* bits and sysfs group set up above, yet
	 * log_init() can fail after them - confirm the caller cleans up.
	 */
abort:
	md_unregister_thread(&mddev->thread);
	print_raid5_conf(conf);
	free_conf(conf);
	mddev->private = NULL;
	pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
	ret = -EIO;
	/* Common exit for every failure: release the accounting bioset. */
exit_acct_set:
	acct_bioset_exit(mddev);
	return ret;
}