in raid5.c [4878:5271]
static void handle_stripe(struct stripe_head *sh)
{
struct stripe_head_state s;
struct r5conf *conf = sh->raid_conf;
int i;
int prexor;
int disks = sh->disks;
struct r5dev *pdev, *qdev;
clear_bit(STRIPE_HANDLE, &sh->state);
/*
* handle_stripe should not continue handle the batched stripe, only
* the head of batch list or lone stripe can continue. Otherwise we
* could see break_stripe_batch_list warns about the STRIPE_ACTIVE
* is set for the batched stripe.
*/
if (clear_batch_ready(sh))
return;
if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
/* already being handled, ensure it gets handled
* again when current action finishes */
set_bit(STRIPE_HANDLE, &sh->state);
return;
}
if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
break_stripe_batch_list(sh, 0);
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
spin_lock(&sh->stripe_lock);
/*
* Cannot process 'sync' concurrently with 'discard'.
* Flush data in r5cache before 'sync'.
*/
if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
!test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
!test_bit(STRIPE_DISCARD, &sh->state) &&
test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state);
clear_bit(STRIPE_REPLACED, &sh->state);
}
spin_unlock(&sh->stripe_lock);
}
clear_bit(STRIPE_DELAYED, &sh->state);
pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
(unsigned long long)sh->sector, sh->state,
atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
sh->check_state, sh->reconstruct_state);
analyse_stripe(sh, &s);
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
goto finish;
if (s.handle_bad_blocks ||
test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
}
if (unlikely(s.blocked_rdev)) {
if (s.syncing || s.expanding || s.expanded ||
s.replacing || s.to_write || s.written) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
}
/* There is nothing for the blocked_rdev to block */
rdev_dec_pending(s.blocked_rdev, conf->mddev);
s.blocked_rdev = NULL;
}
if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
set_bit(STRIPE_BIOFILL_RUN, &sh->state);
}
pr_debug("locked=%d uptodate=%d to_read=%d"
" to_write=%d failed=%d failed_num=%d,%d\n",
s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
s.failed_num[0], s.failed_num[1]);
/*
* check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed.
*
* When journal device failed (log_failed), we will only process
* the stripe if there is data need write to raid disks
*/
if (s.failed > conf->max_degraded ||
(s.log_failed && s.injournal == 0)) {
sh->check_state = 0;
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
if (s.to_read+s.to_write+s.written)
handle_failed_stripe(conf, sh, &s, disks);
if (s.syncing + s.replacing)
handle_failed_sync(conf, sh, &s);
}
/* Now we check to see if any write operations have recently
* completed
*/
prexor = 0;
if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
prexor = 1;
if (sh->reconstruct_state == reconstruct_state_drain_result ||
sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
sh->reconstruct_state = reconstruct_state_idle;
/* All the 'written' buffers and the parity block are ready to
* be written back to disk
*/
BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
!test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
BUG_ON(sh->qd_idx >= 0 &&
!test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
!test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_LOCKED, &dev->flags) &&
(i == sh->pd_idx || i == sh->qd_idx ||
dev->written || test_bit(R5_InJournal,
&dev->flags))) {
pr_debug("Writing block %d\n", i);
set_bit(R5_Wantwrite, &dev->flags);
if (prexor)
continue;
if (s.failed > 1)
continue;
if (!test_bit(R5_Insync, &dev->flags) ||
((i == sh->pd_idx || i == sh->qd_idx) &&
s.failed == 0))
set_bit(STRIPE_INSYNC, &sh->state);
}
}
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
s.dec_preread_active = 1;
}
/*
* might be able to return some write requests if the parity blocks
* are safe, or on a failed drive
*/
pdev = &sh->dev[sh->pd_idx];
s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
qdev = &sh->dev[sh->qd_idx];
s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
|| conf->level < 6;
if (s.written &&
(s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
&& !test_bit(R5_LOCKED, &pdev->flags)
&& (test_bit(R5_UPTODATE, &pdev->flags) ||
test_bit(R5_Discard, &pdev->flags))))) &&
(s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
&& !test_bit(R5_LOCKED, &qdev->flags)
&& (test_bit(R5_UPTODATE, &qdev->flags) ||
test_bit(R5_Discard, &qdev->flags))))))
handle_stripe_clean_event(conf, sh, disks);
if (s.just_cached)
r5c_handle_cached_data_endio(conf, sh, disks);
log_stripe_write_finished(sh);
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
* or to load a block that is being partially written.
*/
if (s.to_read || s.non_overwrite
|| (s.to_write && s.failed)
|| (s.syncing && (s.uptodate + s.compute < disks))
|| s.replacing
|| s.expanding)
handle_stripe_fill(sh, &s, disks);
/*
* When the stripe finishes full journal write cycle (write to journal
* and raid disk), this is the clean up procedure so it is ready for
* next operation.
*/
r5c_finish_stripe_write_out(conf, sh, &s);
/*
* Now to consider new write requests, cache write back and what else,
* if anything should be read. We do not handle new writes when:
* 1/ A 'write' operation (copy+xor) is already in flight.
* 2/ A 'check' operation is in flight, as it may clobber the parity
* block.
* 3/ A r5c cache log write is in flight.
*/
if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
if (!r5c_is_writeback(conf->log)) {
if (s.to_write)
handle_stripe_dirtying(conf, sh, &s, disks);
} else { /* write back cache */
int ret = 0;
/* First, try handle writes in caching phase */
if (s.to_write)
ret = r5c_try_caching_write(conf, sh, &s,
disks);
/*
* If caching phase failed: ret == -EAGAIN
* OR
* stripe under reclaim: !caching && injournal
*
* fall back to handle_stripe_dirtying()
*/
if (ret == -EAGAIN ||
/* stripe under reclaim: !caching && injournal */
(!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
s.injournal > 0)) {
ret = handle_stripe_dirtying(conf, sh, &s,
disks);
if (ret == -EAGAIN)
goto finish;
}
}
}
/* maybe we need to check and possibly fix the parity for this stripe
* Any reads will already have been scheduled, so we just see if enough
* data is available. The parity check is held off while parity
* dependent operations are in flight.
*/
if (sh->check_state ||
(s.syncing && s.locked == 0 &&
!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
!test_bit(STRIPE_INSYNC, &sh->state))) {
if (conf->level == 6)
handle_parity_checks6(conf, sh, &s, disks);
else
handle_parity_checks5(conf, sh, &s, disks);
}
if ((s.replacing || s.syncing) && s.locked == 0
&& !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
&& !test_bit(STRIPE_REPLACED, &sh->state)) {
/* Write out to replacement devices where possible */
for (i = 0; i < conf->raid_disks; i++)
if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
set_bit(R5_WantReplace, &sh->dev[i].flags);
set_bit(R5_LOCKED, &sh->dev[i].flags);
s.locked++;
}
if (s.replacing)
set_bit(STRIPE_INSYNC, &sh->state);
set_bit(STRIPE_REPLACED, &sh->state);
}
if ((s.syncing || s.replacing) && s.locked == 0 &&
!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
test_bit(STRIPE_INSYNC, &sh->state)) {
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
wake_up(&conf->wait_for_overlap);
}
/* If the failed drives are just a ReadError, then we might need
* to progress the repair/check process
*/
if (s.failed <= conf->max_degraded && !conf->mddev->ro)
for (i = 0; i < s.failed; i++) {
struct r5dev *dev = &sh->dev[s.failed_num[i]];
if (test_bit(R5_ReadError, &dev->flags)
&& !test_bit(R5_LOCKED, &dev->flags)
&& test_bit(R5_UPTODATE, &dev->flags)
) {
if (!test_bit(R5_ReWrite, &dev->flags)) {
set_bit(R5_Wantwrite, &dev->flags);
set_bit(R5_ReWrite, &dev->flags);
} else
/* let's read it back */
set_bit(R5_Wantread, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
s.locked++;
}
}
/* Finish reconstruct operations initiated by the expansion process */
if (sh->reconstruct_state == reconstruct_state_result) {
struct stripe_head *sh_src
= raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
/* sh cannot be written until sh_src has been read.
* so arrange for sh to be delayed a little
*/
set_bit(STRIPE_DELAYED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
&sh_src->state))
atomic_inc(&conf->preread_active_stripes);
raid5_release_stripe(sh_src);
goto finish;
}
if (sh_src)
raid5_release_stripe(sh_src);
sh->reconstruct_state = reconstruct_state_idle;
clear_bit(STRIPE_EXPANDING, &sh->state);
for (i = conf->raid_disks; i--; ) {
set_bit(R5_Wantwrite, &sh->dev[i].flags);
set_bit(R5_LOCKED, &sh->dev[i].flags);
s.locked++;
}
}
if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
!sh->reconstruct_state) {
/* Need to write out all blocks after computing parity */
sh->disks = conf->raid_disks;
stripe_set_idx(sh->sector, conf, 0, sh);
schedule_reconstruction(sh, &s, 1, 1);
} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
}
if (s.expanding && s.locked == 0 &&
!test_bit(STRIPE_COMPUTE_RUN, &sh->state))
handle_stripe_expansion(conf, sh);
finish:
/* wait for this device to become unblocked */
if (unlikely(s.blocked_rdev)) {
if (conf->mddev->external)
md_wait_for_blocked_rdev(s.blocked_rdev,
conf->mddev);
else
/* Internal metadata will immediately
* be written by raid5d, so we don't
* need to wait here.
*/
rdev_dec_pending(s.blocked_rdev,
conf->mddev);
}
if (s.handle_bad_blocks)
for (i = disks; i--; ) {
struct md_rdev *rdev;
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
/* We own a safe reference to the rdev */
rdev = conf->disks[i].rdev;
if (!rdev_set_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
rdev = conf->disks[i].replacement;
if (!rdev)
/* rdev have been moved down */
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
}
if (s.ops_request)
raid_run_ops(sh, s.ops_request);
ops_run_io(sh, &s);
if (s.dec_preread_active) {
/* We delay this until after ops_run_io so that if make_request
* is waiting on a flush, it won't continue until the writes
* have actually been submitted.
*/
atomic_dec(&conf->preread_active_stripes);
if (atomic_read(&conf->preread_active_stripes) <
IO_THRESHOLD)
md_wakeup_thread(conf->mddev->thread);
}
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
}