in raid5.c [4078:4240]
static int handle_stripe_dirtying(struct r5conf *conf,
struct stripe_head *sh,
struct stripe_head_state *s,
int disks)
{
int rmw = 0, rcw = 0, i;
sector_t recovery_cp = conf->mddev->recovery_cp;
/* Check whether resync is now happening or should start.
* If yes, then the array is dirty (after unclean shutdown or
* initial creation), so parity in some stripes might be inconsistent.
* In this case, we need to always do reconstruct-write, to ensure
* that in case of drive failure or read-error correction, we
* generate correct data from the parity.
*/
if (conf->rmw_level == PARITY_DISABLE_RMW ||
(recovery_cp < MaxSector && sh->sector >= recovery_cp &&
s->failed == 0)) {
/* Calculate the real rcw later - for now make it
* look like rcw is cheaper
*/
rcw = 1; rmw = 2;
pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
conf->rmw_level, (unsigned long long)recovery_cp,
(unsigned long long)sh->sector);
} else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
i == sh->pd_idx || i == sh->qd_idx ||
test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(uptodate_for_rmw(dev) ||
test_bit(R5_Wantcompute, &dev->flags))) {
if (test_bit(R5_Insync, &dev->flags))
rmw++;
else
rmw += 2*disks; /* cannot read it */
}
/* Would I have to read this buffer for reconstruct_write */
if (!test_bit(R5_OVERWRITE, &dev->flags) &&
i != sh->pd_idx && i != sh->qd_idx &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
if (test_bit(R5_Insync, &dev->flags))
rcw++;
else
rcw += 2*disks;
}
}
pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
(unsigned long long)sh->sector, sh->state, rmw, rcw);
set_bit(STRIPE_HANDLE, &sh->state);
if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
if (conf->mddev->queue)
blk_add_trace_msg(conf->mddev->queue,
"raid5 rmw %llu %d",
(unsigned long long)sh->sector, rmw);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_InJournal, &dev->flags) &&
dev->page == dev->orig_page &&
!test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
/* alloc page for prexor */
struct page *p = alloc_page(GFP_NOIO);
if (p) {
dev->orig_page = p;
continue;
}
/*
* alloc_page() failed, try use
* disk_info->extra_page
*/
if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
&conf->cache_state)) {
r5c_use_extra_page(sh);
break;
}
/* extra_page in use, add to delayed_list */
set_bit(STRIPE_DELAYED, &sh->state);
s->waiting_extra_page = 1;
return -EAGAIN;
}
}
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
i == sh->pd_idx || i == sh->qd_idx ||
test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(uptodate_for_rmw(dev) ||
test_bit(R5_Wantcompute, &dev->flags)) &&
test_bit(R5_Insync, &dev->flags)) {
if (test_bit(STRIPE_PREREAD_ACTIVE,
&sh->state)) {
pr_debug("Read_old block %d for r-m-w\n",
i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
s->locked++;
} else
set_bit(STRIPE_DELAYED, &sh->state);
}
}
}
if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
/* want reconstruct write, but need to get some data */
int qread =0;
rcw = 0;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (!test_bit(R5_OVERWRITE, &dev->flags) &&
i != sh->pd_idx && i != sh->qd_idx &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
rcw++;
if (test_bit(R5_Insync, &dev->flags) &&
test_bit(STRIPE_PREREAD_ACTIVE,
&sh->state)) {
pr_debug("Read_old block "
"%d for Reconstruct\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
s->locked++;
qread++;
} else
set_bit(STRIPE_DELAYED, &sh->state);
}
}
if (rcw && conf->mddev->queue)
blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
(unsigned long long)sh->sector,
rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
}
if (rcw > disks && rmw > disks &&
!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
set_bit(STRIPE_DELAYED, &sh->state);
/* now if nothing is locked, and if we have enough data,
* we can start a write request
*/
/* since handle_stripe can be called at any time we need to handle the
* case where a compute block operation has been submitted and then a
* subsequent call wants to start a write request. raid_run_ops only
* handles the case where compute block and reconstruct are requested
* simultaneously. If this is not the case then new writes need to be
* held off until the compute completes.
*/
if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
(s->locked == 0 && (rcw == 0 || rmw == 0) &&
!test_bit(STRIPE_BIT_DELAY, &sh->state)))
schedule_reconstruction(sh, s, rcw == 0, 0);
return 0;
}