in raid10.c [3244:3865]
static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
int *skipped)
{
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
struct bio *biolist = NULL, *bio;
sector_t max_sector, nr_sectors;
int i;
int max_sync;
sector_t sync_blocks;
sector_t sectors_skipped = 0;
int chunks_skipped = 0;
sector_t chunk_mask = conf->geo.chunk_mask;
int page_idx = 0;
if (!mempool_initialized(&conf->r10buf_pool))
if (init_resync(conf))
return 0;
/*
* Allow skipping a full rebuild for incremental assembly
* of a clean array, like RAID1 does.
*/
if (mddev->bitmap == NULL &&
mddev->recovery_cp == MaxSector &&
mddev->reshape_position == MaxSector &&
!test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
conf->fullsync == 0) {
*skipped = 1;
return mddev->dev_sectors - sector_nr;
}
skipped:
max_sector = mddev->dev_sectors;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) {
conf->cluster_sync_low = 0;
conf->cluster_sync_high = 0;
/* If we aborted, we need to abort the
* sync on the 'current' bitmap chucks (there can
* be several when recovering multiple devices).
* as we may have started syncing it but not finished.
* We can find the current address in
* mddev->curr_resync, but for recovery,
* we need to convert that to several
* virtual addresses.
*/
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
end_reshape(conf);
close_sync(conf);
return 0;
}
if (mddev->curr_resync < max_sector) { /* aborted */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
&sync_blocks, 1);
else for (i = 0; i < conf->geo.raid_disks; i++) {
sector_t sect =
raid10_find_virt(conf, mddev->curr_resync, i);
md_bitmap_end_sync(mddev->bitmap, sect,
&sync_blocks, 1);
}
} else {
/* completed sync */
if ((!mddev->bitmap || conf->fullsync)
&& conf->have_replacement
&& test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/* Completed a full sync so the replacements
* are now fully recovered.
*/
rcu_read_lock();
for (i = 0; i < conf->geo.raid_disks; i++) {
struct md_rdev *rdev =
rcu_dereference(conf->mirrors[i].replacement);
if (rdev)
rdev->recovery_offset = MaxSector;
}
rcu_read_unlock();
}
conf->fullsync = 0;
}
md_bitmap_close_sync(mddev->bitmap);
close_sync(conf);
*skipped = 1;
return sectors_skipped;
}
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
if (chunks_skipped >= conf->geo.raid_disks) {
/* if there has been nothing to do on any drive,
* then there is nothing to do at all..
*/
*skipped = 1;
return (max_sector - sector_nr) + sectors_skipped;
}
if (max_sector > mddev->resync_max)
max_sector = mddev->resync_max; /* Don't do IO beyond here */
/* make sure whole request will fit in a chunk - if chunks
* are meaningful
*/
if (conf->geo.near_copies < conf->geo.raid_disks &&
max_sector > (sector_nr | chunk_mask))
max_sector = (sector_nr | chunk_mask) + 1;
/*
* If there is non-resync activity waiting for a turn, then let it
* though before starting on this new sync request.
*/
if (conf->nr_waiting)
schedule_timeout_uninterruptible(1);
/* Again, very different code for resync and recovery.
* Both must result in an r10bio with a list of bios that
* have bi_end_io, bi_sector, bi_bdev set,
* and bi_private set to the r10bio.
* For recovery, we may actually create several r10bios
* with 2 bios in each, that correspond to the bios in the main one.
* In this case, the subordinate r10bios link back through a
* borrowed master_bio pointer, and the counter in the master
* includes a ref from each subordinate.
*/
/* First, we decide what to do and set ->bi_end_io
* To end_sync_read if we want to read, and
* end_sync_write if we will want to write.
*/
max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/* recovery... the complicated one */
int j;
r10_bio = NULL;
for (i = 0 ; i < conf->geo.raid_disks; i++) {
int still_degraded;
struct r10bio *rb2;
sector_t sect;
int must_sync;
int any_working;
int need_recover = 0;
int need_replace = 0;
struct raid10_info *mirror = &conf->mirrors[i];
struct md_rdev *mrdev, *mreplace;
rcu_read_lock();
mrdev = rcu_dereference(mirror->rdev);
mreplace = rcu_dereference(mirror->replacement);
if (mrdev != NULL &&
!test_bit(Faulty, &mrdev->flags) &&
!test_bit(In_sync, &mrdev->flags))
need_recover = 1;
if (mreplace != NULL &&
!test_bit(Faulty, &mreplace->flags))
need_replace = 1;
if (!need_recover && !need_replace) {
rcu_read_unlock();
continue;
}
still_degraded = 0;
/* want to reconstruct this device */
rb2 = r10_bio;
sect = raid10_find_virt(conf, sector_nr, i);
if (sect >= mddev->resync_max_sectors) {
/* last stripe is not complete - don't
* try to recover this sector.
*/
rcu_read_unlock();
continue;
}
if (mreplace && test_bit(Faulty, &mreplace->flags))
mreplace = NULL;
/* Unless we are doing a full sync, or a replacement
* we only need to recover the block if it is set in
* the bitmap
*/
must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
&sync_blocks, 1);
if (sync_blocks < max_sync)
max_sync = sync_blocks;
if (!must_sync &&
mreplace == NULL &&
!conf->fullsync) {
/* yep, skip the sync_blocks here, but don't assume
* that there will never be anything to do here
*/
chunks_skipped = -1;
rcu_read_unlock();
continue;
}
atomic_inc(&mrdev->nr_pending);
if (mreplace)
atomic_inc(&mreplace->nr_pending);
rcu_read_unlock();
r10_bio = raid10_alloc_init_r10buf(conf);
r10_bio->state = 0;
raise_barrier(conf, rb2 != NULL);
atomic_set(&r10_bio->remaining, 0);
r10_bio->master_bio = (struct bio*)rb2;
if (rb2)
atomic_inc(&rb2->remaining);
r10_bio->mddev = mddev;
set_bit(R10BIO_IsRecover, &r10_bio->state);
r10_bio->sector = sect;
raid10_find_phys(conf, r10_bio);
/* Need to check if the array will still be
* degraded
*/
rcu_read_lock();
for (j = 0; j < conf->geo.raid_disks; j++) {
struct md_rdev *rdev = rcu_dereference(
conf->mirrors[j].rdev);
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
still_degraded = 1;
break;
}
}
must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
&sync_blocks, still_degraded);
any_working = 0;
for (j=0; j<conf->copies;j++) {
int k;
int d = r10_bio->devs[j].devnum;
sector_t from_addr, to_addr;
struct md_rdev *rdev =
rcu_dereference(conf->mirrors[d].rdev);
sector_t sector, first_bad;
int bad_sectors;
if (!rdev ||
!test_bit(In_sync, &rdev->flags))
continue;
/* This is where we read from */
any_working = 1;
sector = r10_bio->devs[j].addr;
if (is_badblock(rdev, sector, max_sync,
&first_bad, &bad_sectors)) {
if (first_bad > sector)
max_sync = first_bad - sector;
else {
bad_sectors -= (sector
- first_bad);
if (max_sync > bad_sectors)
max_sync = bad_sectors;
continue;
}
}
bio = r10_bio->devs[0].bio;
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_read;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
from_addr = r10_bio->devs[j].addr;
bio->bi_iter.bi_sector = from_addr +
rdev->data_offset;
bio_set_dev(bio, rdev->bdev);
atomic_inc(&rdev->nr_pending);
/* and we write to 'i' (if not in_sync) */
for (k=0; k<conf->copies; k++)
if (r10_bio->devs[k].devnum == i)
break;
BUG_ON(k == conf->copies);
to_addr = r10_bio->devs[k].addr;
r10_bio->devs[0].devnum = d;
r10_bio->devs[0].addr = from_addr;
r10_bio->devs[1].devnum = i;
r10_bio->devs[1].addr = to_addr;
if (need_recover) {
bio = r10_bio->devs[1].bio;
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_iter.bi_sector = to_addr
+ mrdev->data_offset;
bio_set_dev(bio, mrdev->bdev);
atomic_inc(&r10_bio->remaining);
} else
r10_bio->devs[1].bio->bi_end_io = NULL;
/* and maybe write to replacement */
bio = r10_bio->devs[1].repl_bio;
if (bio)
bio->bi_end_io = NULL;
/* Note: if need_replace, then bio
* cannot be NULL as r10buf_pool_alloc will
* have allocated it.
*/
if (!need_replace)
break;
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_iter.bi_sector = to_addr +
mreplace->data_offset;
bio_set_dev(bio, mreplace->bdev);
atomic_inc(&r10_bio->remaining);
break;
}
rcu_read_unlock();
if (j == conf->copies) {
/* Cannot recover, so abort the recovery or
* record a bad block */
if (any_working) {
/* problem is that there are bad blocks
* on other device(s)
*/
int k;
for (k = 0; k < conf->copies; k++)
if (r10_bio->devs[k].devnum == i)
break;
if (!test_bit(In_sync,
&mrdev->flags)
&& !rdev_set_badblocks(
mrdev,
r10_bio->devs[k].addr,
max_sync, 0))
any_working = 0;
if (mreplace &&
!rdev_set_badblocks(
mreplace,
r10_bio->devs[k].addr,
max_sync, 0))
any_working = 0;
}
if (!any_working) {
if (!test_and_set_bit(MD_RECOVERY_INTR,
&mddev->recovery))
pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
mdname(mddev));
mirror->recovery_disabled
= mddev->recovery_disabled;
}
put_buf(r10_bio);
if (rb2)
atomic_dec(&rb2->remaining);
r10_bio = rb2;
rdev_dec_pending(mrdev, mddev);
if (mreplace)
rdev_dec_pending(mreplace, mddev);
break;
}
rdev_dec_pending(mrdev, mddev);
if (mreplace)
rdev_dec_pending(mreplace, mddev);
if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
/* Only want this if there is elsewhere to
* read from. 'j' is currently the first
* readable copy.
*/
int targets = 1;
for (; j < conf->copies; j++) {
int d = r10_bio->devs[j].devnum;
if (conf->mirrors[d].rdev &&
test_bit(In_sync,
&conf->mirrors[d].rdev->flags))
targets++;
}
if (targets == 1)
r10_bio->devs[0].bio->bi_opf
&= ~MD_FAILFAST;
}
}
if (biolist == NULL) {
while (r10_bio) {
struct r10bio *rb2 = r10_bio;
r10_bio = (struct r10bio*) rb2->master_bio;
rb2->master_bio = NULL;
put_buf(rb2);
}
goto giveup;
}
} else {
/* resync. Schedule a read for every block at this virt offset */
int count = 0;
/*
* Since curr_resync_completed could probably not update in
* time, and we will set cluster_sync_low based on it.
* Let's check against "sector_nr + 2 * RESYNC_SECTORS" for
* safety reason, which ensures curr_resync_completed is
* updated in bitmap_cond_end_sync.
*/
md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
&mddev->recovery)) {
/* We can skip this block */
*skipped = 1;
return sync_blocks + sectors_skipped;
}
if (sync_blocks < max_sync)
max_sync = sync_blocks;
r10_bio = raid10_alloc_init_r10buf(conf);
r10_bio->state = 0;
r10_bio->mddev = mddev;
atomic_set(&r10_bio->remaining, 0);
raise_barrier(conf, 0);
conf->next_resync = sector_nr;
r10_bio->master_bio = NULL;
r10_bio->sector = sector_nr;
set_bit(R10BIO_IsSync, &r10_bio->state);
raid10_find_phys(conf, r10_bio);
r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
sector_t first_bad, sector;
int bad_sectors;
struct md_rdev *rdev;
if (r10_bio->devs[i].repl_bio)
r10_bio->devs[i].repl_bio->bi_end_io = NULL;
bio = r10_bio->devs[i].bio;
bio->bi_status = BLK_STS_IOERR;
rcu_read_lock();
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
rcu_read_unlock();
continue;
}
sector = r10_bio->devs[i].addr;
if (is_badblock(rdev, sector, max_sync,
&first_bad, &bad_sectors)) {
if (first_bad > sector)
max_sync = first_bad - sector;
else {
bad_sectors -= (sector - first_bad);
if (max_sync > bad_sectors)
max_sync = bad_sectors;
rcu_read_unlock();
continue;
}
}
atomic_inc(&rdev->nr_pending);
atomic_inc(&r10_bio->remaining);
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_read;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
bio->bi_iter.bi_sector = sector + rdev->data_offset;
bio_set_dev(bio, rdev->bdev);
count++;
rdev = rcu_dereference(conf->mirrors[d].replacement);
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
rcu_read_unlock();
continue;
}
atomic_inc(&rdev->nr_pending);
/* Need to set up for writing to the replacement */
bio = r10_bio->devs[i].repl_bio;
bio->bi_status = BLK_STS_IOERR;
sector = r10_bio->devs[i].addr;
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
bio->bi_iter.bi_sector = sector + rdev->data_offset;
bio_set_dev(bio, rdev->bdev);
count++;
rcu_read_unlock();
}
if (count < 2) {
for (i=0; i<conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
if (r10_bio->devs[i].bio->bi_end_io)
rdev_dec_pending(conf->mirrors[d].rdev,
mddev);
if (r10_bio->devs[i].repl_bio &&
r10_bio->devs[i].repl_bio->bi_end_io)
rdev_dec_pending(
conf->mirrors[d].replacement,
mddev);
}
put_buf(r10_bio);
biolist = NULL;
goto giveup;
}
}
nr_sectors = 0;
if (sector_nr + max_sync < max_sector)
max_sector = sector_nr + max_sync;
do {
struct page *page;
int len = PAGE_SIZE;
if (sector_nr + (len>>9) > max_sector)
len = (max_sector - sector_nr) << 9;
if (len == 0)
break;
for (bio= biolist ; bio ; bio=bio->bi_next) {
struct resync_pages *rp = get_resync_pages(bio);
page = resync_fetch_page(rp, page_idx);
/*
* won't fail because the vec table is big enough
* to hold all these pages
*/
bio_add_page(bio, page, len, 0);
}
nr_sectors += len>>9;
sector_nr += len>>9;
} while (++page_idx < RESYNC_PAGES);
r10_bio->sectors = nr_sectors;
if (mddev_is_clustered(mddev) &&
test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/* It is resync not recovery */
if (conf->cluster_sync_high < sector_nr + nr_sectors) {
conf->cluster_sync_low = mddev->curr_resync_completed;
raid10_set_cluster_sync_high(conf);
/* Send resync message */
md_cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
} else if (mddev_is_clustered(mddev)) {
/* This is recovery not resync */
sector_t sect_va1, sect_va2;
bool broadcast_msg = false;
for (i = 0; i < conf->geo.raid_disks; i++) {
/*
* sector_nr is a device address for recovery, so we
* need translate it to array address before compare
* with cluster_sync_high.
*/
sect_va1 = raid10_find_virt(conf, sector_nr, i);
if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
broadcast_msg = true;
/*
* curr_resync_completed is similar as
* sector_nr, so make the translation too.
*/
sect_va2 = raid10_find_virt(conf,
mddev->curr_resync_completed, i);
if (conf->cluster_sync_low == 0 ||
conf->cluster_sync_low > sect_va2)
conf->cluster_sync_low = sect_va2;
}
}
if (broadcast_msg) {
raid10_set_cluster_sync_high(conf);
md_cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
}
while (biolist) {
bio = biolist;
biolist = biolist->bi_next;
bio->bi_next = NULL;
r10_bio = get_resync_r10bio(bio);
r10_bio->sectors = nr_sectors;
if (bio->bi_end_io == end_sync_read) {
md_sync_acct_bio(bio, nr_sectors);
bio->bi_status = 0;
submit_bio_noacct(bio);
}
}
if (sectors_skipped)
/* pretend they weren't skipped, it makes
* no important difference in this case
*/
md_done_sync(mddev, sectors_skipped, 1);
return sectors_skipped + nr_sectors;
giveup:
/* There is nowhere to write, so all non-sync
* drives must be failed or in resync, all drives
* have a bad block, so try the next chunk...
*/
if (sector_nr + max_sync < max_sector)
max_sector = sector_nr + max_sync;
sectors_skipped += (max_sector - sector_nr);
chunks_skipped ++;
sector_nr = max_sector;
goto skipped;
}