in raid1.c [1343:1603]
/*
 * raid1_write_request() - issue one write to every usable mirror.
 * @mddev:             array being written; mddev->private is its r1conf.
 * @bio:               incoming write bio.  If it covers more sectors than can
 *                     be written in one go, it is split: the split-off piece
 *                     is handled here and the original bio is chained to it
 *                     and resubmitted with submit_bio_noacct().
 * @max_write_sectors: initial upper bound on the sectors covered by the r1bio.
 *
 * Outline:
 *   1. On a clustered array, wait until the target range is not being
 *      resynced.
 *   2. Pass the barrier (wait_barrier()).
 *   3. Under rcu_read_lock(), pick the mirrors to write to, taking an
 *      nr_pending reference on each, and shrink max_sectors around known
 *      bad blocks.  If a mirror is blocked, undo everything, wait for it and
 *      start the selection again.
 *   4. Split the bio if needed, then clone it once per selected mirror and
 *      queue the clones either on the current plug or on
 *      conf->pending_bio_list.
 *
 * A bio carrying REQ_NOWAIT is never made to sleep here: wherever the
 * function would wait, it ends the bio with bio_wouldblock_error() and
 * returns instead.
 */
static void raid1_write_request(struct mddev *mddev, struct bio *bio,
				int max_write_sectors)
{
	struct r1conf *conf = mddev->private;
	struct r1bio *r1_bio;
	int i, disks;
	struct bitmap *bitmap = mddev->bitmap;
	unsigned long flags;
	struct md_rdev *blocked_rdev;
	struct blk_plug_cb *cb;
	struct raid1_plug_cb *plug = NULL;
	int first_clone;
	int max_sectors;
	bool write_behind = false;

	if (mddev_is_clustered(mddev) &&
	    md_cluster_ops->area_resyncing(mddev, WRITE,
			bio->bi_iter.bi_sector, bio_end_sector(bio))) {
		DEFINE_WAIT(w);

		if (bio->bi_opf & REQ_NOWAIT) {
			bio_wouldblock_error(bio);
			return;
		}
		/* Sleep on wait_barrier until the range is no longer resyncing */
		for (;;) {
			prepare_to_wait(&conf->wait_barrier,
					&w, TASK_IDLE);
			if (!md_cluster_ops->area_resyncing(mddev, WRITE,
							bio->bi_iter.bi_sector,
							bio_end_sector(bio)))
				break;
			schedule();
		}
		finish_wait(&conf->wait_barrier, &w);
	}

	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a bar for new requests.
	 * Continue immediately if no resync is active currently.
	 */
	if (!wait_barrier(conf, bio->bi_iter.bi_sector,
			  bio->bi_opf & REQ_NOWAIT)) {
		bio_wouldblock_error(bio);
		return;
	}

	r1_bio = alloc_r1bio(mddev, bio);
	r1_bio->sectors = max_write_sectors;

	/* first select target devices under rcu_lock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio
	 * If there are known/acknowledged bad blocks on any device on
	 * which we have seen a write error, we want to avoid writing those
	 * blocks.
	 * This potentially requires several writes to write around
	 * the bad blocks.  Each set of writes gets its own r1bio
	 * with a set of bios attached.
	 */

	/* The mirrors[] array is scanned over twice raid_disks entries */
	disks = conf->raid_disks * 2;
retry_write:
	blocked_rdev = NULL;
	rcu_read_lock();
	max_sectors = r1_bio->sectors;
	for (i = 0; i < disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);

		/*
		 * The write-behind io is only attempted on drives marked as
		 * write-mostly, which means we could allocate write behind
		 * bio later.
		 */
		if (rdev && test_bit(WriteMostly, &rdev->flags))
			write_behind = true;

		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
			/* Hold the rdev while we wait for it below */
			atomic_inc(&rdev->nr_pending);
			blocked_rdev = rdev;
			break;
		}
		r1_bio->bios[i] = NULL;
		if (!rdev || test_bit(Faulty, &rdev->flags)) {
			if (i < conf->raid_disks)
				set_bit(R1BIO_Degraded, &r1_bio->state);
			continue;
		}

		atomic_inc(&rdev->nr_pending);
		if (test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int is_bad;

			is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
					     &first_bad, &bad_sectors);
			if (is_bad < 0) {
				/* mustn't write here until the bad block is
				 * acknowledged*/
				set_bit(BlockedBadBlocks, &rdev->flags);
				blocked_rdev = rdev;
				break;
			}
			if (is_bad && first_bad <= r1_bio->sector) {
				/* Cannot write here at all */
				bad_sectors -= (r1_bio->sector - first_bad);
				if (bad_sectors < max_sectors)
					/* mustn't write more than bad_sectors
					 * to other devices yet
					 */
					max_sectors = bad_sectors;
				rdev_dec_pending(rdev, mddev);
				/* We don't set R1BIO_Degraded as that
				 * only applies if the disk is
				 * missing, so it might be re-added,
				 * and we want to know to recover this
				 * chunk.
				 * In this case the device is here,
				 * and the fact that this chunk is not
				 * in-sync is recorded in the bad
				 * block log
				 */
				continue;
			}
			if (is_bad) {
				/* Only write up to the start of the bad range */
				int good_sectors = first_bad - r1_bio->sector;

				if (good_sectors < max_sectors)
					max_sectors = good_sectors;
			}
		}
		/* Mark this mirror as a write target; cloned further down */
		r1_bio->bios[i] = bio;
	}
	rcu_read_unlock();

	if (unlikely(blocked_rdev)) {
		/* Wait for this device to become unblocked */
		int j;

		/*
		 * Drop the references taken on the mirrors selected before
		 * the blocked one (index i itself is not included).
		 */
		for (j = 0; j < i; j++)
			if (r1_bio->bios[j])
				rdev_dec_pending(conf->mirrors[j].rdev, mddev);

		if (bio->bi_opf & REQ_NOWAIT) {
			/*
			 * We bail out instead of waiting, so nothing further
			 * down will release what this attempt still holds:
			 * the nr_pending reference taken on blocked_rdev in
			 * the selection loop (on the waiting path it is
			 * handed to md_wait_for_blocked_rdev()), and the
			 * r1bio, which is only reused by "goto retry_write".
			 * Release both before giving up.
			 *
			 * The r1bio goes straight back to its pool: bios[]
			 * still points at the caller's bio (or is not yet
			 * initialised past index i), so its entries must not
			 * be put.
			 * NOTE(review): assumes conf->r1bio_pool is the
			 * embedded mempool alloc_r1bio() allocates from.
			 */
			rdev_dec_pending(blocked_rdev, mddev);
			mempool_free(r1_bio, &conf->r1bio_pool);
			allow_barrier(conf, bio->bi_iter.bi_sector);
			bio_wouldblock_error(bio);
			return;
		}

		/* Reset per-attempt state (e.g. R1BIO_Degraded) before retrying */
		r1_bio->state = 0;
		allow_barrier(conf, bio->bi_iter.bi_sector);
		raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
		md_wait_for_blocked_rdev(blocked_rdev, mddev);
		wait_barrier(conf, bio->bi_iter.bi_sector, false);
		goto retry_write;
	}

	/*
	 * When using a bitmap, we may call alloc_behind_master_bio below.
	 * alloc_behind_master_bio allocates a copy of the data payload a page
	 * at a time and thus needs a new bio that can fit the whole payload
	 * of this bio in page sized chunks.
	 */
	if (write_behind && bitmap)
		max_sectors = min_t(int, max_sectors,
				    BIO_MAX_VECS * (PAGE_SIZE >> 9));
	if (max_sectors < bio_sectors(bio)) {
		struct bio *split = bio_split(bio, max_sectors,
					      GFP_NOIO, &conf->bio_split);

		/*
		 * Handle the split-off piece now; the original bio is
		 * chained to it and resubmitted.
		 */
		bio_chain(split, bio);
		submit_bio_noacct(bio);
		bio = split;
		r1_bio->master_bio = bio;
		r1_bio->sectors = max_sectors;
	}

	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
		r1_bio->start_time = bio_start_io_acct(bio);
	/*
	 * Start 'remaining' at 1 so it stays non-zero while clones are
	 * still being queued below.
	 */
	atomic_set(&r1_bio->remaining, 1);
	atomic_set(&r1_bio->behind_remaining, 0);

	first_clone = 1;

	for (i = 0; i < disks; i++) {
		struct bio *mbio = NULL;
		struct md_rdev *rdev = conf->mirrors[i].rdev;

		/* Only mirrors selected in the loop above have bios[i] set */
		if (!r1_bio->bios[i])
			continue;

		if (first_clone) {
			/* do behind I/O ?
			 * Not if there are too many, or cannot
			 * allocate memory, or a reader on WriteMostly
			 * is waiting for behind writes to flush */
			if (bitmap &&
			    test_bit(WriteMostly, &rdev->flags) &&
			    (atomic_read(&bitmap->behind_writes)
			     < mddev->bitmap_info.max_write_behind) &&
			    !waitqueue_active(&bitmap->behind_wait)) {
				alloc_behind_master_bio(r1_bio, bio);
			}

			md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors,
					     test_bit(R1BIO_BehindIO, &r1_bio->state));
			first_clone = 0;
		}

		/* Clone from the write-behind copy when one was allocated */
		if (r1_bio->behind_master_bio)
			mbio = bio_clone_fast(r1_bio->behind_master_bio,
					      GFP_NOIO, &mddev->bio_set);
		else
			mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);

		if (r1_bio->behind_master_bio) {
			if (test_bit(CollisionCheck, &rdev->flags))
				wait_for_serialization(rdev, r1_bio);
			if (test_bit(WriteMostly, &rdev->flags))
				atomic_inc(&r1_bio->behind_remaining);
		} else if (mddev->serialize_policy)
			wait_for_serialization(rdev, r1_bio);

		r1_bio->bios[i] = mbio;

		mbio->bi_iter.bi_sector	= (r1_bio->sector + rdev->data_offset);
		bio_set_dev(mbio, rdev->bdev);
		mbio->bi_end_io	= raid1_end_write_request;
		/* Carry over only the op plus the SYNC and FUA flags */
		mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
		if (test_bit(FailFast, &rdev->flags) &&
		    !test_bit(WriteMostly, &rdev->flags) &&
		    conf->raid_disks - mddev->degraded > 1)
			mbio->bi_opf |= MD_FAILFAST;
		mbio->bi_private = r1_bio;

		atomic_inc(&r1_bio->remaining);

		if (mddev->gendisk)
			trace_block_bio_remap(mbio, disk_devt(mddev->gendisk),
					      r1_bio->sector);
		/* flush_pending_writes() needs access to the rdev so...*/
		mbio->bi_bdev = (void *)rdev;

		/*
		 * Queue on the current plug when there is one; otherwise
		 * hand the clone to the md thread via pending_bio_list.
		 */
		cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
		if (cb)
			plug = container_of(cb, struct raid1_plug_cb, cb);
		else
			plug = NULL;
		if (plug) {
			bio_list_add(&plug->pending, mbio);
			plug->pending_cnt++;
		} else {
			spin_lock_irqsave(&conf->device_lock, flags);
			bio_list_add(&conf->pending_bio_list, mbio);
			conf->pending_count++;
			spin_unlock_irqrestore(&conf->device_lock, flags);
			md_wakeup_thread(mddev->thread);
		}
	}

	r1_bio_write_done(r1_bio);

	/* In case raid1d snuck in to freeze_array */
	wake_up(&conf->wait_barrier);
}