static void raid1_write_request()

in raid1.c [1343:1603]


static void raid1_write_request(struct mddev *mddev, struct bio *bio,
				int max_write_sectors)
{
	struct r1conf *conf = mddev->private;
	struct r1bio *r1_bio;
	int i, disks;
	struct bitmap *bitmap = mddev->bitmap;
	unsigned long flags;
	struct md_rdev *blocked_rdev;
	struct blk_plug_cb *cb;
	struct raid1_plug_cb *plug = NULL;
	int first_clone;
	int max_sectors;
	bool write_behind = false;

	if (mddev_is_clustered(mddev) &&
	     md_cluster_ops->area_resyncing(mddev, WRITE,
		     bio->bi_iter.bi_sector, bio_end_sector(bio))) {

		DEFINE_WAIT(w);
		if (bio->bi_opf & REQ_NOWAIT) {
			bio_wouldblock_error(bio);
			return;
		}
		for (;;) {
			prepare_to_wait(&conf->wait_barrier,
					&w, TASK_IDLE);
			if (!md_cluster_ops->area_resyncing(mddev, WRITE,
							bio->bi_iter.bi_sector,
							bio_end_sector(bio)))
				break;
			schedule();
		}
		finish_wait(&conf->wait_barrier, &w);
	}

	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a bar for new requests.
	 * Continue immediately if no resync is active currently.
	 */
	if (!wait_barrier(conf, bio->bi_iter.bi_sector,
				bio->bi_opf & REQ_NOWAIT)) {
		bio_wouldblock_error(bio);
		return;
	}

	r1_bio = alloc_r1bio(mddev, bio);
	r1_bio->sectors = max_write_sectors;

	/* first select target devices under rcu_lock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio
	 * If there are known/acknowledged bad blocks on any device on
	 * which we have seen a write error, we want to avoid writing those
	 * blocks.
	 * This potentially requires several writes to write around
	 * the bad blocks.  Each set of writes gets it's own r1bio
	 * with a set of bios attached.
	 */

	disks = conf->raid_disks * 2;
 retry_write:
	blocked_rdev = NULL;
	rcu_read_lock();
	max_sectors = r1_bio->sectors;
	for (i = 0;  i < disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);

		/*
		 * The write-behind io is only attempted on drives marked as
		 * write-mostly, which means we could allocate write behind
		 * bio later.
		 */
		if (rdev && test_bit(WriteMostly, &rdev->flags))
			write_behind = true;

		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
			atomic_inc(&rdev->nr_pending);
			blocked_rdev = rdev;
			break;
		}
		r1_bio->bios[i] = NULL;
		if (!rdev || test_bit(Faulty, &rdev->flags)) {
			if (i < conf->raid_disks)
				set_bit(R1BIO_Degraded, &r1_bio->state);
			continue;
		}

		atomic_inc(&rdev->nr_pending);
		if (test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int is_bad;

			is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
					     &first_bad, &bad_sectors);
			if (is_bad < 0) {
				/* mustn't write here until the bad block is
				 * acknowledged*/
				set_bit(BlockedBadBlocks, &rdev->flags);
				blocked_rdev = rdev;
				break;
			}
			if (is_bad && first_bad <= r1_bio->sector) {
				/* Cannot write here at all */
				bad_sectors -= (r1_bio->sector - first_bad);
				if (bad_sectors < max_sectors)
					/* mustn't write more than bad_sectors
					 * to other devices yet
					 */
					max_sectors = bad_sectors;
				rdev_dec_pending(rdev, mddev);
				/* We don't set R1BIO_Degraded as that
				 * only applies if the disk is
				 * missing, so it might be re-added,
				 * and we want to know to recover this
				 * chunk.
				 * In this case the device is here,
				 * and the fact that this chunk is not
				 * in-sync is recorded in the bad
				 * block log
				 */
				continue;
			}
			if (is_bad) {
				int good_sectors = first_bad - r1_bio->sector;
				if (good_sectors < max_sectors)
					max_sectors = good_sectors;
			}
		}
		r1_bio->bios[i] = bio;
	}
	rcu_read_unlock();

	if (unlikely(blocked_rdev)) {
		/* Wait for this device to become unblocked */
		int j;

		for (j = 0; j < i; j++)
			if (r1_bio->bios[j])
				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
		r1_bio->state = 0;
		allow_barrier(conf, bio->bi_iter.bi_sector);

		if (bio->bi_opf & REQ_NOWAIT) {
			bio_wouldblock_error(bio);
			return;
		}
		raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
		md_wait_for_blocked_rdev(blocked_rdev, mddev);
		wait_barrier(conf, bio->bi_iter.bi_sector, false);
		goto retry_write;
	}

	/*
	 * When using a bitmap, we may call alloc_behind_master_bio below.
	 * alloc_behind_master_bio allocates a copy of the data payload a page
	 * at a time and thus needs a new bio that can fit the whole payload
	 * this bio in page sized chunks.
	 */
	if (write_behind && bitmap)
		max_sectors = min_t(int, max_sectors,
				    BIO_MAX_VECS * (PAGE_SIZE >> 9));
	if (max_sectors < bio_sectors(bio)) {
		struct bio *split = bio_split(bio, max_sectors,
					      GFP_NOIO, &conf->bio_split);
		bio_chain(split, bio);
		submit_bio_noacct(bio);
		bio = split;
		r1_bio->master_bio = bio;
		r1_bio->sectors = max_sectors;
	}

	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
		r1_bio->start_time = bio_start_io_acct(bio);
	atomic_set(&r1_bio->remaining, 1);
	atomic_set(&r1_bio->behind_remaining, 0);

	first_clone = 1;

	for (i = 0; i < disks; i++) {
		struct bio *mbio = NULL;
		struct md_rdev *rdev = conf->mirrors[i].rdev;
		if (!r1_bio->bios[i])
			continue;

		if (first_clone) {
			/* do behind I/O ?
			 * Not if there are too many, or cannot
			 * allocate memory, or a reader on WriteMostly
			 * is waiting for behind writes to flush */
			if (bitmap &&
			    test_bit(WriteMostly, &rdev->flags) &&
			    (atomic_read(&bitmap->behind_writes)
			     < mddev->bitmap_info.max_write_behind) &&
			    !waitqueue_active(&bitmap->behind_wait)) {
				alloc_behind_master_bio(r1_bio, bio);
			}

			md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors,
					     test_bit(R1BIO_BehindIO, &r1_bio->state));
			first_clone = 0;
		}

		if (r1_bio->behind_master_bio)
			mbio = bio_clone_fast(r1_bio->behind_master_bio,
					      GFP_NOIO, &mddev->bio_set);
		else
			mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);

		if (r1_bio->behind_master_bio) {
			if (test_bit(CollisionCheck, &rdev->flags))
				wait_for_serialization(rdev, r1_bio);
			if (test_bit(WriteMostly, &rdev->flags))
				atomic_inc(&r1_bio->behind_remaining);
		} else if (mddev->serialize_policy)
			wait_for_serialization(rdev, r1_bio);

		r1_bio->bios[i] = mbio;

		mbio->bi_iter.bi_sector	= (r1_bio->sector + rdev->data_offset);
		bio_set_dev(mbio, rdev->bdev);
		mbio->bi_end_io	= raid1_end_write_request;
		mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
		if (test_bit(FailFast, &rdev->flags) &&
		    !test_bit(WriteMostly, &rdev->flags) &&
		    conf->raid_disks - mddev->degraded > 1)
			mbio->bi_opf |= MD_FAILFAST;
		mbio->bi_private = r1_bio;

		atomic_inc(&r1_bio->remaining);

		if (mddev->gendisk)
			trace_block_bio_remap(mbio, disk_devt(mddev->gendisk),
					      r1_bio->sector);
		/* flush_pending_writes() needs access to the rdev so...*/
		mbio->bi_bdev = (void *)rdev;

		cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
		if (cb)
			plug = container_of(cb, struct raid1_plug_cb, cb);
		else
			plug = NULL;
		if (plug) {
			bio_list_add(&plug->pending, mbio);
			plug->pending_cnt++;
		} else {
			spin_lock_irqsave(&conf->device_lock, flags);
			bio_list_add(&conf->pending_bio_list, mbio);
			conf->pending_count++;
			spin_unlock_irqrestore(&conf->device_lock, flags);
			md_wakeup_thread(mddev->thread);
		}
	}

	r1_bio_write_done(r1_bio);

	/* In case raid1d snuck in to freeze_array */
	wake_up(&conf->wait_barrier);
}