static void ops_run_io()

in raid5.c [1057:1303]


/*
 * ops_run_io - issue the per-device I/O that has been requested on a stripe.
 * @sh: stripe whose dev[].flags carry the R5_Want* requests; when
 *      sh->batch_head is set, every stripe linked on sh->batch_list gets the
 *      same operation issued for the same device index.
 * @s:  stripe state; only the syncing/expanding/expanded/replacing fields
 *      are read here, to decide whether md_sync_acct() is called.
 *
 * For each device index (highest first) the function:
 *   1. consumes one of R5_Wantwrite / R5_Wantread / R5_WantReplace from the
 *      head stripe to pick the operation, or skips the device if none is set;
 *   2. picks the target rdev and/or replacement rdev under rcu_read_lock()
 *      and pins each chosen device via nr_pending;
 *   3. for writes to a device with WriteErrorSeen, consults is_badblock()
 *      and either waits or drops the write;
 *   4. fills in the embedded bios (dev[i].req / dev[i].rreq) and either
 *      submits them or, for writes when deferral is enabled, queues them on
 *      a local list that is handed to defer_issue_bios() at the end;
 *   5. if no device could be used, unlocks the stripe device and flags the
 *      stripe for re-handling.
 *
 * Nothing is issued at all when log_stripe() returns 0.
 */
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;
	/* Remember the stripe we were called with; 'sh' is reused as a cursor
	 * over the batch list further down.
	 */
	struct stripe_head *head_sh = sh;
	/* Writes collected here are dispatched together after the loop. */
	struct bio_list pending_bios = BIO_EMPTY_LIST;
	bool should_defer;

	might_sleep();

	/* NOTE(review): a zero return presumably means the log/journal layer
	 * has taken over this stripe - confirm against log_stripe().
	 */
	if (log_stripe(sh, s) == 0)
		return;

	/* Deferral needs both the config knob and a non-zero group count. */
	should_defer = conf->batch_bio_dispatch && conf->group_cnt;

	/* Walk device indices from disks - 1 down to 0. */
	for (i = disks; i--; ) {
		int op, op_flags = 0;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;

		/* The Want* request flags are tested and cleared on the head
		 * stripe only; op/op_flags/replace_only are then reused for
		 * every stripe visited via the 'again' label.
		 */
		sh = head_sh;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			op = REQ_OP_WRITE;
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				op_flags = REQ_FUA;
			/* R5_Discard is only tested, not cleared, here. */
			if (test_bit(R5_Discard, &sh->dev[i].flags))
				op = REQ_OP_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			op = REQ_OP_READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			/* Write that must go to the replacement device only. */
			op = REQ_OP_WRITE;
			replace_only = 1;
		} else
			continue;
		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
			op_flags |= REQ_SYNC;

		/* Re-entered once per additional stripe in the batch. */
again:
		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */

		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev) {
			/* No primary device: treat the replacement (if any)
			 * as the only target.
			 */
			rdev = rrdev;
			rrdev = NULL;
		}
		if (op_is_write(op)) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			/* Reads go to exactly one device: the replacement only
			 * when R5_ReadRepl is set on the head stripe.
			 */
			if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		/* Drop Faulty devices; take an nr_pending reference on each
		 * device that will be used, before leaving the RCU section.
		 */
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
		 *
		 * The loop repeats until is_badblock() reports no bad block
		 * (write proceeds) or a positive result (write is dropped).
		 */
		while (op_is_write(op) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				/* NOTE(review): negative presumably means a bad
				 * block that is not yet acknowledged - confirm
				 * against is_badblock().
				 */
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->sb_flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance*/
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_set_dev(bi, rdev->bdev);
			bio_set_op_attrs(bi, op, op_flags);
			bi->bi_end_io = op_is_write(op)
				? raid5_end_write_request
				: raid5_end_read_request;
			bi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %d on disc %d\n",
				__func__, (unsigned long long)sh->sector,
				bi->bi_opf, i);
			/* One stripe reference per bio issued; a batch member
			 * also adds a reference to the head stripe.
			 */
			atomic_inc(&sh->count);
			if (sh != head_sh)
				atomic_inc(&head_sh->count);
			if (use_new_offset(conf, sh))
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->new_data_offset);
			else
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->data_offset);
			if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
				bi->bi_opf |= REQ_NOMERGE;

			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));

			if (!op_is_write(op) &&
			    test_bit(R5_InJournal, &sh->dev[i].flags))
				/*
				 * issuing read for a page in journal, this
				 * must be preparing for prexor in rmw; read
				 * the data into orig_page
				 */
				sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
			else
				sh->dev[i].vec.bv_page = sh->dev[i].page;
			/* NOTE(review): assumes bi->bi_io_vec refers to
			 * sh->dev[i].vec, so the page set above is the one
			 * described by bi_io_vec[0] - confirm at bio init.
			 */
			bi->bi_vcnt = 1;
			bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
			bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
			bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
			bi->bi_write_hint = sh->dev[i].write_hint;
			/* Keep the hint when a replacement bio follows; that
			 * branch copies it and resets it itself.
			 */
			if (!rrdev)
				sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
			/*
			 * If this is discard request, set bi_vcnt 0. We don't
			 * want to confuse SCSI because SCSI will replace payload
			 */
			if (op == REQ_OP_DISCARD)
				bi->bi_vcnt = 0;
			/* Both the device and its replacement are written. */
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);

			if (conf->mddev->gendisk)
				trace_block_bio_remap(bi,
						disk_devt(conf->mddev->gendisk),
						sh->dev[i].sector);
			/* Only writes are deferred; reads are always submitted
			 * immediately.
			 */
			if (should_defer && op_is_write(op))
				bio_list_add(&pending_bios, bi);
			else
				submit_bio_noacct(bi);
		}
		if (rrdev) {
			/* Same setup as above for the replacement bio; rrdev
			 * is only non-NULL for writes (see BUG_ON below).
			 */
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_set_dev(rbi, rrdev->bdev);
			bio_set_op_attrs(rbi, op, op_flags);
			BUG_ON(!op_is_write(op));
			rbi->bi_end_io = raid5_end_write_request;
			rbi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %d on "
				 "replacement disc %d\n",
				__func__, (unsigned long long)sh->sector,
				rbi->bi_opf, i);
			atomic_inc(&sh->count);
			if (sh != head_sh)
				atomic_inc(&head_sh->count);
			if (use_new_offset(conf, sh))
				rbi->bi_iter.bi_sector = (sh->sector
						  + rrdev->new_data_offset);
			else
				rbi->bi_iter.bi_sector = (sh->sector
						  + rrdev->data_offset);
			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
			sh->dev[i].rvec.bv_page = sh->dev[i].page;
			rbi->bi_vcnt = 1;
			rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
			rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
			rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
			rbi->bi_write_hint = sh->dev[i].write_hint;
			/* Last consumer of the hint for this device. */
			sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
			/*
			 * If this is discard request, set bi_vcnt 0. We don't
			 * want to confuse SCSI because SCSI will replace payload
			 */
			if (op == REQ_OP_DISCARD)
				rbi->bi_vcnt = 0;
			if (conf->mddev->gendisk)
				trace_block_bio_remap(rbi,
						disk_devt(conf->mddev->gendisk),
						sh->dev[i].sector);
			if (should_defer && op_is_write(op))
				bio_list_add(&pending_bios, rbi);
			else
				submit_bio_noacct(rbi);
		}
		if (!rdev && !rrdev) {
			/* Nothing was issued for this device: release its lock
			 * and ask for the stripe to be handled again.
			 */
			if (op_is_write(op))
				set_bit(STRIPE_DEGRADED, &sh->state);
			/* NOTE(review): bi->bi_opf is not assigned on this
			 * path, so the value logged is whatever the bio held
			 * before this call, not necessarily 'op'.
			 */
			pr_debug("skip op %d on disc %d for sector %llu\n",
				bi->bi_opf, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}

		/* Not batched: done with this device index. */
		if (!head_sh->batch_head)
			continue;
		/* Advance to the next stripe on the batch list and repeat the
		 * same operation for it, until the walk returns to the head.
		 */
		sh = list_first_entry(&sh->batch_list, struct stripe_head,
				      batch_list);
		if (sh != head_sh)
			goto again;
	}

	/* Hand all deferred writes over in one go. */
	if (should_defer && !bio_list_empty(&pending_bios))
		defer_issue_bios(conf, head_sh->sector, &pending_bios);
}