static sector_t raid10_sync_request()

in raid10.c [3244:3865]


/*
 * raid10_sync_request() - prepare and submit one unit of resync, recovery
 * or reshape work starting at @sector_nr.
 *
 * @mddev:     array being synchronised; mddev->private is the r10conf.
 * @sector_nr: first sector to process.  For resync this is a virtual (array)
 *             offset; for recovery it is a device address that is translated
 *             with raid10_find_virt() per device.
 * @skipped:   set to 1 when the returned range was passed over without
 *             issuing any I/O.
 *
 * Three modes are handled:
 *  - MD_RECOVERY_RESHAPE: delegated to reshape_request().
 *  - MD_RECOVERY_SYNC (resync): one r10bio reading every usable copy, plus a
 *    write bio for each usable replacement.
 *  - otherwise (recovery): one r10bio per device that needs rebuilding or
 *    replacing, chained through a borrowed ->master_bio pointer.
 *
 * When nothing can be done for the current range the code jumps to 'giveup',
 * accounts the range in sectors_skipped, and retries from 'skipped' with the
 * next range.
 *
 * Return: number of sectors covered by this call (submitted plus skipped).
 * 0 is returned when init_resync() reports failure and after end_reshape()
 * once a reshape has reached the end of the device.
 */
static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
			     int *skipped)
{
	struct r10conf *conf = mddev->private;
	struct r10bio *r10_bio;
	struct bio *biolist = NULL, *bio;
	sector_t max_sector, nr_sectors;
	int i;
	int max_sync;
	sector_t sync_blocks;
	sector_t sectors_skipped = 0;
	int chunks_skipped = 0;
	sector_t chunk_mask = conf->geo.chunk_mask;
	int page_idx = 0;

	/*
	 * Allow skipping a full rebuild for incremental assembly
	 * of a clean array, like RAID1 does.
	 */
	if (mddev->bitmap == NULL &&
	    mddev->recovery_cp == MaxSector &&
	    mddev->reshape_position == MaxSector &&
	    !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    conf->fullsync == 0) {
		*skipped = 1;
		return mddev->dev_sectors - sector_nr;
	}

	/*
	 * Set up the resync buffers only once we know the request is not
	 * skipped wholesale.  The early return above never reaches
	 * close_sync(), so initialising before it would leave the pool set
	 * up with nothing on that path to tear it down, and a later sync
	 * would then reuse it because mempool_initialized() is already true.
	 * NOTE(review): init_resync() presumably sizes the pool from the
	 * replacement state at the time of the call - confirm there.
	 */
	if (!mempool_initialized(&conf->r10buf_pool))
		if (init_resync(conf))
			return 0;

 skipped:
	max_sector = mddev->dev_sectors;
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		max_sector = mddev->resync_max_sectors;
	if (sector_nr >= max_sector) {
		/* Reached (or were asked to start beyond) the end: wrap up. */
		conf->cluster_sync_low = 0;
		conf->cluster_sync_high = 0;

		/* If we aborted, we need to abort the
		 * sync on the 'current' bitmap chunks (there can
		 * be several when recovering multiple devices).
		 * as we may have started syncing it but not finished.
		 * We can find the current address in
		 * mddev->curr_resync, but for recovery,
		 * we need to convert that to several
		 * virtual addresses.
		 */
		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
			end_reshape(conf);
			close_sync(conf);
			return 0;
		}

		if (mddev->curr_resync < max_sector) { /* aborted */
			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
				md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
						   &sync_blocks, 1);
			else for (i = 0; i < conf->geo.raid_disks; i++) {
				sector_t sect =
					raid10_find_virt(conf, mddev->curr_resync, i);
				md_bitmap_end_sync(mddev->bitmap, sect,
						   &sync_blocks, 1);
			}
		} else {
			/* completed sync */
			if ((!mddev->bitmap || conf->fullsync)
			    && conf->have_replacement
			    && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
				/* Completed a full sync so the replacements
				 * are now fully recovered.
				 */
				rcu_read_lock();
				for (i = 0; i < conf->geo.raid_disks; i++) {
					struct md_rdev *rdev =
						rcu_dereference(conf->mirrors[i].replacement);
					if (rdev)
						rdev->recovery_offset = MaxSector;
				}
				rcu_read_unlock();
			}
			conf->fullsync = 0;
		}
		md_bitmap_close_sync(mddev->bitmap);
		close_sync(conf);
		*skipped = 1;
		return sectors_skipped;
	}

	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		return reshape_request(mddev, sector_nr, skipped);

	if (chunks_skipped >= conf->geo.raid_disks) {
		/* if there has been nothing to do on any drive,
		 * then there is nothing to do at all..
		 */
		*skipped = 1;
		return (max_sector - sector_nr) + sectors_skipped;
	}

	if (max_sector > mddev->resync_max)
		max_sector = mddev->resync_max; /* Don't do IO beyond here */

	/* make sure whole request will fit in a chunk - if chunks
	 * are meaningful
	 */
	if (conf->geo.near_copies < conf->geo.raid_disks &&
	    max_sector > (sector_nr | chunk_mask))
		max_sector = (sector_nr | chunk_mask) + 1;

	/*
	 * If there is non-resync activity waiting for a turn, then let it
	 * through before starting on this new sync request.
	 */
	if (conf->nr_waiting)
		schedule_timeout_uninterruptible(1);

	/* Again, very different code for resync and recovery.
	 * Both must result in an r10bio with a list of bios that
	 * have bi_end_io, bi_sector, bi_bdev set,
	 * and bi_private set to the r10bio.
	 * For recovery, we may actually create several r10bios
	 * with 2 bios in each, that correspond to the bios in the main one.
	 * In this case, the subordinate r10bios link back through a
	 * borrowed master_bio pointer, and the counter in the master
	 * includes a ref from each subordinate.
	 */
	/* First, we decide what to do and set ->bi_end_io
	 * To end_sync_read if we want to read, and
	 * end_sync_write if we will want to write.
	 */

	/* Upper bound on one request: RESYNC_PAGES pages, in 512-byte sectors. */
	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* recovery... the complicated one */
		int j;
		r10_bio = NULL;

		for (i = 0 ; i < conf->geo.raid_disks; i++) {
			int still_degraded;
			struct r10bio *rb2;
			sector_t sect;
			int must_sync;
			int any_working;
			int need_recover = 0;
			struct raid10_info *mirror = &conf->mirrors[i];
			struct md_rdev *mrdev, *mreplace;

			rcu_read_lock();
			mrdev = rcu_dereference(mirror->rdev);
			mreplace = rcu_dereference(mirror->replacement);

			if (mrdev != NULL &&
			    !test_bit(Faulty, &mrdev->flags) &&
			    !test_bit(In_sync, &mrdev->flags))
				need_recover = 1;
			/*
			 * Test Faulty exactly once and use mreplace itself as
			 * the "write to the replacement" indicator, so the
			 * decision and the pointer can never disagree: a
			 * non-NULL mreplace below always means "usable
			 * replacement that must be written".
			 */
			if (mreplace && test_bit(Faulty, &mreplace->flags))
				mreplace = NULL;

			if (!need_recover && !mreplace) {
				rcu_read_unlock();
				continue;
			}

			still_degraded = 0;
			/* want to reconstruct this device */
			rb2 = r10_bio;
			sect = raid10_find_virt(conf, sector_nr, i);
			if (sect >= mddev->resync_max_sectors) {
				/* last stripe is not complete - don't
				 * try to recover this sector.
				 */
				rcu_read_unlock();
				continue;
			}
			/* Unless we are doing a full sync, or a replacement
			 * we only need to recover the block if it is set in
			 * the bitmap
			 */
			must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
							 &sync_blocks, 1);
			if (sync_blocks < max_sync)
				max_sync = sync_blocks;
			if (!must_sync &&
			    mreplace == NULL &&
			    !conf->fullsync) {
				/* yep, skip the sync_blocks here, but don't assume
				 * that there will never be anything to do here
				 */
				chunks_skipped = -1;
				rcu_read_unlock();
				continue;
			}
			/*
			 * Pin both devices before leaving the RCU section.
			 * NOTE(review): mrdev is dereferenced unconditionally;
			 * this relies on a replacement never being present
			 * without an rdev in the same slot - confirm.
			 */
			atomic_inc(&mrdev->nr_pending);
			if (mreplace)
				atomic_inc(&mreplace->nr_pending);
			rcu_read_unlock();

			r10_bio = raid10_alloc_init_r10buf(conf);
			r10_bio->state = 0;
			raise_barrier(conf, rb2 != NULL);
			atomic_set(&r10_bio->remaining, 0);

			/* Chain to the previous r10bio via the borrowed pointer. */
			r10_bio->master_bio = (struct bio*)rb2;
			if (rb2)
				atomic_inc(&rb2->remaining);
			r10_bio->mddev = mddev;
			set_bit(R10BIO_IsRecover, &r10_bio->state);
			r10_bio->sector = sect;

			raid10_find_phys(conf, r10_bio);

			/* Need to check if the array will still be
			 * degraded
			 */
			rcu_read_lock();
			for (j = 0; j < conf->geo.raid_disks; j++) {
				struct md_rdev *rdev = rcu_dereference(
					conf->mirrors[j].rdev);
				if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
					still_degraded = 1;
					break;
				}
			}

			must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
							 &sync_blocks, still_degraded);

			/*
			 * Look for the first copy we can read from.  The loop
			 * is left via 'break' once a source has been set up,
			 * so j == conf->copies afterwards means none was found.
			 */
			any_working = 0;
			for (j=0; j<conf->copies;j++) {
				int k;
				int d = r10_bio->devs[j].devnum;
				sector_t from_addr, to_addr;
				struct md_rdev *rdev =
					rcu_dereference(conf->mirrors[d].rdev);
				sector_t sector, first_bad;
				int bad_sectors;
				if (!rdev ||
				    !test_bit(In_sync, &rdev->flags))
					continue;
				/* This is where we read from */
				any_working = 1;
				sector = r10_bio->devs[j].addr;

				if (is_badblock(rdev, sector, max_sync,
						&first_bad, &bad_sectors)) {
					if (first_bad > sector)
						max_sync = first_bad - sector;
					else {
						bad_sectors -= (sector
								- first_bad);
						if (max_sync > bad_sectors)
							max_sync = bad_sectors;
						continue;
					}
				}
				bio = r10_bio->devs[0].bio;
				bio->bi_next = biolist;
				biolist = bio;
				bio->bi_end_io = end_sync_read;
				bio_set_op_attrs(bio, REQ_OP_READ, 0);
				if (test_bit(FailFast, &rdev->flags))
					bio->bi_opf |= MD_FAILFAST;
				from_addr = r10_bio->devs[j].addr;
				bio->bi_iter.bi_sector = from_addr +
					rdev->data_offset;
				bio_set_dev(bio, rdev->bdev);
				atomic_inc(&rdev->nr_pending);
				/* and we write to 'i' (if not in_sync) */

				for (k=0; k<conf->copies; k++)
					if (r10_bio->devs[k].devnum == i)
						break;
				BUG_ON(k == conf->copies);
				to_addr = r10_bio->devs[k].addr;
				/* Slot 0 becomes the source, slot 1 the target. */
				r10_bio->devs[0].devnum = d;
				r10_bio->devs[0].addr = from_addr;
				r10_bio->devs[1].devnum = i;
				r10_bio->devs[1].addr = to_addr;

				if (need_recover) {
					bio = r10_bio->devs[1].bio;
					bio->bi_next = biolist;
					biolist = bio;
					bio->bi_end_io = end_sync_write;
					bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
					bio->bi_iter.bi_sector = to_addr
						+ mrdev->data_offset;
					bio_set_dev(bio, mrdev->bdev);
					atomic_inc(&r10_bio->remaining);
				} else
					r10_bio->devs[1].bio->bi_end_io = NULL;

				/* and maybe write to replacement */
				bio = r10_bio->devs[1].repl_bio;
				if (bio)
					bio->bi_end_io = NULL;
				/* Note: if mreplace is not NULL, then bio
				 * cannot be NULL as r10buf_pool_alloc will
				 * have allocated it.
				 */
				if (!mreplace)
					break;
				bio->bi_next = biolist;
				biolist = bio;
				bio->bi_end_io = end_sync_write;
				bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
				bio->bi_iter.bi_sector = to_addr +
					mreplace->data_offset;
				bio_set_dev(bio, mreplace->bdev);
				atomic_inc(&r10_bio->remaining);
				break;
			}
			rcu_read_unlock();
			if (j == conf->copies) {
				/* Cannot recover, so abort the recovery or
				 * record a bad block */
				if (any_working) {
					/* problem is that there are bad blocks
					 * on other device(s)
					 */
					int k;
					for (k = 0; k < conf->copies; k++)
						if (r10_bio->devs[k].devnum == i)
							break;
					if (!test_bit(In_sync,
						      &mrdev->flags)
					    && !rdev_set_badblocks(
						    mrdev,
						    r10_bio->devs[k].addr,
						    max_sync, 0))
						any_working = 0;
					if (mreplace &&
					    !rdev_set_badblocks(
						    mreplace,
						    r10_bio->devs[k].addr,
						    max_sync, 0))
						any_working = 0;
				}
				if (!any_working)  {
					if (!test_and_set_bit(MD_RECOVERY_INTR,
							      &mddev->recovery))
						pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
						       mdname(mddev));
					mirror->recovery_disabled
						= mddev->recovery_disabled;
				}
				/* Undo this r10bio and fall back to the previous one. */
				put_buf(r10_bio);
				if (rb2)
					atomic_dec(&rb2->remaining);
				r10_bio = rb2;
				rdev_dec_pending(mrdev, mddev);
				if (mreplace)
					rdev_dec_pending(mreplace, mddev);
				break;
			}
			rdev_dec_pending(mrdev, mddev);
			if (mreplace)
				rdev_dec_pending(mreplace, mddev);
			if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
				/* Only want this if there is elsewhere to
				 * read from. 'j' is currently the first
				 * readable copy.
				 */
				int targets = 1;
				for (; j < conf->copies; j++) {
					int d = r10_bio->devs[j].devnum;
					if (conf->mirrors[d].rdev &&
					    test_bit(In_sync,
						      &conf->mirrors[d].rdev->flags))
						targets++;
				}
				if (targets == 1)
					r10_bio->devs[0].bio->bi_opf
						&= ~MD_FAILFAST;
			}
		}
		if (biolist == NULL) {
			/* Nothing was queued: release the whole r10bio chain. */
			while (r10_bio) {
				struct r10bio *rb2 = r10_bio;
				r10_bio = (struct r10bio*) rb2->master_bio;
				rb2->master_bio = NULL;
				put_buf(rb2);
			}
			goto giveup;
		}
	} else {
		/* resync. Schedule a read for every block at this virt offset */
		int count = 0;

		/*
		 * Since curr_resync_completed could probably not update in
		 * time, and we will set cluster_sync_low based on it.
		 * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for
		 * safety reason, which ensures curr_resync_completed is
		 * updated in bitmap_cond_end_sync.
		 */
		md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
					mddev_is_clustered(mddev) &&
					(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));

		if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
					  &sync_blocks, mddev->degraded) &&
		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
						 &mddev->recovery)) {
			/* We can skip this block */
			*skipped = 1;
			return sync_blocks + sectors_skipped;
		}
		if (sync_blocks < max_sync)
			max_sync = sync_blocks;
		r10_bio = raid10_alloc_init_r10buf(conf);
		r10_bio->state = 0;

		r10_bio->mddev = mddev;
		atomic_set(&r10_bio->remaining, 0);
		raise_barrier(conf, 0);
		conf->next_resync = sector_nr;

		r10_bio->master_bio = NULL;
		r10_bio->sector = sector_nr;
		set_bit(R10BIO_IsSync, &r10_bio->state);
		raid10_find_phys(conf, r10_bio);
		r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;

		for (i = 0; i < conf->copies; i++) {
			int d = r10_bio->devs[i].devnum;
			sector_t first_bad, sector;
			int bad_sectors;
			struct md_rdev *rdev;

			if (r10_bio->devs[i].repl_bio)
				r10_bio->devs[i].repl_bio->bi_end_io = NULL;

			bio = r10_bio->devs[i].bio;
			/* Start as failed; cleared just before submission. */
			bio->bi_status = BLK_STS_IOERR;
			rcu_read_lock();
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
				rcu_read_unlock();
				continue;
			}
			sector = r10_bio->devs[i].addr;
			if (is_badblock(rdev, sector, max_sync,
					&first_bad, &bad_sectors)) {
				if (first_bad > sector)
					max_sync = first_bad - sector;
				else {
					bad_sectors -= (sector - first_bad);
					if (max_sync > bad_sectors)
						max_sync = bad_sectors;
					rcu_read_unlock();
					continue;
				}
			}
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&r10_bio->remaining);
			bio->bi_next = biolist;
			biolist = bio;
			bio->bi_end_io = end_sync_read;
			bio_set_op_attrs(bio, REQ_OP_READ, 0);
			if (test_bit(FailFast, &rdev->flags))
				bio->bi_opf |= MD_FAILFAST;
			bio->bi_iter.bi_sector = sector + rdev->data_offset;
			bio_set_dev(bio, rdev->bdev);
			count++;

			rdev = rcu_dereference(conf->mirrors[d].replacement);
			if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
				rcu_read_unlock();
				continue;
			}
			atomic_inc(&rdev->nr_pending);

			/* Need to set up for writing to the replacement */
			bio = r10_bio->devs[i].repl_bio;
			bio->bi_status = BLK_STS_IOERR;

			sector = r10_bio->devs[i].addr;
			bio->bi_next = biolist;
			biolist = bio;
			bio->bi_end_io = end_sync_write;
			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
			if (test_bit(FailFast, &rdev->flags))
				bio->bi_opf |= MD_FAILFAST;
			bio->bi_iter.bi_sector = sector + rdev->data_offset;
			bio_set_dev(bio, rdev->bdev);
			count++;
			rcu_read_unlock();
		}

		if (count < 2) {
			/*
			 * Fewer than two bios were set up, so there is nothing
			 * to sync against: drop the references taken above.
			 */
			for (i=0; i<conf->copies; i++) {
				int d = r10_bio->devs[i].devnum;
				if (r10_bio->devs[i].bio->bi_end_io)
					rdev_dec_pending(conf->mirrors[d].rdev,
							 mddev);
				if (r10_bio->devs[i].repl_bio &&
				    r10_bio->devs[i].repl_bio->bi_end_io)
					rdev_dec_pending(
						conf->mirrors[d].replacement,
						mddev);
			}
			put_buf(r10_bio);
			biolist = NULL;
			goto giveup;
		}
	}

	/* Attach up to RESYNC_PAGES pages to every bio on the list. */
	nr_sectors = 0;
	if (sector_nr + max_sync < max_sector)
		max_sector = sector_nr + max_sync;
	do {
		struct page *page;
		int len = PAGE_SIZE;
		if (sector_nr + (len>>9) > max_sector)
			len = (max_sector - sector_nr) << 9;
		if (len == 0)
			break;
		for (bio= biolist ; bio ; bio=bio->bi_next) {
			struct resync_pages *rp = get_resync_pages(bio);
			page = resync_fetch_page(rp, page_idx);
			/*
			 * won't fail because the vec table is big enough
			 * to hold all these pages
			 */
			bio_add_page(bio, page, len, 0);
		}
		nr_sectors += len>>9;
		sector_nr += len>>9;
	} while (++page_idx < RESYNC_PAGES);
	r10_bio->sectors = nr_sectors;

	if (mddev_is_clustered(mddev) &&
	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* It is resync not recovery */
		if (conf->cluster_sync_high < sector_nr + nr_sectors) {
			conf->cluster_sync_low = mddev->curr_resync_completed;
			raid10_set_cluster_sync_high(conf);
			/* Send resync message */
			md_cluster_ops->resync_info_update(mddev,
						conf->cluster_sync_low,
						conf->cluster_sync_high);
		}
	} else if (mddev_is_clustered(mddev)) {
		/* This is recovery not resync */
		sector_t sect_va1, sect_va2;
		bool broadcast_msg = false;

		for (i = 0; i < conf->geo.raid_disks; i++) {
			/*
			 * sector_nr is a device address for recovery, so we
			 * need translate it to array address before compare
			 * with cluster_sync_high.
			 */
			sect_va1 = raid10_find_virt(conf, sector_nr, i);

			if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
				broadcast_msg = true;
				/*
				 * curr_resync_completed is similar as
				 * sector_nr, so make the translation too.
				 */
				sect_va2 = raid10_find_virt(conf,
					mddev->curr_resync_completed, i);

				if (conf->cluster_sync_low == 0 ||
				    conf->cluster_sync_low > sect_va2)
					conf->cluster_sync_low = sect_va2;
			}
		}
		if (broadcast_msg) {
			raid10_set_cluster_sync_high(conf);
			md_cluster_ops->resync_info_update(mddev,
						conf->cluster_sync_low,
						conf->cluster_sync_high);
		}
	}

	/*
	 * Walk the list, fix up ->sectors on every owning r10bio and submit
	 * the reads.  Bios whose bi_end_io is not end_sync_read are only
	 * unlinked here, not submitted.
	 */
	while (biolist) {
		bio = biolist;
		biolist = biolist->bi_next;

		bio->bi_next = NULL;
		r10_bio = get_resync_r10bio(bio);
		r10_bio->sectors = nr_sectors;

		if (bio->bi_end_io == end_sync_read) {
			md_sync_acct_bio(bio, nr_sectors);
			bio->bi_status = 0;
			submit_bio_noacct(bio);
		}
	}

	if (sectors_skipped)
		/* pretend they weren't skipped, it makes
		 * no important difference in this case
		 */
		md_done_sync(mddev, sectors_skipped, 1);

	return sectors_skipped + nr_sectors;
 giveup:
	/* There is nowhere to write, so all non-sync
	 * drives must be failed or in resync, all drives
	 * have a bad block, so try the next chunk...
	 */
	if (sector_nr + max_sync < max_sector)
		max_sector = sector_nr + max_sync;

	sectors_skipped += (max_sector - sector_nr);
	chunks_skipped ++;
	sector_nr = max_sector;
	goto skipped;
}