in md.c [8693:9109]
void md_do_sync(struct md_thread *thread)
{
struct mddev *mddev = thread->mddev;
struct mddev *mddev2;
unsigned int currspeed = 0, window;
sector_t max_sectors,j, io_sectors, recovery_done;
unsigned long mark[SYNC_MARKS];
unsigned long update_time;
sector_t mark_cnt[SYNC_MARKS];
int last_mark,m;
struct list_head *tmp;
sector_t last_check;
int skipped = 0;
struct md_rdev *rdev;
char *desc, *action = NULL;
struct blk_plug plug;
int ret;
/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
return;
if (mddev->ro) {/* never try to sync a read-only array */
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return;
}
if (mddev_is_clustered(mddev)) {
ret = md_cluster_ops->resync_start(mddev);
if (ret)
goto skip;
set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
&& ((unsigned long long)mddev->curr_resync_completed
< (unsigned long long)mddev->resync_max_sectors))
goto skip;
}
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
desc = "data-check";
action = "check";
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
desc = "requested-resync";
action = "repair";
} else
desc = "resync";
} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
desc = "reshape";
else
desc = "recovery";
mddev->last_sync_action = action ?: desc;
/* we overload curr_resync somewhat here.
* 0 == not engaged in resync at all
* 2 == checking that there is no conflict with another sync
* 1 == like 2, but have yielded to allow conflicting resync to
* commence
* other == active in resync - this many blocks
*
* Before starting a resync we must have set curr_resync to
* 2, and then checked that every "conflicting" array has curr_resync
* less than ours. When we find one that is the same or higher
* we wait on resync_wait. To avoid deadlock, we reduce curr_resync
* to 1 if we choose to yield (based arbitrarily on address of mddev structure).
* This will mean we have to start checking from the beginning again.
*
*/
do {
int mddev2_minor = -1;
mddev->curr_resync = 2;
try_again:
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
goto skip;
for_each_mddev(mddev2, tmp) {
if (mddev2 == mddev)
continue;
if (!mddev->parallel_resync
&& mddev2->curr_resync
&& match_mddev_units(mddev, mddev2)) {
DEFINE_WAIT(wq);
if (mddev < mddev2 && mddev->curr_resync == 2) {
/* arbitrarily yield */
mddev->curr_resync = 1;
wake_up(&resync_wait);
}
if (mddev > mddev2 && mddev->curr_resync == 1)
/* no need to wait here, we can wait the next
* time 'round when curr_resync == 2
*/
continue;
/* We need to wait 'interruptible' so as not to
* contribute to the load average, and not to
* be caught by 'softlockup'
*/
prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev2->curr_resync >= mddev->curr_resync) {
if (mddev2_minor != mddev2->md_minor) {
mddev2_minor = mddev2->md_minor;
pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
desc, mdname(mddev),
mdname(mddev2));
}
mddev_put(mddev2);
if (signal_pending(current))
flush_signals(current);
schedule();
finish_wait(&resync_wait, &wq);
goto try_again;
}
finish_wait(&resync_wait, &wq);
}
}
} while (mddev->curr_resync < 2);
j = 0;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/* resync follows the size requested by the personality,
* which defaults to physical size, but can be virtual size
*/
max_sectors = mddev->resync_max_sectors;
atomic64_set(&mddev->resync_mismatches, 0);
/* we don't use the checkpoint if there's a bitmap */
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
j = mddev->resync_min;
else if (!mddev->bitmap)
j = mddev->recovery_cp;
} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
max_sectors = mddev->resync_max_sectors;
/*
* If the original node aborts reshaping then we continue the
* reshaping, so set j again to avoid restart reshape from the
* first beginning
*/
if (mddev_is_clustered(mddev) &&
mddev->reshape_position != MaxSector)
j = mddev->reshape_position;
} else {
/* recovery follows the physical size of devices */
max_sectors = mddev->dev_sectors;
j = MaxSector;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < j)
j = rdev->recovery_offset;
rcu_read_unlock();
/* If there is a bitmap, we need to make sure all
* writes that started before we added a spare
* complete before we start doing a recovery.
* Otherwise the write might complete and (via
* bitmap_endwrite) set a bit in the bitmap after the
* recovery has checked that bit and skipped that
* region.
*/
if (mddev->bitmap) {
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
}
pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev));
pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
speed_max(mddev), desc);
is_mddev_idle(mddev, 1); /* this initializes IO event counters */
io_sectors = 0;
for (m = 0; m < SYNC_MARKS; m++) {
mark[m] = jiffies;
mark_cnt[m] = io_sectors;
}
last_mark = 0;
mddev->resync_mark = mark[last_mark];
mddev->resync_mark_cnt = mark_cnt[last_mark];
/*
* Tune reconstruction:
*/
window = 32 * (PAGE_SIZE / 512);
pr_debug("md: using %dk window, over a total of %lluk.\n",
window/2, (unsigned long long)max_sectors/2);
atomic_set(&mddev->recovery_active, 0);
last_check = 0;
if (j>2) {
pr_debug("md: resuming %s of %s from checkpoint.\n",
desc, mdname(mddev));
mddev->curr_resync = j;
} else
mddev->curr_resync = 3; /* no longer delayed */
mddev->curr_resync_completed = j;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
md_new_event();
update_time = jiffies;
blk_start_plug(&plug);
while (j < max_sectors) {
sector_t sectors;
skipped = 0;
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
((mddev->curr_resync > mddev->curr_resync_completed &&
(mddev->curr_resync - mddev->curr_resync_completed)
> (max_sectors >> 4)) ||
time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
(j - mddev->curr_resync_completed)*2
>= mddev->resync_max - mddev->curr_resync_completed ||
mddev->curr_resync_completed > mddev->resync_max
)) {
/* time to update curr_resync_completed */
wait_event(mddev->recovery_wait,
atomic_read(&mddev->recovery_active) == 0);
mddev->curr_resync_completed = j;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
j > mddev->recovery_cp)
mddev->recovery_cp = j;
update_time = jiffies;
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
while (j >= mddev->resync_max &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* As this condition is controlled by user-space,
* we can block indefinitely, so use '_interruptible'
* to avoid triggering warnings.
*/
flush_signals(current); /* just in case */
wait_event_interruptible(mddev->recovery_wait,
mddev->resync_max > j
|| test_bit(MD_RECOVERY_INTR,
&mddev->recovery));
}
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
break;
sectors = mddev->pers->sync_request(mddev, j, &skipped);
if (sectors == 0) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
break;
}
if (!skipped) { /* actual IO requested */
io_sectors += sectors;
atomic_add(sectors, &mddev->recovery_active);
}
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
break;
j += sectors;
if (j > max_sectors)
/* when skipping, extra large numbers can be returned. */
j = max_sectors;
if (j > 2)
mddev->curr_resync = j;
mddev->curr_mark_cnt = io_sectors;
if (last_check == 0)
/* this is the earliest that rebuild will be
* visible in /proc/mdstat
*/
md_new_event();
if (last_check + window > io_sectors || j == max_sectors)
continue;
last_check = io_sectors;
repeat:
if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
/* step marks */
int next = (last_mark+1) % SYNC_MARKS;
mddev->resync_mark = mark[next];
mddev->resync_mark_cnt = mark_cnt[next];
mark[next] = jiffies;
mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
last_mark = next;
}
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
break;
/*
* this loop exits only if either when we are slower than
* the 'hard' speed limit, or the system was IO-idle for
* a jiffy.
* the system might be non-idle CPU-wise, but we only care
* about not overloading the IO subsystem. (things like an
* e2fsck being done on the RAID array should execute fast)
*/
cond_resched();
recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
/((jiffies-mddev->resync_mark)/HZ +1) +1;
if (currspeed > speed_min(mddev)) {
if (currspeed > speed_max(mddev)) {
msleep(500);
goto repeat;
}
if (!is_mddev_idle(mddev, 0)) {
/*
* Give other IO more of a chance.
* The faster the devices, the less we wait.
*/
wait_event(mddev->recovery_wait,
!atomic_read(&mddev->recovery_active));
}
}
}
pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
test_bit(MD_RECOVERY_INTR, &mddev->recovery)
? "interrupted" : "done");
/*
* this also signals 'finished resyncing' to md_stop
*/
blk_finish_plug(&plug);
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev->curr_resync > 3) {
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 3) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
if (mddev->curr_resync >= mddev->recovery_cp) {
pr_debug("md: checkpointing %s of %s.\n",
desc, mdname(mddev));
if (test_bit(MD_RECOVERY_ERROR,
&mddev->recovery))
mddev->recovery_cp =
mddev->curr_resync_completed;
else
mddev->recovery_cp =
mddev->curr_resync;
}
} else
mddev->recovery_cp = MaxSector;
} else {
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
mddev->curr_resync = MaxSector;
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < mddev->curr_resync)
rdev->recovery_offset = mddev->curr_resync;
rcu_read_unlock();
}
}
}
skip:
/* set CHANGE_PENDING here since maybe another update is needed,
* so other nodes are informed. It should be harmless for normal
* raid */
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev->delta_disks > 0 &&
mddev->pers->finish_reshape &&
mddev->pers->size &&
mddev->queue) {
mddev_lock_nointr(mddev);
md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
mddev_unlock(mddev);
if (!mddev_is_clustered(mddev))
set_capacity_and_notify(mddev->gendisk,
mddev->array_sectors);
}
spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
mddev->curr_resync = 0;
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
md_wakeup_thread(mddev->thread);
return;
}