in raid5.c [5960:6227]
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
{
/* Reshaping is quite different from recovery/resync, so it is
* handled quite separately here.
*
* On each call to sync_request, we gather one chunk worth of
* destination stripes and flag them as expanding.
* Then we find all the source stripes and request reads.
* As the reads complete, handle_stripe will copy the data
* into the destination stripe and release that stripe.
*/
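/* Illustrative example (hypothetical numbers, not taken from the
* code): growing a RAID5 array from 4 to 5 devices takes data_disks
* from 3 to 4.  Each call here allocates one chunk's worth of
* destination stripes in the new 5-device layout, schedules reads of
* the corresponding blocks from the old 4-device layout, and
* handle_stripe copies each block into its new home as reads complete.
*/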
struct r5conf *conf = mddev->private;
struct stripe_head *sh;
struct md_rdev *rdev;
sector_t first_sector, last_sector;
int raid_disks = conf->previous_raid_disks;
int data_disks = raid_disks - conf->max_degraded;
int new_data_disks = conf->raid_disks - conf->max_degraded;
int i;
int dd_idx;
sector_t writepos, readpos, safepos;
sector_t stripe_addr;
int reshape_sectors;
struct list_head stripes;
sector_t retn;
if (sector_nr == 0) {
/* If restarting in the middle, skip the initial sectors */
if (mddev->reshape_backwards &&
conf->reshape_progress < raid5_size(mddev, 0, 0)) {
sector_nr = raid5_size(mddev, 0, 0)
- conf->reshape_progress;
} else if (mddev->reshape_backwards &&
conf->reshape_progress == MaxSector) {
/* shouldn't happen, but just in case, finish up. */
sector_nr = MaxSector;
} else if (!mddev->reshape_backwards &&
conf->reshape_progress > 0)
sector_nr = conf->reshape_progress;
sector_div(sector_nr, new_data_disks);
if (sector_nr) {
mddev->curr_resync_completed = sector_nr;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
*skipped = 1;
retn = sector_nr;
goto finish;
}
}
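/* Example of the restart conversion above (illustrative numbers):
* conf->reshape_progress is an array data address, while sector_nr is
* a per-device address in the new layout.  If a forward reshape with
* new_data_disks == 4 was interrupted at reshape_progress == 3072,
* sector_nr becomes 3072 / 4 == 768, and those 768 device sectors are
* reported as skipped so the sync cursor resumes from there.
*/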
/* We need to process a full chunk at a time.
* If old and new chunk sizes differ, we need to process the
* largest of these
*/
reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
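/* e.g. (illustrative) reshaping from a 512KiB chunk (1024 sectors)
* to a 64KiB chunk (128 sectors) gives reshape_sectors == 1024, so
* each pass still covers a whole old-layout chunk.
*/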
/* We update the metadata at least every 10 seconds, or when
* the data about to be copied would over-write the source of
* the data at the front of the range, i.e. when reshape_progress
* plus one new-layout stripe maps (in the new layout) to a point
* beyond where reshape_safe maps in the old layout.
*/
writepos = conf->reshape_progress;
sector_div(writepos, new_data_disks);
readpos = conf->reshape_progress;
sector_div(readpos, data_disks);
safepos = conf->reshape_safe;
sector_div(safepos, data_disks);
if (mddev->reshape_backwards) {
BUG_ON(writepos < reshape_sectors);
writepos -= reshape_sectors;
readpos += reshape_sectors;
safepos += reshape_sectors;
} else {
writepos += reshape_sectors;
/* readpos and safepos are worst-case calculations.
* A negative number is overly pessimistic, and causes
* obvious problems for unsigned storage. So clip to 0.
*/
readpos -= min_t(sector_t, reshape_sectors, readpos);
safepos -= min_t(sector_t, reshape_sectors, safepos);
}
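/* Worked example for the forward branch (illustrative numbers): with
* new_data_disks == 4, data_disks == 3, reshape_sectors == 1024,
* reshape_progress == 3072 and reshape_safe == 1536:
*   writepos = 3072/4 + 1024 == 1792
*   readpos  = 3072/3 - 1024 == 0
*   safepos  = 1536/3 == 512, then clipped to 0 by the min_t above
*/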
/* Having calculated 'writepos', possibly use it to set
* 'stripe_addr', which is where we will write to.
*/
if (mddev->reshape_backwards) {
BUG_ON(conf->reshape_progress == 0);
stripe_addr = writepos;
BUG_ON((mddev->dev_sectors &
~((sector_t)reshape_sectors - 1))
- reshape_sectors - stripe_addr
!= sector_nr);
} else {
BUG_ON(writepos != sector_nr + reshape_sectors);
stripe_addr = sector_nr;
}
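/* In the forward case stripe_addr is simply sector_nr, since
* sector_nr is already a device address in the new layout.  In the
* backwards case the chunk written is the one ending at the
* pre-decrement writepos; e.g. (illustrative, dev_sectors == 10240
* and a multiple of reshape_sectors == 1024) the first call has
* sector_nr == 0 and writes the chunk starting at device sector 9216.
*/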
/* 'writepos' is the most advanced device address we might write.
* 'readpos' is the least advanced device address we might read.
* 'safepos' is the least address recorded in the metadata as having
* been reshaped.
* If there is a min_offset_diff, these are adjusted either by
* increasing the safepos/readpos if diff is negative, or
* increasing writepos if diff is positive.
* If 'readpos' is then behind 'writepos', there is no way that we can
* ensure safety in the face of a crash - that must be done by userspace
* making a backup of the data. So in that case there is no particular
* rush to update metadata.
* Otherwise if 'safepos' is behind 'writepos', then we really need to
* update the metadata to advance 'safepos' to match 'readpos' so that
* we can be safe in the event of a crash.
* So we insist on updating metadata if safepos is behind writepos and
* readpos is beyond writepos.
* In any case, update the metadata every 10 seconds.
* Maybe that number should be configurable, but I'm not sure it is
* worth it.... maybe it could be a multiple of safemode_delay???
*/
if (conf->min_offset_diff < 0) {
safepos += -conf->min_offset_diff;
readpos += -conf->min_offset_diff;
} else
writepos += conf->min_offset_diff;
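/* e.g. (illustrative) if every device's new data_offset sits 2048
* sectors beyond its old one, min_offset_diff == 2048 and writes
* really land 2048 sectors past the nominal writepos, so writepos is
* bumped to keep the overlap test below conservative.  A negative
* difference bumps readpos/safepos instead, for the same reason.
*/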
if ((mddev->reshape_backwards
? (safepos > writepos && readpos < writepos)
: (safepos < writepos && readpos > writepos)) ||
time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes)==0
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (atomic_read(&conf->reshape_stripes) != 0)
return 0;
mddev->reshape_position = conf->reshape_progress;
mddev->curr_resync_completed = sector_nr;
if (!mddev->reshape_backwards)
/* Can update recovery_offset */
rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < sector_nr)
rdev->recovery_offset = sector_nr;
conf->reshape_checkpoint = jiffies;
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
return 0;
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
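/* The checkpoint above drains in-flight reshape stripes, records
* reshape_position (and recovery_offset for still-recovering devices
* on a forward reshape), has the md thread write the superblock,
* waits for that write, and only then advances conf->reshape_safe
* under device_lock.
*/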
INIT_LIST_HEAD(&stripes);
for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
int j;
int skipped_disk = 0;
sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
set_bit(STRIPE_EXPANDING, &sh->state);
atomic_inc(&conf->reshape_stripes);
/* If any of this stripe is beyond the end of the old
* array, then we need to zero those blocks
*/
for (j=sh->disks; j--;) {
sector_t s;
if (j == sh->pd_idx)
continue;
if (conf->level == 6 &&
j == sh->qd_idx)
continue;
s = raid5_compute_blocknr(sh, j, 0);
if (s < raid5_size(mddev, 0, 0)) {
skipped_disk = 1;
continue;
}
memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
set_bit(R5_Expanded, &sh->dev[j].flags);
set_bit(R5_UPTODATE, &sh->dev[j].flags);
}
if (!skipped_disk) {
set_bit(STRIPE_EXPAND_READY, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
}
list_add(&sh->lru, &stripes);
}
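/* Each destination stripe above is taken in the new layout.  Data
* blocks whose old-array address falls beyond the end of the old
* array have no source, so they are zeroed and marked uptodate;
* a stripe made up entirely of such blocks needs no reads and is
* flagged EXPAND_READY immediately.
*/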
spin_lock_irq(&conf->device_lock);
if (mddev->reshape_backwards)
conf->reshape_progress -= reshape_sectors * new_data_disks;
else
conf->reshape_progress += reshape_sectors * new_data_disks;
spin_unlock_irq(&conf->device_lock);
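/* e.g. (illustrative) with reshape_sectors == 1024 and
* new_data_disks == 4, reshape_progress moves by 4096 array sectors
* per call, matching the 1024 device sectors handled here.
*/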
/* OK, those stripes are ready. We can start scheduling
* reads on the source stripes.
* The source stripes are determined by mapping the first and last
* block on the destination stripes.
*/
first_sector =
raid5_compute_sector(conf, stripe_addr*(new_data_disks),
1, &dd_idx, NULL);
last_sector =
raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
* new_data_disks - 1),
1, &dd_idx, NULL);
if (last_sector >= mddev->dev_sectors)
last_sector = mddev->dev_sectors - 1;
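/* Continuing the illustrative numbers: with stripe_addr == 768,
* new_data_disks == 4 and data_disks == 3, this call moves array
* addresses 3072..7167, which the old layout stores at device
* sectors roughly 1024..2389 - that is the source range read below.
*/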
while (first_sector <= last_sector) {
sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
first_sector += RAID5_STRIPE_SECTORS(conf);
}
/* Now that the sources are clearly marked, we can release
* the destination stripes
*/
while (!list_empty(&stripes)) {
sh = list_entry(stripes.next, struct stripe_head, lru);
list_del_init(&sh->lru);
raid5_release_stripe(sh);
}
/* If this takes us to the resync_max point where we have to pause,
* then we need to write out the superblock.
*/
sector_nr += reshape_sectors;
retn = reshape_sectors;
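/* retn is the number of device sectors handled this call; md_do_sync
* advances its progress counter by the value we return.  sector_nr
* has moved on by reshape_sectors, which feeds the checkpoint test
* below.
*/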
finish:
if (mddev->curr_resync_completed > mddev->resync_max ||
(sector_nr - mddev->curr_resync_completed) * 2
>= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes) == 0
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (atomic_read(&conf->reshape_stripes) != 0)
goto ret;
mddev->reshape_position = conf->reshape_progress;
mddev->curr_resync_completed = sector_nr;
if (!mddev->reshape_backwards)
/* Can update recovery_offset */
rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < sector_nr)
rdev->recovery_offset = sector_nr;
conf->reshape_checkpoint = jiffies;
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
goto ret;
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
ret:
return retn;
}