in drbd/drbd_worker.c [1737:1915]
void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
{
	/*
	 * Move @device into the resync state given by @side (C_SYNC_SOURCE or
	 * C_SYNC_TARGET), reset the resync bookkeeping and kick off the resync.
	 *
	 * Outline:
	 *  1. Bail out if a resync is already running or there is no connection.
	 *  2. Unless B_RS_H_DONE is set, run the "before-resync-*" user helper
	 *     and let it veto the resync.
	 *  3. Take state_mutex (the worker only try-locks and retries via
	 *     start_resync_timer), then lock_all_resources().
	 *  4. Request the new state; on SS_SUCCESS reset the rs_* counters and
	 *     the resync LRU cache.
	 *  5. After unlock_all_resources(), do the parts that depend on the
	 *     agreed protocol version and arm the resync timer on the target.
	 *
	 * Nothing is returned; failures are logged and/or lead to an early return.
	 */
	struct drbd_peer_device *peer_device = first_peer_device(device);
	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
	union drbd_state ns;
	int r;

	if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
		drbd_err(device, "Resync already running!\n");
		return;
	}

	if (!connection) {
		drbd_err(device, "No connection to peer, aborting!\n");
		return;
	}

	/* B_RS_H_DONE is set below when we have to retry from the timer;
	 * in that case the handler is not run a second time. */
	if (!test_bit(B_RS_H_DONE, &device->flags)) {
		if (side == C_SYNC_TARGET) {
			/* Since application IO was locked out during C_WF_BITMAP_T and
			   C_WF_SYNC_UUID we are still unmodified. Before going to
			   C_SYNC_TARGET, where we might make the data inconsistent,
			   give the handler the chance to veto. */
			r = drbd_khelper(device, "before-resync-target");
			/* only bits 8..15 of the helper result are evaluated */
			r = (r >> 8) & 0xff;
			if (r > 0) {
				drbd_info(device, "before-resync-target handler returned %d, "
					 "dropping connection.\n", r);
				conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
				return;
			}
		} else /* C_SYNC_SOURCE */ {
			r = drbd_khelper(device, "before-resync-source");
			/* only bits 8..15 of the helper result are evaluated */
			r = (r >> 8) & 0xff;
			if (r > 0) {
				if (r == 3) {
					/* Tolerated: log it and carry on with the resync. */
					drbd_info(device, "before-resync-source handler returned %d, "
						 "ignoring. Old userland tools?\n", r);
				} else {
					drbd_info(device, "before-resync-source handler returned %d, "
						 "dropping connection.\n", r);
					conn_request_state(connection,
							   NS(conn, C_DISCONNECTING), CS_HARD);
					return;
				}
			}
		}
	}

	if (current == connection->worker.task) {
		/* The worker should not sleep waiting for state_mutex,
		   that can take long */
		if (!mutex_trylock(device->state_mutex)) {
			/* Remember that the handler already ran, then retry in
			 * HZ/5 jiffies via start_resync_timer.
			 * NOTE(review): the timer callback is not visible here;
			 * presumably it ends up calling this function again. */
			set_bit(B_RS_H_DONE, &device->flags);
			device->start_resync_timer.expires = jiffies + HZ/5;
			add_timer(&device->start_resync_timer);
			return;
		}
	} else {
		mutex_lock(device->state_mutex);
	}

	lock_all_resources();
	clear_bit(B_RS_H_DONE, &device->flags);
	/* Did some connection breakage or IO error race with us? */
	if (device->state.conn < C_CONNECTED
	|| !get_ldev_if_state(device, D_NEGOTIATING)) {
		/* No local disk reference is held on this path, so skip put_ldev(). */
		unlock_all_resources();
		goto out;
	}

	ns = drbd_read_state(device);

	ns.aftr_isp = !_drbd_may_sync_now(device);

	ns.conn = side;

	if (side == C_SYNC_TARGET)
		ns.disk = D_INCONSISTENT;
	else /* side == C_SYNC_SOURCE */
		ns.pdsk = D_INCONSISTENT;

	r = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
	/* Re-read: the state actually reached may differ from what we requested. */
	ns = drbd_read_state(device);

	if (ns.conn < C_CONNECTED)
		r = SS_UNKNOWN_ERROR;

	if (r == SS_SUCCESS) {
		unsigned long tw = drbd_bm_total_weight(device);
		unsigned long now = jiffies;
		int i;

		/* Fresh resync: reset statistics and seed every sync mark
		 * with the current bitmap weight and time. */
		device->rs_failed = 0;
		device->rs_paused = 0;
		device->rs_same_csum = 0;
		device->rs_last_sect_ev = 0;
		device->rs_total = tw;
		device->rs_start = now;
		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			device->rs_mark_left[i] = tw;
			device->rs_mark_time[i] = now;
		}
		drbd_pause_after(device);
		/* Forget potentially stale cached per resync extent bit-counts.
		 * Open coded drbd_rs_cancel_all(device), we already have IRQs
		 * disabled, and know the disk state is ok. */
		spin_lock(&device->al_lock);
		lc_reset(device->resync);
		device->resync_locked = 0;
		device->resync_wenr = LC_FREE;
		spin_unlock(&device->al_lock);
	}
	unlock_all_resources();

	if (r == SS_SUCCESS) {
		wake_up(&device->al_wait); /* for lc_reset() above */
		/* reset rs_last_bcast when a resync or verify is started,
		 * to deal with potential jiffies wrap. */
		device->rs_last_bcast = jiffies - HZ;

		drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
		     drbd_conn_str(ns.conn),
		     (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
		     (unsigned long) device->rs_total);
		if (side == C_SYNC_TARGET) {
			device->bm_resync_fo = 0;
			device->use_csums = use_checksum_based_resync(connection, device);
		} else {
			device->use_csums = false;
		}

		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
		 * with w_send_oos, or the sync target will get confused as to
		 * how many bits to resync. We cannot do that always, because for an
		 * empty resync and protocol < 95, we need to do it here, as we call
		 * drbd_resync_finished from here in that case.
		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
		 * and from after_state_ch otherwise. */
		if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
			drbd_gen_and_send_sync_uuid(peer_device);

		if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
			/* This still has a race (about when exactly the peers
			 * detect connection loss) that can lead to a full sync
			 * on next handshake. In 8.3.9 we fixed this with explicit
			 * resync-finished notifications, but the fix
			 * introduces a protocol change. Sleeping for some
			 * time longer than the ping interval + timeout on the
			 * SyncSource, to give the SyncTarget the chance to
			 * detect connection loss, then waiting for a ping
			 * response (implicit in drbd_resync_finished) reduces
			 * the race considerably, but does not solve it. */
			if (side == C_SYNC_SOURCE) {
				struct net_conf *nc;
				int timeo;

				/* net_conf is read under RCU; only the computed
				 * timeout is used after rcu_read_unlock(). */
				rcu_read_lock();
				nc = rcu_dereference(connection->net_conf);
				timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
				rcu_read_unlock();
				schedule_timeout_interruptible(timeo);
			}
			drbd_resync_finished(device);
		}

		drbd_rs_controller_reset(device);
		/* ns.conn may already be != device->state.conn,
		 * we may have been paused in between, or become paused until
		 * the timer triggers.
		 * No matter, that is handled in resync_timer_fn() */
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&device->resync_timer, jiffies);

		drbd_md_sync(device);
	}
	/* Drop the reference taken by get_ldev_if_state() above. */
	put_ldev(device);
out:
	mutex_unlock(device->state_mutex);
}