in drbd/drbd_nl.c [1784:2203]
/**
 * drbd_adm_attach() - admin request handler: attach backing storage to a device
 * @skb:  request buffer, handed to drbd_adm_prepare()
 * @info: generic netlink request info; the disk configuration attributes are
 *        parsed from it by disk_conf_from_attrs()
 *
 * Rough sequence, as implemented below:
 *  1. Refuse if the device already has a disk (state.disk > D_DISKLESS).
 *  2. Allocate a new backing-device descriptor, disk_conf and resync plan,
 *     validate the configuration, open the backing devices and read the
 *     meta data super block.
 *  3. Run several size sanity checks, then request D_ATTACHING.
 *  4. Hand the new objects over to @device ("point of no return"), load or
 *     set the bitmap and compute the resulting disk state from meta data flags.
 *  5. Apply the new state and sync the meta data.
 *
 * Cleanup labels:
 *  force_diskless_dec - drop the ldev reference, then fall through
 *  force_diskless     - force the device back to D_DISKLESS, then fall through
 *  fail               - release everything this function still owns
 *  finish             - send the reply only (adm_mutex was never taken)
 *
 * Return: the drbd_adm_prepare() result if no reply skb is available;
 * otherwise 0, with the actual outcome passed to drbd_adm_finish().
 */
int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
{
	struct drbd_config_context adm_ctx;
	struct drbd_device *device;
	struct drbd_peer_device *peer_device;
	struct drbd_connection *connection;
	int err;
	enum drbd_ret_code retcode;
	enum determine_dev_size dd;
	sector_t max_possible_sectors;
	sector_t min_md_device_sectors;
	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
	struct disk_conf *new_disk_conf = NULL;
	struct lru_cache *resync_lru = NULL;
	struct fifo_buffer *new_plan = NULL;
	union drbd_state ns, os;
	enum drbd_state_rv rv;
	struct net_conf *nc;

	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
	if (!adm_ctx.reply_skb)
		return retcode;
	if (retcode != NO_ERROR)
		goto finish;

	device = adm_ctx.device;
	mutex_lock(&adm_ctx.resource->adm_mutex);
	peer_device = first_peer_device(device);
	connection = peer_device->connection;
	conn_reconfig_start(connection);

	/* if you want to reconfigure, please tear down first */
	if (device->state.disk > D_DISKLESS) {
		retcode = ERR_DISK_CONFIGURED;
		goto fail;
	}
	/* It may just now have detached because of IO error.  Make sure
	 * drbd_ldev_destroy is done already, we may end up here very fast,
	 * e.g. if someone calls attach from the on-io-error handler,
	 * to realize a "hot spare" feature (not that I'd recommend that) */
	wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags));

	/* make sure there is no leftover from previous force-detach attempts */
	clear_bit(FORCE_DETACH, &device->flags);
	clear_bit(WAS_IO_ERROR, &device->flags);
	clear_bit(WAS_READ_ERROR, &device->flags);

	/* and no leftover from previously aborted resync or verify, either */
	device->rs_total = 0;
	device->rs_failed = 0;
	atomic_set(&device->rs_pending_cnt, 0);

	/* allocation not in the IO path, drbdsetup context */
	nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
	if (!nbc) {
		retcode = ERR_NOMEM;
		goto fail;
	}
	spin_lock_init(&nbc->md.uuid_lock);

	new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
	if (!new_disk_conf) {
		retcode = ERR_NOMEM;
		goto fail;
	}
	/* nbc references new_disk_conf; the fail path frees it via the local. */
	nbc->disk_conf = new_disk_conf;

	/* Start from defaults, then overlay what the request supplied. */
	set_disk_conf_defaults(new_disk_conf);
	err = disk_conf_from_attrs(new_disk_conf, info);
	if (err) {
		retcode = ERR_MANDATORY_TAG;
		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
		goto fail;
	}

	/* Clamp c_plan_ahead before it is used to size the plan fifo. */
	if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
		new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;

	new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
	if (!new_plan) {
		retcode = ERR_NOMEM;
		goto fail;
	}

	if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
		retcode = ERR_MD_IDX_INVALID;
		goto fail;
	}

	/* Reject fencing == FP_STONITH in combination with wire protocol A. */
	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	if (nc) {
		if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
			rcu_read_unlock();
			retcode = ERR_STONITH_AND_PROT_A;
			goto fail;
		}
	}
	rcu_read_unlock();

	retcode = open_backing_devices(device, new_disk_conf, nbc);
	if (retcode != NO_ERROR)
		goto fail;

	/* Data and meta data must share one block device exactly when the
	 * index is DRBD_MD_INDEX_INTERNAL or DRBD_MD_INDEX_FLEX_INT. */
	if ((nbc->backing_bdev == nbc->md_bdev) !=
	    (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
	     new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
		retcode = ERR_MD_IDX_INVALID;
		goto fail;
	}

	resync_lru = lc_create("resync", drbd_bm_ext_cache,
			1, 61, sizeof(struct bm_extent),
			offsetof(struct bm_extent, lce));
	if (!resync_lru) {
		retcode = ERR_NOMEM;
		goto fail;
	}

	/* Read our meta data super block early.
	 * This also sets other on-disk offsets. */
	retcode = drbd_md_read(device, nbc);
	if (retcode != NO_ERROR)
		goto fail;

	sanitize_disk_conf(device, new_disk_conf, nbc);

	if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
		drbd_err(device, "max capacity %llu smaller than disk size %llu\n",
			(unsigned long long) drbd_get_max_capacity(nbc),
			(unsigned long long) new_disk_conf->disk_size);
		retcode = ERR_DISK_TOO_SMALL;
		goto fail;
	}

	/* Negative meta_dev_idx values are the special (non-indexed) meta
	 * data types; non-negative values need room for idx + 1 slots. */
	if (new_disk_conf->meta_dev_idx < 0) {
		max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
		/* at least one MB, otherwise it does not make sense */
		min_md_device_sectors = (2<<10);
	} else {
		max_possible_sectors = DRBD_MAX_SECTORS;
		min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
	}

	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
		retcode = ERR_MD_DISK_TOO_SMALL;
		drbd_warn(device, "refusing attach: md-device too small, "
		     "at least %llu sectors needed for this meta-disk type\n",
		     (unsigned long long) min_md_device_sectors);
		goto fail;
	}

	/* Make sure the new disk is big enough
	 * (we may currently be R_PRIMARY with no local disk...) */
	if (drbd_get_max_capacity(nbc) < get_capacity(device->vdisk)) {
		retcode = ERR_DISK_TOO_SMALL;
		goto fail;
	}

	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);

	/* Only warns; known_size itself is not modified here. */
	if (nbc->known_size > max_possible_sectors) {
		drbd_warn(device, "==> truncating very big lower level device "
			"to currently maximum possible %llu sectors <==\n",
			(unsigned long long) max_possible_sectors);
		if (new_disk_conf->meta_dev_idx >= 0)
			drbd_warn(device, "==>> using internal or flexible "
				      "meta data may help <<==\n");
	}

	drbd_suspend_io(device);
	/* also wait for the last barrier ack. */
	/* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
	 * We need a way to either ignore barrier acks for barriers sent before a device
	 * was attached, or a way to wait for all pending barrier acks to come in.
	 * As barriers are counted per resource,
	 * we'd need to suspend io on all devices of a resource.
	 */
	wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
	/* and for any other previously queued work */
	drbd_flush_workqueue(&connection->sender_work);

	rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
	retcode = (enum drbd_ret_code)rv;
	drbd_resume_io(device);
	if (rv < SS_SUCCESS)
		goto fail;

	/* NOTE(review): on this failure retcode still carries the successful
	 * state rv from above rather than a dedicated error code. */
	if (!get_ldev_if_state(device, D_ATTACHING))
		goto force_diskless;

	if (!device->bitmap) {
		if (drbd_bm_init(device)) {
			retcode = ERR_NOMEM;
			goto force_diskless_dec;
		}
	}

	/* Compare current UUIDs with the lowest bit masked out on both sides. */
	if (device->state.pdsk != D_UP_TO_DATE && device->ed_uuid &&
	    (device->state.role == R_PRIMARY || device->state.peer == R_PRIMARY) &&
            (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
		drbd_err(device, "Can only attach to data with current UUID=%016llX\n",
		    (unsigned long long)device->ed_uuid);
		retcode = ERR_DATA_NOT_CURRENT;
		goto force_diskless_dec;
	}

	/* Since we are diskless, fix the activity log first... */
	if (drbd_check_al_size(device, new_disk_conf)) {
		retcode = ERR_NOMEM;
		goto force_diskless_dec;
	}

	/* Prevent shrinking of consistent devices ! */
	{
	unsigned long long nsz = drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0);
	unsigned long long eff = nbc->md.la_size_sect;
	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && nsz < eff) {
		/* A new size equal to the configured disk_size is only
		 * warned about; any other shrink is refused. */
		if (nsz == nbc->disk_conf->disk_size) {
			drbd_warn(device, "truncating a consistent device during attach (%llu < %llu)\n", nsz, eff);
		} else {
			drbd_warn(device, "refusing to truncate a consistent device (%llu < %llu)\n", nsz, eff);
			drbd_msg_sprintf_info(adm_ctx.reply_skb,
				"To-be-attached device has last effective > current size, and is consistent\n"
				"(%llu > %llu sectors). Refusing to attach.", eff, nsz);
			retcode = ERR_IMPLICIT_SHRINK;
			goto force_diskless_dec;
		}
	}
	}

	lock_all_resources();
	retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
	if (retcode != NO_ERROR) {
		unlock_all_resources();
		goto force_diskless_dec;
	}

	/* Reset the "barriers don't work" bits here, then force meta data to
	 * be written, to ensure we determine if barriers are supported. */
	if (new_disk_conf->md_flushes)
		clear_bit(MD_NO_FUA, &device->flags);
	else
		set_bit(MD_NO_FUA, &device->flags);

	/* Point of no return reached.
	 * Devices and memory are no longer released by error cleanup below.
	 * now device takes over responsibility, and the state engine should
	 * clean it up somewhere.  */
	D_ASSERT(device, device->ldev == NULL);
	device->ldev = nbc;
	device->resync = resync_lru;
	device->rs_plan_s = new_plan;
	/* Clear the locals so the fail path does not release what the
	 * device now owns. */
	nbc = NULL;
	resync_lru = NULL;
	new_disk_conf = NULL;
	new_plan = NULL;

	drbd_resync_after_changed(device);
	drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH);
	unlock_all_resources();

	if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
		set_bit(CRASHED_PRIMARY, &device->flags);
	else
		clear_bit(CRASHED_PRIMARY, &device->flags);

	if (drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
	    !(device->state.role == R_PRIMARY && device->resource->susp_nod))
		set_bit(CRASHED_PRIMARY, &device->flags);

	/* Reset the per-device transfer counters. */
	device->send_cnt = 0;
	device->recv_cnt = 0;
	device->read_cnt = 0;
	device->writ_cnt = 0;

	drbd_reconsider_queue_parameters(device, device->ldev, NULL);

	/* If I am currently not R_PRIMARY,
	 * but meta data primary indicator is set,
	 * I just now recover from a hard crash,
	 * and have been R_PRIMARY before that crash.
	 *
	 * Now, if I had no connection before that crash
	 * (have been degraded R_PRIMARY), chances are that
	 * I won't find my peer now either.
	 *
	 * In that case, and _only_ in that case,
	 * we use the degr-wfc-timeout instead of the default,
	 * so we can automatically recover from a crash of a
	 * degraded but active "cluster" after a certain timeout.
	 */
	clear_bit(USE_DEGR_WFC_T, &device->flags);
	if (device->state.role != R_PRIMARY &&
	     drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
	    !drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND))
		set_bit(USE_DEGR_WFC_T, &device->flags);

	dd = drbd_determine_dev_size(device, 0, NULL);
	if (dd <= DS_ERROR) {
		retcode = ERR_NOMEM_BITMAP;
		goto force_diskless_dec;
	} else if (dd == DS_GREW)
		set_bit(RESYNC_AFTER_NEG, &device->flags);

	/* Either mark everything out of sync and write that out, or read
	 * the bitmap from the meta data. */
	if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ||
	    (test_bit(CRASHED_PRIMARY, &device->flags) &&
	     drbd_md_test_flag(device->ldev, MDF_AL_DISABLED))) {
		drbd_info(device, "Assuming that all blocks are out of sync "
		     "(aka FullSync)\n");
		if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
			"set_n_write from attaching", BM_LOCKED_MASK)) {
			retcode = ERR_IO_MD_DISK;
			goto force_diskless_dec;
		}
	} else {
		if (drbd_bitmap_io(device, &drbd_bm_read,
			"read from attaching", BM_LOCKED_MASK)) {
			retcode = ERR_IO_MD_DISK;
			goto force_diskless_dec;
		}
	}

	/* NOTE(review): the trailing comment below says IO is still suspended,
	 * but drbd_resume_io() was already called above; confirm. */
	if (_drbd_bm_total_weight(device) == drbd_bm_bits(device))
		drbd_suspend_al(device); /* IO is still suspended here... */

	spin_lock_irq(&device->resource->req_lock);
	os = drbd_read_state(device);
	ns = os;
	/* If MDF_CONSISTENT is not set go into inconsistent state,
	   otherwise investigate MDF_WAS_UP_TO_DATE...
	   If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
	   otherwise into D_CONSISTENT state.
	*/
	if (drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) {
		if (drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE))
			ns.disk = D_CONSISTENT;
		else
			ns.disk = D_OUTDATED;
	} else {
		ns.disk = D_INCONSISTENT;
	}

	if (drbd_md_test_flag(device->ldev, MDF_PEER_OUT_DATED))
		ns.pdsk = D_OUTDATED;

	rcu_read_lock();
	/* Promote D_CONSISTENT to D_UP_TO_DATE when the peer is outdated or
	 * fencing is FP_DONT_CARE. */
	if (ns.disk == D_CONSISTENT &&
	    (ns.pdsk == D_OUTDATED || rcu_dereference(device->ldev->disk_conf)->fencing == FP_DONT_CARE))
		ns.disk = D_UP_TO_DATE;

	/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
	   MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
	   this point, because drbd_request_state() modifies these
	   flags. */

	if (rcu_dereference(device->ldev->disk_conf)->al_updates)
		device->ldev->md.flags &= ~MDF_AL_DISABLED;
	else
		device->ldev->md.flags |= MDF_AL_DISABLED;

	rcu_read_unlock();

	/* In case we are C_CONNECTED postpone any decision on the new disk
	   state after the negotiation phase. */
	if (device->state.conn == C_CONNECTED) {
		device->new_state_tmp.i = ns.i;
		ns.i = os.i;
		ns.disk = D_NEGOTIATING;

		/* We expect to receive up-to-date UUIDs soon.
		   To avoid a race in receive_state, free p_uuid while
		   holding req_lock. I.e. atomic with the state change */
		kfree(device->p_uuid);
		device->p_uuid = NULL;
	}

	rv = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
	spin_unlock_irq(&device->resource->req_lock);

	if (rv < SS_SUCCESS) {
		/* retcode is still NO_ERROR at this point; without this
		 * assignment the failed attach would be reported as success
		 * even though the device is forced back to D_DISKLESS.
		 * Same conversion as after _drbd_request_state() above. */
		retcode = (enum drbd_ret_code)rv;
		goto force_diskless_dec;
	}

	mod_timer(&device->request_timer, jiffies + HZ);

	/* Lowest bit of the current UUID is set iff we are R_PRIMARY. */
	if (device->state.role == R_PRIMARY)
		device->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
	else
		device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;

	drbd_md_mark_dirty(device);
	drbd_md_sync(device);

	kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
	put_ldev(device);
	conn_reconfig_done(connection);
	mutex_unlock(&adm_ctx.resource->adm_mutex);
	drbd_adm_finish(&adm_ctx, info, retcode);
	return 0;

 force_diskless_dec:
	put_ldev(device);
 force_diskless:
	drbd_force_state(device, NS(disk, D_DISKLESS));
	drbd_md_sync(device);
 fail:
	conn_reconfig_done(connection);
	/* nbc is NULL once ownership moved to the device, so nothing that
	 * the device owns is closed or freed here. */
	if (nbc) {
		close_backing_dev(device, nbc->md_bdev, nbc->md_bdev != nbc->backing_bdev);
		close_backing_dev(device, nbc->backing_bdev, true);
		kfree(nbc);
	}
	kfree(new_disk_conf);
	lc_destroy(resync_lru);
	kfree(new_plan);
	mutex_unlock(&adm_ctx.resource->adm_mutex);
 finish:
	drbd_adm_finish(&adm_ctx, info, retcode);
	return 0;
}