in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java [551:788]
protected Long restart(final HaWorkVO work) {
logger.debug("RESTART with HAWORK");
List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId());
if (items.size() > 0) {
StringBuilder str = new StringBuilder("Cancelling this work item because newer ones have been scheduled. Work Ids = [");
for (HaWorkVO item : items) {
str.append(item.getId()).append(", ");
}
str.delete(str.length() - 2, str.length()).append("]");
logger.info(str.toString());
return null;
}
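// If HA work for this VM is already executing, defer this item and retry later.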
items = _haDao.listRunningHaWorkForVm(work.getInstanceId());
if (items.size() > 0) {
StringBuilder str = new StringBuilder("Waiting because HA work is currently being executed for this VM. Work Ids = [");
for (HaWorkVO item : items) {
str.append(item.getId()).append(", ");
}
str.delete(str.length() - 2, str.length()).append("]");
logger.info(str.toString());
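// Defer this work item; the returned value is the next time to try, in milliseconds >> 10 (roughly seconds).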
return (System.currentTimeMillis() >> 10) + _investigateRetryInterval;
}
long vmId = work.getInstanceId();
VirtualMachine vm = _itMgr.findById(work.getInstanceId());
if (vm == null) {
logger.info("Unable to find vm: " + vmId);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
return null;
}
logger.info("HA on " + vm);
if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) {
logger.info("VM " + vm + " has been changed. Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " +
vm.getUpdated() + " previous updated = " + work.getUpdateTime());
return null;
}
if (vm.getHostId() != null && !vm.getHostId().equals(work.getHostId())) {
logger.info("VM " + vm + " has been changed. Current host id = " + vm.getHostId() + " Previous host id = " + work.getHostId());
return null;
}
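// Pick the alert type that matches the VM's role (user VM, domain router, console proxy, or secondary storage VM).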
AlertManager.AlertType alertType = AlertManager.AlertType.ALERT_TYPE_USERVM;
if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) {
alertType = AlertManager.AlertType.ALERT_TYPE_DOMAIN_ROUTER;
} else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
alertType = AlertManager.AlertType.ALERT_TYPE_CONSOLE_PROXY;
} else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
alertType = AlertManager.AlertType.ALERT_TYPE_SSVM;
}
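// Resolve the host the VM was running on; include removed hosts so HA can still act on a deleted host record.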
HostVO host = _hostDao.findById(work.getHostId());
boolean isHostRemoved = false;
if (host == null) {
host = _hostDao.findByIdIncludingRemoved(work.getHostId());
if (host != null) {
logger.debug("VM {} is now no longer on host {} as the host is removed", vm, host);
isHostRemoved = true;
}
}
DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
HostPodVO podVO = _podDao.findById(host.getPodId());
String hostDesc = String.format("%s, availability zone: %s, pod: %s", host, dcVO.getName(), podVO.getName());
Boolean alive = null;
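// During the Investigating step, ask each investigator whether the VM is still alive; fence it off if it is dead or its state cannot be determined.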
if (work.getStep() == Step.Investigating) {
if (!isHostRemoved) {
if (vm.getHostId() == null || !vm.getHostId().equals(work.getHostId())) {
logger.info("VM {} is no longer on host {}", vm, host);
return null;
}
Investigator investigator = null;
for (Investigator it : investigators) {
investigator = it;
try {
alive = investigator.isVmAlive(vm, host);
logger.info(investigator.getName() + " found " + vm + " to be alive? " + alive);
break;
} catch (UnknownVM e) {
logger.info(investigator.getName() + " could not find " + vm);
}
}
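// Fence off the VM when its state is unknown; if it is known dead, treat it as fenced; if it is alive, either finish (host up) or reschedule (host down).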
boolean fenced = false;
if (alive == null) {
logger.debug("Fencing off VM that we don't know the state of");
for (FenceBuilder fb : fenceBuilders) {
Boolean result = fb.fenceOff(vm, host);
logger.info("Fencer " + fb.getName() + " returned " + result);
if (result != null && result) {
fenced = true;
break;
}
}
} else if (!alive) {
fenced = true;
} else {
logger.debug("VM {} is found to be alive by {}", vm, investigator.getName());
if (host.getStatus() == Status.Up) {
logger.info(vm + " is alive and host is up. No need to restart it.");
return null;
} else {
logger.debug("Rescheduling because the host is not up but the vm is alive");
return (System.currentTimeMillis() >> 10) + _investigateRetryInterval;
}
}
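// If fencing failed, alert the admin and retry later instead of starting the VM elsewhere while the old instance may still be running.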
if (!fenced) {
logger.debug("We were unable to fence off the VM " + vm);
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() +
" which was running on host " + hostDesc, "Unable to fence off VM, name: " + vm.getHostName() + ", id: " + vmId +
" which was running on host " + hostDesc);
return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
}
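// The VM is fenced (or known dead): force-stop its record and mark the work item Scheduled so the restart can proceed.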
try {
_itMgr.advanceStop(vm.getUuid(), true);
} catch (ResourceUnavailableException | OperationTimedoutException | ConcurrentOperationException e) {
assert false : "How do we hit this when force is true?";
throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
}
work.setStep(Step.Scheduled);
_haDao.update(work.getId(), work);
} else {
logger.debug("How come that HA step is Investigating and the host is removed? Calling forced Stop on Vm anyways");
try {
_itMgr.advanceStop(vm.getUuid(), true);
} catch (ResourceUnavailableException e) {
assert false : "How do we hit this when force is true?";
throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
} catch (OperationTimedoutException e) {
assert false : "How do we hit this when force is true?";
throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
} catch (ConcurrentOperationException e) {
assert false : "How do we hit this when force is true?";
throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
}
}
}
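// Refresh the VM and stop here unless HA is enabled for it or force HA is configured.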
vm = _itMgr.findById(vm.getId());
if (!ForceHA.value() && !vm.isHaEnabled()) {
if (logger.isDebugEnabled()) {
logger.debug("VM is not HA enabled so we're done.");
}
return null; // VM doesn't require HA
}
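// When the original host is missing, removed, or not up, the VM can only be restarted if its volumes can be used from another server.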
if ((host == null || host.getRemoved() != null || host.getState() != Status.Up)
&& !volumeMgr.canVmRestartOnAnotherServer(vm.getId())) {
if (logger.isDebugEnabled()) {
logger.debug("VM can not restart on another server.");
}
return null;
}
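// Build the start parameters: apply the HA host tag if one is configured and flag HA work as an HA operation.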
try {
HashMap<VirtualMachineProfile.Param, Object> params = new HashMap<>();
if (_haTag != null) {
params.put(VirtualMachineProfile.Param.HaTag, _haTag);
}
WorkType wt = work.getWorkType();
if (wt.equals(WorkType.HA)) {
params.put(VirtualMachineProfile.Param.HaOperation, true);
}
try {
if (HypervisorType.KVM == host.getHypervisorType()) {
List<VolumeVO> volumes = volumeDao.findByInstance(vmId);
for (VolumeVO volumeVO : volumes) {
//detach the volumes from all clusters before starting the VM on another host.
if (volumeVO.getPoolType() == StoragePoolType.StorPool) {
DataStoreProvider storeProvider = dataStoreProviderMgr.getDataStoreProvider(volumeVO.getPoolType().name());
DataStoreDriver storeDriver = storeProvider.getDataStoreDriver();
if (storeDriver instanceof PrimaryDataStoreDriver) {
PrimaryDataStoreDriver primaryStoreDriver = (PrimaryDataStoreDriver)storeDriver;
primaryStoreDriver.detachVolumeFromAllStorageNodes(volumeVO);
}
}
}
}
// First try starting the VM with its original planner; if that does not succeed, fall back to the HAPlanner since this is an emergency.
startVm(vm, params, null);
} catch (InsufficientCapacityException e) {
logger.warn("Failed to deploy vm {} with its original planner, retrying with the HAPlanner", vm);
startVm(vm, params, _haPlanners.get(0));
}
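// If the VM came up in the Running state, send an informational alert and finish this work item.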
VMInstanceVO started = _instanceDao.findById(vm.getId());
if (started != null && started.getState() == VirtualMachine.State.Running) {
String message = String.format("HA starting VM: %s (%s)", started.getHostName(), started.getInstanceName());
HostVO hostVmHasStarted = _hostDao.findById(started.getHostId());
logger.info(String.format("HA is now restarting %s on %s", started, hostVmHasStarted));
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), message, message);
return null;
}
if (logger.isDebugEnabled()) {
logger.debug("Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval);
}
} catch (final InsufficientCapacityException e) {
logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " +
hostDesc, String.format("Insufficient capacity to restart VM, name: %s, id: %d uuid: %s which was running on host %s", vm.getHostName(), vmId, vm.getUuid(), hostDesc));
} catch (final ResourceUnavailableException e) {
logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " +
hostDesc, String.format("The resource is unavailable for trying to restart VM, name: %s, id: %d uuid: %s which was running on host %s", vm.getHostName(), vmId, vm.getUuid(), hostDesc));
} catch (final ConcurrentOperationException e) {
logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " +
hostDesc, String.format("A concurrent operation prevented restarting VM, name: %s, id: %d uuid: %s which was running on host %s", vm.getHostName(), vmId, vm.getUuid(), hostDesc));
} catch (OperationTimedoutException e) {
logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " +
hostDesc, String.format("The operation timed out while trying to restart VM, name: %s, id: %d uuid: %s which was running on host %s", vm.getHostName(), vmId, vm.getUuid(), hostDesc));
}
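// The restart did not complete; record the VM's current state on the work item and schedule another attempt.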
vm = _itMgr.findById(vm.getId());
work.setUpdateTime(vm.getUpdated());
work.setPreviousState(vm.getState());
return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
}