protected Long restart()

in server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java [551:788]


    /**
     * Handles one HA work item for a VM: optionally investigates/fences the VM on its previous host,
     * force-stops it, and then tries to start it again (first with the VM's original planner, then
     * with the first configured HA planner if capacity was insufficient).
     *
     * <p>The work item is abandoned (returns {@code null}) when newer work exists for the same VM, the VM
     * cannot be found, {@code checkAndCancelWorkIfNeeded} says so, the VM's state/update time/host no longer
     * match what was recorded in the work item, the VM turned out to be alive on an {@code Up} host, the VM
     * is not HA enabled (and HA is not forced), the VM cannot restart on another server, or the VM was
     * started successfully.</p>
     *
     * <p>A host that cannot be found at all (not even as a removed record) is handled like a removed host:
     * investigation and fencing are skipped and the VM is force-stopped before the restart attempt.</p>
     *
     * @param work the HA work item to process; its step, update time and previous state may be modified
     * @return {@code null} when no further processing of this work item is needed; otherwise
     *         {@code (System.currentTimeMillis() >> 10)} plus a retry interval, presumably the time at which
     *         the caller should run this work item again (TODO confirm against the caller)
     * @throws CloudRuntimeException if the forced stop of the VM unexpectedly fails
     */
    protected Long restart(final HaWorkVO work) {
        logger.debug("RESTART with HAWORK");
        List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId());
        if (!items.isEmpty()) {
            logger.info("Cancelling this work item because newer ones have been scheduled.  Work Ids = [" + joinHaWorkIds(items) + "]");
            return null;
        }

        items = _haDao.listRunningHaWorkForVm(work.getInstanceId());
        if (!items.isEmpty()) {
            logger.info("Waiting because there's HA work being executed on an item currently.  Work Ids =[" + joinHaWorkIds(items) + "]");
            return (System.currentTimeMillis() >> 10) + _investigateRetryInterval;
        }

        long vmId = work.getInstanceId();

        VirtualMachine vm = _itMgr.findById(work.getInstanceId());
        if (vm == null) {
            logger.info("Unable to find vm: " + vmId);
            return null;
        }
        if (checkAndCancelWorkIfNeeded(work)) {
            return null;
        }

        logger.info("HA on " + vm);
        // The work item is only valid for the VM snapshot it was created for.
        if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) {
            logger.info("VM " + vm + " has been changed.  Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " +
                vm.getUpdated() + " previous updated = " + work.getUpdateTime());
            return null;
        }
        if (vm.getHostId() != null && !vm.getHostId().equals(work.getHostId())) {
            logger.info("VM " + vm + " has been changed.  Current host id = " + vm.getHostId() + " Previous host id = " + work.getHostId());
            return null;
        }

        AlertManager.AlertType alertType = AlertManager.AlertType.ALERT_TYPE_USERVM;
        if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) {
            alertType = AlertManager.AlertType.ALERT_TYPE_DOMAIN_ROUTER;
        } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
            alertType = AlertManager.AlertType.ALERT_TYPE_CONSOLE_PROXY;
        } else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
            alertType = AlertManager.AlertType.ALERT_TYPE_SSVM;
        }

        HostVO host = _hostDao.findById(work.getHostId());
        boolean isHostRemoved = false;
        if (host == null) {
            host = _hostDao.findByIdIncludingRemoved(work.getHostId());
            if (host != null) {
                logger.debug("VM {} is now no longer on host {} as the host is removed", vm, host);
                isHostRemoved = true;
            }
        }

        // Build the host description defensively: the host, its zone or its pod may no longer exist.
        String zoneName = null;
        String podName = null;
        if (host != null) {
            DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
            HostPodVO podVO = _podDao.findById(host.getPodId());
            zoneName = dcVO != null ? dcVO.getName() : null;
            podName = podVO != null ? podVO.getName() : null;
        }
        String hostLabel = host != null ? String.valueOf(host) : "host id " + work.getHostId();
        String hostDesc = String.format("%s, availability zone: %s, pod: %s", hostLabel, zoneName, podName);

        if (work.getStep() == Step.Investigating) {
            if (host != null && !isHostRemoved) {
                // Use equals(): comparing boxed ids with != would compare references, not values.
                if (vm.getHostId() == null || !vm.getHostId().equals(work.getHostId())) {
                    logger.info("VM {} is now no longer on host {}", vm, host);
                    return null;
                }

                // Ask investigators in order; the first one that knows the VM decides (its answer may still be null).
                Boolean alive = null;
                Investigator investigator = null;
                for (Investigator it : investigators) {
                    investigator = it;
                    try {
                        alive = investigator.isVmAlive(vm, host);
                        logger.info(investigator.getName() + " found " + vm + " to be alive? " + alive);
                        break;
                    } catch (UnknownVM e) {
                        logger.info(investigator.getName() + " could not find " + vm);
                    }
                }

                boolean fenced = false;
                if (alive == null) {
                    logger.debug("Fencing off VM that we don't know the state of");
                    for (FenceBuilder fb : fenceBuilders) {
                        Boolean result = fb.fenceOff(vm, host);
                        logger.info("Fencer " + fb.getName() + " returned " + result);
                        if (result != null && result) {
                            fenced = true;
                            break;
                        }
                    }
                } else if (!alive) {
                    // A VM known to be dead does not need to be fenced.
                    fenced = true;
                } else {
                    logger.debug("VM {} is found to be alive by {}", vm, investigator.getName());
                    if (host.getStatus() == Status.Up) {
                        logger.info(vm + " is alive and host is up. No need to restart it.");
                        return null;
                    } else {
                        logger.debug("Rescheduling because the host is not up but the vm is alive");
                        return (System.currentTimeMillis() >> 10) + _investigateRetryInterval;
                    }
                }

                if (!fenced) {
                    logger.debug("We were unable to fence off the VM " + vm);
                    // NOTE(review): the alert body says "Insufficient capacity" although the cause here is a
                    // failed fence; text kept unchanged because consumers may depend on it - confirm and fix.
                    _alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() +
                        " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getHostName() + ", id: " + vmId +
                        " which was running on host " + hostDesc);
                    return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
                }

                forceStopVmForHa(vm);

                work.setStep(Step.Scheduled);
                _haDao.update(work.getId(), work);
            } else {
                // Host is removed or cannot be found: nothing to investigate or fence through.
                logger.debug("How come that HA step is Investigating and the host is removed? Calling forced Stop on Vm anyways");
                forceStopVmForHa(vm);
            }
        }

        // Re-read the VM, the forced stop above may have changed it; it may also have disappeared meanwhile.
        vm = _itMgr.findById(vm.getId());
        if (vm == null) {
            logger.info("Unable to find vm: " + vmId);
            return null;
        }

        if (!ForceHA.value() && !vm.isHaEnabled()) {
            logger.debug("VM is not HA enabled so we're done.");
            return null; // VM doesn't require HA
        }

        if ((host == null || host.getRemoved() != null || host.getState() != Status.Up)
                 && !volumeMgr.canVmRestartOnAnotherServer(vm.getId())) {
            logger.debug("VM can not restart on another server.");
            return null;
        }

        try {
            HashMap<VirtualMachineProfile.Param, Object> params = new HashMap<VirtualMachineProfile.Param, Object>();
            if (_haTag != null) {
                params.put(VirtualMachineProfile.Param.HaTag, _haTag);
            }
            WorkType wt = work.getWorkType();
            if (wt.equals(WorkType.HA)) {
                params.put(VirtualMachineProfile.Param.HaOperation, true);
            }

            try {
                if (host != null && HypervisorType.KVM == host.getHypervisorType()) {
                    List<VolumeVO> volumes = volumeDao.findByInstance(vmId);
                    for (VolumeVO volumeVO : volumes) {
                        //detach the volumes from all clusters before starting the VM on another host.
                        if (volumeVO.getPoolType() == StoragePoolType.StorPool) {
                            DataStoreProvider storeProvider = dataStoreProviderMgr.getDataStoreProvider(volumeVO.getPoolType().name());
                            DataStoreDriver storeDriver = storeProvider.getDataStoreDriver();
                            if (storeDriver instanceof PrimaryDataStoreDriver) {
                                PrimaryDataStoreDriver primaryStoreDriver = (PrimaryDataStoreDriver)storeDriver;
                                primaryStoreDriver.detachVolumeFromAllStorageNodes(volumeVO);
                            }
                        }
                    }
                }
                // First try starting the vm with its original planner, if it doesn't succeed send HAPlanner as its an emergency.
                startVm(vm, params, null);
            } catch (InsufficientCapacityException e) {
                logger.warn("Failed to deploy vm {} with original planner, sending HAPlanner", vm);
                startVm(vm, params, _haPlanners.get(0));
            }

            VMInstanceVO started = _instanceDao.findById(vm.getId());
            if (started != null && started.getState() == VirtualMachine.State.Running) {
                String message = String.format("HA starting VM: %s (%s)", started.getHostName(), started.getInstanceName());
                HostVO hostVmHasStarted = _hostDao.findById(started.getHostId());
                logger.info(String.format("HA is now restarting %s on %s", started, hostVmHasStarted));
                _alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), message, message);
                return null;
            }

            if (logger.isDebugEnabled()) {
                logger.debug("Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval);
            }
        } catch (final InsufficientCapacityException e) {
            alertHaRestartFailure(alertType, vm, vmId, hostDesc, "Insufficient capacity to restart VM", e);
        } catch (final ResourceUnavailableException e) {
            alertHaRestartFailure(alertType, vm, vmId, hostDesc, "The resource is unavailable for trying to restart VM", e);
        } catch (ConcurrentOperationException e) {
            alertHaRestartFailure(alertType, vm, vmId, hostDesc, "The Storage is unavailable for trying to restart VM", e);
        } catch (OperationTimedoutException e) {
            alertHaRestartFailure(alertType, vm, vmId, hostDesc, "The operation timed out while trying to restart VM", e);
        }

        // Record the VM's current snapshot in the work item so the change checks above pass on the next run.
        VirtualMachine refreshedVm = _itMgr.findById(vm.getId());
        if (refreshedVm == null) {
            logger.info("Unable to find vm: " + vmId);
            return null;
        }
        work.setUpdateTime(refreshedVm.getUpdated());
        work.setPreviousState(refreshedVm.getState());
        return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
    }

    /** Returns the ids of the given work items joined with {@code ", "} (empty string for an empty list). */
    private static String joinHaWorkIds(final List<HaWorkVO> items) {
        StringBuilder ids = new StringBuilder();
        for (HaWorkVO item : items) {
            if (ids.length() > 0) {
                ids.append(", ");
            }
            ids.append(item.getId());
        }
        return ids.toString();
    }

    /** Force-stops the VM and converts the stop failures that are not expected with force=true into a {@code CloudRuntimeException}. */
    private void forceStopVmForHa(final VirtualMachine vm) {
        try {
            _itMgr.advanceStop(vm.getUuid(), true);
        } catch (ResourceUnavailableException e) {
            assert false : "How do we hit this when force is true?";
            throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
        } catch (OperationTimedoutException e) {
            assert false : "How do we hit this when force is true?";
            throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
        } catch (ConcurrentOperationException e) {
            assert false : "How do we hit this when force is true?";
            throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
        }
    }

    /** Logs a failed HA restart attempt and sends the matching alert; {@code reason} is the leading part of the alert body. */
    private void alertHaRestartFailure(final AlertManager.AlertType alertType, final VirtualMachine vm, final long vmId, final String hostDesc, final String reason,
            final Exception e) {
        logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
        _alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " +
            hostDesc, String.format("%s, name: %s, id: %d uuid: %s which was running on host %s", reason, vm.getHostName(), vmId, vm.getUuid(), hostDesc));
    }