in habanalabs/common/device.c [978:1312]
/*
 * hl_device_reset - reset the device
 *
 * @hdev: pointer to habanalabs device structure
 * @flags: bitmask of HL_DRV_RESET_* values. The bits examined directly here
 *         are HL_DRV_RESET_HARD, HL_DRV_RESET_FROM_RESET_THR,
 *         HL_DRV_RESET_BYPASS_REQ_TO_FW and HL_DRV_RESET_DEV_RELEASE; the
 *         whole mask is also forwarded to handle_reset_trigger() and to the
 *         deferred reset work.
 *
 * Reset type selection:
 * - A non-hard request is promoted to a hard reset when the ASIC does not
 *   support soft reset.
 * - A reset-upon-device-release (only when hdev->reset_upon_device_release is
 *   set and HL_DRV_RESET_DEV_RELEASE is passed) is mutually exclusive with a
 *   hard reset and is rejected with -EINVAL in that case.
 * - Otherwise, a non-hard request is promoted to a hard reset when inference
 *   soft reset is not allowed.
 *
 * Flow:
 * - Unless called from the reset thread, claim reset_info.in_reset under
 *   reset_info.lock, mark the device as disabled and release locks.
 * - A hard reset that was not requested by the reset thread is not executed
 *   inline; it is queued on hdev->device_reset_work and 0 is returned.
 * - Clean up resources, kill open processes (hard reset only), call the ASIC
 *   hw_fini, reset the queues and, for a hard reset, tear down and rebuild the
 *   kernel context, MMU and VM.
 * - Call the ASIC hw_init, verify the device is idle, test the queues and run
 *   the late-init step matching the reset type.
 * - If a hard reset was requested while a soft reset was running, perform it
 *   once the soft reset completes.
 * - If a soft reset or a reset-upon-device-release fails, escalate to a hard
 *   reset.
 *
 * Return: 0 if the reset completed, was handed over to the reset work, was
 * skipped because another reset is in progress, or was requested before
 * initialization finished. -EINVAL for the mutually-exclusive flag
 * combination. -EBUSY if processes could not be killed yet and the caller
 * (the reset thread) should reschedule. Any other negative value means the
 * hard reset failed and the device is left disabled.
 */
int hl_device_reset(struct hl_device *hdev, u32 flags)
{
	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
			reset_upon_device_release = false, schedule_hard_reset = false;
	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
	struct hl_ctx *ctx;
	int i, rc;

	if (!hdev->init_done) {
		dev_err(hdev->dev, "Can't reset before initialization is done\n");
		return 0;
	}

	hard_reset = !!(flags & HL_DRV_RESET_HARD);
	from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
	fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);

	/* No soft reset support at all - fall back to hard reset */
	if (!hard_reset && !hdev->asic_prop.supports_soft_reset) {
		hard_instead_soft = true;
		hard_reset = true;
	}

	if (hdev->reset_upon_device_release && (flags & HL_DRV_RESET_DEV_RELEASE)) {
		if (hard_reset) {
			dev_crit(hdev->dev,
				"Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n");
			return -EINVAL;
		}

		reset_upon_device_release = true;

		/* Skip the inference soft-reset check, it is irrelevant here */
		goto do_reset;
	}

	if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) {
		hard_instead_soft = true;
		hard_reset = true;
	}

	if (hard_instead_soft)
		dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");

do_reset:
	/* Re-entry of reset thread */
	if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
		goto kill_processes;

	/*
	 * Prevent concurrency in this function - only one reset should be
	 * done at any given time. Only need to perform this if we didn't
	 * get from the dedicated hard reset thread
	 */
	if (!from_hard_reset_thread) {
		/* Block future CS/VM/JOB completion operations */
		spin_lock(&hdev->reset_info.lock);
		if (hdev->reset_info.in_reset) {
			/* We only allow scheduling of a hard reset during soft reset */
			if (hard_reset && hdev->reset_info.is_in_soft_reset)
				hdev->reset_info.hard_reset_schedule_flags = flags;
			spin_unlock(&hdev->reset_info.lock);
			return 0;
		}
		hdev->reset_info.in_reset = 1;
		spin_unlock(&hdev->reset_info.lock);

		handle_reset_trigger(hdev, flags);

		/* This still allows the completion of some KDMA ops */
		hdev->reset_info.is_in_soft_reset = !hard_reset;

		/* This also blocks future CS/VM/JOB completion operations */
		hdev->disabled = true;

		take_release_locks(hdev);

		if (hard_reset)
			dev_info(hdev->dev, "Going to reset device\n");
		else if (reset_upon_device_release)
			dev_info(hdev->dev, "Going to reset device after release by user\n");
		else
			dev_info(hdev->dev, "Going to reset engines of inference device\n");
	}

again:
	if ((hard_reset) && (!from_hard_reset_thread)) {
		hdev->reset_info.hard_reset_pending = true;

		hdev->process_kill_trial_cnt = 0;

		/*
		 * The reset work receives these flags, so they must already
		 * describe the reset we actually want it to perform.
		 */
		hdev->device_reset_work.flags = flags;

		/*
		 * Because the reset function can't run from heartbeat work,
		 * we need to call the reset function from a dedicated work.
		 */
		queue_delayed_work(hdev->device_reset_work.wq,
			&hdev->device_reset_work.reset_work, 0);

		return 0;
	}

	cleanup_resources(hdev, hard_reset, fw_reset);

kill_processes:
	if (hard_reset) {
		/* Kill processes here after CS rollback. This is because the
		 * process can't really exit until all its CSs are done, which
		 * is what we do in cs rollback
		 */
		rc = device_kill_open_processes(hdev, 0, false);

		if (rc == -EBUSY) {
			if (hdev->device_fini_pending) {
				dev_crit(hdev->dev,
					"Failed to kill all open processes, stopping hard reset\n");
				goto out_err;
			}

			/* signal reset thread to reschedule */
			return rc;
		}

		if (rc) {
			dev_crit(hdev->dev,
				"Failed to kill all open processes, stopping hard reset\n");
			goto out_err;
		}

		/* Flush the Event queue workers to make sure no other thread is
		 * reading or writing to registers during the reset
		 */
		flush_workqueue(hdev->eq_wq);
	}

	/* Reset the H/W. It will be in idle state after this returns */
	hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);

	if (hard_reset) {
		hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;

		/*
		 * Release kernel context. The pointer is cleared only when
		 * hl_ctx_put() returns 1; a context that is still set later on
		 * is treated as a fatal error.
		 */
		if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1)
			hdev->kernel_ctx = NULL;

		hl_vm_fini(hdev);
		hl_mmu_fini(hdev);
		hl_eq_reset(hdev, &hdev->event_queue);
	}

	/* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
	hl_hw_queue_reset(hdev, hard_reset);
	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		hl_cq_reset(hdev, &hdev->completion_queue[i]);

	/* Make sure the context switch phase will run again */
	ctx = hl_get_compute_ctx(hdev);
	if (ctx) {
		atomic_set(&ctx->thread_ctx_switch_token, 1);
		ctx->thread_ctx_switch_wait_token = 0;
		hl_ctx_put(ctx);
	}

	/* Finished tear-down, starting to re-initialize */

	if (hard_reset) {
		hdev->device_cpu_disabled = false;
		hdev->reset_info.hard_reset_pending = false;

		if (hdev->reset_info.reset_trigger_repeated &&
				(hdev->reset_info.prev_reset_trigger ==
						HL_DRV_RESET_FW_FATAL_ERR)) {
			/* If there are 2 back-to-back resets from FW,
			 * ensure the driver puts the device in an unusable state
			 */
			dev_crit(hdev->dev,
				"Consecutive FW fatal errors received, stopping hard reset\n");
			rc = -EIO;
			goto out_err;
		}

		if (hdev->kernel_ctx) {
			dev_crit(hdev->dev,
				"kernel ctx was alive during hard reset, something is terribly wrong\n");
			rc = -EBUSY;
			goto out_err;
		}

		rc = hl_mmu_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed to initialize MMU S/W after hard reset\n");
			goto out_err;
		}

		/* Allocate the kernel context */
		hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
						GFP_KERNEL);
		if (!hdev->kernel_ctx) {
			rc = -ENOMEM;
			/* Undo the MMU init done just above */
			hl_mmu_fini(hdev);
			goto out_err;
		}

		hdev->is_compute_ctx_active = false;

		rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
		if (rc) {
			dev_err(hdev->dev,
				"failed to init kernel ctx in hard reset\n");
			kfree(hdev->kernel_ctx);
			hdev->kernel_ctx = NULL;
			hl_mmu_fini(hdev);
			goto out_err;
		}
	}

	/* Device is now enabled as part of the initialization requires
	 * communication with the device firmware to get information that
	 * is required for the initialization itself
	 */
	hdev->disabled = false;

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
		goto out_err;
	}

	/* If device is not idle fail the reset process */
	if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
			HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
		dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n",
			idle_mask[1], idle_mask[0]);
		rc = -EIO;
		goto out_err;
	}

	/* Check that the communication with the device is working */
	rc = hdev->asic_funcs->test_queues(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
		goto out_err;
	}

	if (hard_reset) {
		rc = device_late_init(hdev);
		if (rc) {
			dev_err(hdev->dev, "Failed late init after hard reset\n");
			goto out_err;
		}

		rc = hl_vm_init(hdev);
		if (rc) {
			dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
			goto out_err;
		}

		hl_set_max_power(hdev);
	} else {
		rc = hdev->asic_funcs->non_hard_reset_late_init(hdev);
		if (rc) {
			if (reset_upon_device_release)
				dev_err(hdev->dev,
					"Failed late init in reset after device release\n");
			else
				dev_err(hdev->dev, "Failed late init after soft reset\n");
			goto out_err;
		}
	}

	spin_lock(&hdev->reset_info.lock);
	hdev->reset_info.is_in_soft_reset = false;

	/* Schedule hard reset only if requested and if not already in hard reset.
	 * We keep 'in_reset' enabled, so no other reset can go in during the hard
	 * reset schedule
	 */
	if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags)
		schedule_hard_reset = true;
	else
		hdev->reset_info.in_reset = 0;

	spin_unlock(&hdev->reset_info.lock);

	hdev->reset_info.needs_reset = false;

	dev_notice(hdev->dev, "Successfully finished resetting the device\n");

	if (hard_reset) {
		hdev->reset_info.hard_reset_cnt++;

		/* After reset is done, we are ready to receive events from
		 * the F/W. We can't do it before because we will ignore events
		 * and if those events are fatal, we won't know about it and
		 * the device will be operational although it shouldn't be
		 */
		hdev->asic_funcs->enable_events_from_fw(hdev);
	} else if (!reset_upon_device_release) {
		hdev->reset_info.soft_reset_cnt++;
	}

	if (schedule_hard_reset) {
		dev_info(hdev->dev, "Performing hard reset scheduled during soft reset\n");
		/* Switch to the flags of the hard reset that was requested meanwhile */
		flags = hdev->reset_info.hard_reset_schedule_flags;
		hdev->reset_info.hard_reset_schedule_flags = 0;
		hdev->disabled = true;
		hard_reset = true;
		handle_reset_trigger(hdev, flags);
		goto again;
	}

	return 0;

out_err:
	hdev->disabled = true;
	hdev->reset_info.is_in_soft_reset = false;

	if (hard_reset) {
		dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
		hdev->reset_info.hard_reset_cnt++;
	} else if (reset_upon_device_release) {
		dev_err(hdev->dev, "Failed to reset device after user release\n");
		/*
		 * Escalate to a hard reset. 'flags' is what gets stored for the
		 * reset work at 'again' (the work handler is outside this
		 * function and is expected to re-enter with these flags), so it
		 * must request a hard reset. The device-release bit must be
		 * dropped as well, because hard reset combined with
		 * reset-on-device-release is rejected with -EINVAL at the top
		 * of this function.
		 */
		flags |= HL_DRV_RESET_HARD;
		flags &= ~HL_DRV_RESET_DEV_RELEASE;
		hard_reset = true;
		goto again;
	} else {
		dev_err(hdev->dev, "Failed to do soft-reset\n");
		hdev->reset_info.soft_reset_cnt++;
		/*
		 * Escalate to a hard reset and make 'flags' say so, otherwise
		 * the flags stored for the reset work at 'again' would still
		 * describe a soft reset.
		 */
		flags |= HL_DRV_RESET_HARD;
		hard_reset = true;
		goto again;
	}

	/* Only a failed hard reset reaches here; allow future reset attempts */
	hdev->reset_info.in_reset = 0;

	return rc;
}