int hl_device_reset()

in habanalabs/common/device.c [978:1312]


int hl_device_reset(struct hl_device *hdev, u32 flags)
{
	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
			reset_upon_device_release = false, schedule_hard_reset = false;
	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
	struct hl_ctx *ctx;
	int i, rc;

	if (!hdev->init_done) {
		dev_err(hdev->dev, "Can't reset before initialization is done\n");
		return 0;
	}

	hard_reset = !!(flags & HL_DRV_RESET_HARD);
	from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
	fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);

	if (!hard_reset && !hdev->asic_prop.supports_soft_reset) {
		hard_instead_soft = true;
		hard_reset = true;
	}

	if (hdev->reset_upon_device_release && (flags & HL_DRV_RESET_DEV_RELEASE)) {
		if (hard_reset) {
			dev_crit(hdev->dev,
				"Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n");
			return -EINVAL;
		}

		reset_upon_device_release = true;

		goto do_reset;
	}

	if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) {
		hard_instead_soft = true;
		hard_reset = true;
	}

	if (hard_instead_soft)
		dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");

do_reset:
	/* Re-entry of reset thread */
	if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
		goto kill_processes;

	/*
	 * Prevent concurrency in this function - only one reset should be
	 * done at any given time. Only need to perform this if we didn't
	 * get from the dedicated hard reset thread
	 */
	if (!from_hard_reset_thread) {
		/* Block future CS/VM/JOB completion operations */
		spin_lock(&hdev->reset_info.lock);
		if (hdev->reset_info.in_reset) {
			/* We only allow scheduling of a hard reset during soft reset */
			if (hard_reset && hdev->reset_info.is_in_soft_reset)
				hdev->reset_info.hard_reset_schedule_flags = flags;
			spin_unlock(&hdev->reset_info.lock);
			return 0;
		}
		hdev->reset_info.in_reset = 1;
		spin_unlock(&hdev->reset_info.lock);

		handle_reset_trigger(hdev, flags);

		/* This still allows the completion of some KDMA ops */
		hdev->reset_info.is_in_soft_reset = !hard_reset;

		/* This also blocks future CS/VM/JOB completion operations */
		hdev->disabled = true;

		take_release_locks(hdev);

		if (hard_reset)
			dev_info(hdev->dev, "Going to reset device\n");
		else if (reset_upon_device_release)
			dev_info(hdev->dev, "Going to reset device after release by user\n");
		else
			dev_info(hdev->dev, "Going to reset engines of inference device\n");
	}

again:
	if ((hard_reset) && (!from_hard_reset_thread)) {
		hdev->reset_info.hard_reset_pending = true;

		hdev->process_kill_trial_cnt = 0;

		hdev->device_reset_work.flags = flags;

		/*
		 * Because the reset function can't run from heartbeat work,
		 * we need to call the reset function from a dedicated work.
		 */
		queue_delayed_work(hdev->device_reset_work.wq,
			&hdev->device_reset_work.reset_work, 0);

		return 0;
	}

	cleanup_resources(hdev, hard_reset, fw_reset);

kill_processes:
	if (hard_reset) {
		/* Kill processes here after CS rollback. This is because the
		 * process can't really exit until all its CSs are done, which
		 * is what we do in cs rollback
		 */
		rc = device_kill_open_processes(hdev, 0, false);

		if (rc == -EBUSY) {
			if (hdev->device_fini_pending) {
				dev_crit(hdev->dev,
					"Failed to kill all open processes, stopping hard reset\n");
				goto out_err;
			}

			/* signal reset thread to reschedule */
			return rc;
		}

		if (rc) {
			dev_crit(hdev->dev,
				"Failed to kill all open processes, stopping hard reset\n");
			goto out_err;
		}

		/* Flush the Event queue workers to make sure no other thread is
		 * reading or writing to registers during the reset
		 */
		flush_workqueue(hdev->eq_wq);
	}

	/* Reset the H/W. It will be in idle state after this returns */
	hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);

	if (hard_reset) {
		hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;

		/* Release kernel context */
		if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1)
			hdev->kernel_ctx = NULL;

		hl_vm_fini(hdev);
		hl_mmu_fini(hdev);
		hl_eq_reset(hdev, &hdev->event_queue);
	}

	/* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
	hl_hw_queue_reset(hdev, hard_reset);
	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		hl_cq_reset(hdev, &hdev->completion_queue[i]);

	/* Make sure the context switch phase will run again */
	ctx = hl_get_compute_ctx(hdev);
	if (ctx) {
		atomic_set(&ctx->thread_ctx_switch_token, 1);
		ctx->thread_ctx_switch_wait_token = 0;
		hl_ctx_put(ctx);
	}

	/* Finished tear-down, starting to re-initialize */

	if (hard_reset) {
		hdev->device_cpu_disabled = false;
		hdev->reset_info.hard_reset_pending = false;

		if (hdev->reset_info.reset_trigger_repeated &&
				(hdev->reset_info.prev_reset_trigger ==
						HL_DRV_RESET_FW_FATAL_ERR)) {
			/* if there 2 back to back resets from FW,
			 * ensure driver puts the driver in a unusable state
			 */
			dev_crit(hdev->dev,
				"Consecutive FW fatal errors received, stopping hard reset\n");
			rc = -EIO;
			goto out_err;
		}

		if (hdev->kernel_ctx) {
			dev_crit(hdev->dev,
				"kernel ctx was alive during hard reset, something is terribly wrong\n");
			rc = -EBUSY;
			goto out_err;
		}

		rc = hl_mmu_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed to initialize MMU S/W after hard reset\n");
			goto out_err;
		}

		/* Allocate the kernel context */
		hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
						GFP_KERNEL);
		if (!hdev->kernel_ctx) {
			rc = -ENOMEM;
			hl_mmu_fini(hdev);
			goto out_err;
		}

		hdev->is_compute_ctx_active = false;

		rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
		if (rc) {
			dev_err(hdev->dev,
				"failed to init kernel ctx in hard reset\n");
			kfree(hdev->kernel_ctx);
			hdev->kernel_ctx = NULL;
			hl_mmu_fini(hdev);
			goto out_err;
		}
	}

	/* Device is now enabled as part of the initialization requires
	 * communication with the device firmware to get information that
	 * is required for the initialization itself
	 */
	hdev->disabled = false;

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
		goto out_err;
	}

	/* If device is not idle fail the reset process */
	if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
			HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
		dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n",
			idle_mask[1], idle_mask[0]);
		rc = -EIO;
		goto out_err;
	}

	/* Check that the communication with the device is working */
	rc = hdev->asic_funcs->test_queues(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
		goto out_err;
	}

	if (hard_reset) {
		rc = device_late_init(hdev);
		if (rc) {
			dev_err(hdev->dev, "Failed late init after hard reset\n");
			goto out_err;
		}

		rc = hl_vm_init(hdev);
		if (rc) {
			dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
			goto out_err;
		}

		hl_set_max_power(hdev);
	} else {
		rc = hdev->asic_funcs->non_hard_reset_late_init(hdev);
		if (rc) {
			if (reset_upon_device_release)
				dev_err(hdev->dev,
					"Failed late init in reset after device release\n");
			else
				dev_err(hdev->dev, "Failed late init after soft reset\n");
			goto out_err;
		}
	}

	spin_lock(&hdev->reset_info.lock);
	hdev->reset_info.is_in_soft_reset = false;

	/* Schedule hard reset only if requested and if not already in hard reset.
	 * We keep 'in_reset' enabled, so no other reset can go in during the hard
	 * reset schedule
	 */
	if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags)
		schedule_hard_reset = true;
	else
		hdev->reset_info.in_reset = 0;

	spin_unlock(&hdev->reset_info.lock);

	hdev->reset_info.needs_reset = false;

	dev_notice(hdev->dev, "Successfully finished resetting the device\n");

	if (hard_reset) {
		hdev->reset_info.hard_reset_cnt++;

		/* After reset is done, we are ready to receive events from
		 * the F/W. We can't do it before because we will ignore events
		 * and if those events are fatal, we won't know about it and
		 * the device will be operational although it shouldn't be
		 */
		hdev->asic_funcs->enable_events_from_fw(hdev);
	} else if (!reset_upon_device_release) {
		hdev->reset_info.soft_reset_cnt++;
	}

	if (schedule_hard_reset) {
		dev_info(hdev->dev, "Performing hard reset scheduled during soft reset\n");
		flags = hdev->reset_info.hard_reset_schedule_flags;
		hdev->reset_info.hard_reset_schedule_flags = 0;
		hdev->disabled = true;
		hard_reset = true;
		handle_reset_trigger(hdev, flags);
		goto again;
	}

	return 0;

out_err:
	hdev->disabled = true;
	hdev->reset_info.is_in_soft_reset = false;

	if (hard_reset) {
		dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
		hdev->reset_info.hard_reset_cnt++;
	} else if (reset_upon_device_release) {
		dev_err(hdev->dev, "Failed to reset device after user release\n");
		hard_reset = true;
		goto again;
	} else {
		dev_err(hdev->dev, "Failed to do soft-reset\n");
		hdev->reset_info.soft_reset_cnt++;
		hard_reset = true;
		goto again;
	}

	hdev->reset_info.in_reset = 0;

	return rc;
}