int hl_device_init()

in habanalabs/common/device.c [1323:1609]


int hl_device_init(struct hl_device *hdev, struct class *hclass)
{
	int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt;
	char *name;
	bool add_cdev_sysfs_on_err = false;

	name = kasprintf(GFP_KERNEL, "hl%d", hdev->id / 2);
	if (!name) {
		rc = -ENOMEM;
		goto out_disabled;
	}

	/* Initialize cdev and device structures */
	rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name,
				&hdev->cdev, &hdev->dev);

	kfree(name);

	if (rc)
		goto out_disabled;

	name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->id / 2);
	if (!name) {
		rc = -ENOMEM;
		goto free_dev;
	}

	/* Initialize cdev and device structures for control device */
	rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops,
				name, &hdev->cdev_ctrl, &hdev->dev_ctrl);

	kfree(name);

	if (rc)
		goto free_dev;

	/* Initialize ASIC function pointers and perform early init */
	rc = device_early_init(hdev);
	if (rc)
		goto free_dev_ctrl;

	user_interrupt_cnt = hdev->asic_prop.user_interrupt_count;

	if (user_interrupt_cnt) {
		hdev->user_interrupt = kcalloc(user_interrupt_cnt,
				sizeof(*hdev->user_interrupt),
				GFP_KERNEL);

		if (!hdev->user_interrupt) {
			rc = -ENOMEM;
			goto early_fini;
		}
	}

	/*
	 * Start calling ASIC initialization. First S/W then H/W and finally
	 * late init
	 */
	rc = hdev->asic_funcs->sw_init(hdev);
	if (rc)
		goto user_interrupts_fini;


	/* initialize completion structure for multi CS wait */
	hl_multi_cs_completion_init(hdev);

	/*
	 * Initialize the H/W queues. Must be done before hw_init, because
	 * there the addresses of the kernel queue are being written to the
	 * registers of the device
	 */
	rc = hl_hw_queues_create(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize kernel queues\n");
		goto sw_fini;
	}

	cq_cnt = hdev->asic_prop.completion_queues_count;

	/*
	 * Initialize the completion queues. Must be done before hw_init,
	 * because there the addresses of the completion queues are being
	 * passed as arguments to request_irq
	 */
	if (cq_cnt) {
		hdev->completion_queue = kcalloc(cq_cnt,
				sizeof(*hdev->completion_queue),
				GFP_KERNEL);

		if (!hdev->completion_queue) {
			dev_err(hdev->dev,
				"failed to allocate completion queues\n");
			rc = -ENOMEM;
			goto hw_queues_destroy;
		}
	}

	for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) {
		rc = hl_cq_init(hdev, &hdev->completion_queue[i],
				hdev->asic_funcs->get_queue_id_for_cq(hdev, i));
		if (rc) {
			dev_err(hdev->dev,
				"failed to initialize completion queue\n");
			goto cq_fini;
		}
		hdev->completion_queue[i].cq_idx = i;
	}

	/*
	 * Initialize the event queue. Must be done before hw_init,
	 * because there the address of the event queue is being
	 * passed as argument to request_irq
	 */
	rc = hl_eq_init(hdev, &hdev->event_queue);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize event queue\n");
		goto cq_fini;
	}

	/* MMU S/W must be initialized before kernel context is created */
	rc = hl_mmu_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n");
		goto eq_fini;
	}

	/* Allocate the kernel context */
	hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
	if (!hdev->kernel_ctx) {
		rc = -ENOMEM;
		goto mmu_fini;
	}

	hdev->is_compute_ctx_active = false;

	hdev->asic_funcs->state_dump_init(hdev);

	hl_debugfs_add_device(hdev);

	/* debugfs nodes are created in hl_ctx_init so it must be called after
	 * hl_debugfs_add_device.
	 */
	rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize kernel context\n");
		kfree(hdev->kernel_ctx);
		goto remove_device_from_debugfs;
	}

	rc = hl_cb_pool_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize CB pool\n");
		goto release_ctx;
	}

	/*
	 * From this point, override rc (=0) in case of an error to allow
	 * debugging (by adding char devices and create sysfs nodes as part of
	 * the error flow).
	 */
	add_cdev_sysfs_on_err = true;

	/* Device is now enabled as part of the initialization requires
	 * communication with the device firmware to get information that
	 * is required for the initialization itself
	 */
	hdev->disabled = false;

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize the H/W\n");
		rc = 0;
		goto out_disabled;
	}

	/* Check that the communication with the device is working */
	rc = hdev->asic_funcs->test_queues(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to detect if device is alive\n");
		rc = 0;
		goto out_disabled;
	}

	rc = device_late_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed late initialization\n");
		rc = 0;
		goto out_disabled;
	}

	dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
		hdev->asic_name,
		hdev->asic_prop.dram_size / SZ_1G);

	rc = hl_vm_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize memory module\n");
		rc = 0;
		goto out_disabled;
	}

	/*
	 * Expose devices and sysfs nodes to user.
	 * From here there is no need to add char devices and create sysfs nodes
	 * in case of an error.
	 */
	add_cdev_sysfs_on_err = false;
	rc = device_cdev_sysfs_add(hdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to add char devices and sysfs nodes\n");
		rc = 0;
		goto out_disabled;
	}

	/* Need to call this again because the max power might change,
	 * depending on card type for certain ASICs
	 */
	hl_set_max_power(hdev);

	/*
	 * hl_hwmon_init() must be called after device_late_init(), because only
	 * there we get the information from the device about which
	 * hwmon-related sensors the device supports.
	 * Furthermore, it must be done after adding the device to the system.
	 */
	rc = hl_hwmon_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize hwmon\n");
		rc = 0;
		goto out_disabled;
	}

	dev_notice(hdev->dev,
		"Successfully added device to habanalabs driver\n");

	hdev->init_done = true;

	/* After initialization is done, we are ready to receive events from
	 * the F/W. We can't do it before because we will ignore events and if
	 * those events are fatal, we won't know about it and the device will
	 * be operational although it shouldn't be
	 */
	hdev->asic_funcs->enable_events_from_fw(hdev);

	return 0;

release_ctx:
	if (hl_ctx_put(hdev->kernel_ctx) != 1)
		dev_err(hdev->dev,
			"kernel ctx is still alive on initialization failure\n");
remove_device_from_debugfs:
	hl_debugfs_remove_device(hdev);
mmu_fini:
	hl_mmu_fini(hdev);
eq_fini:
	hl_eq_fini(hdev, &hdev->event_queue);
cq_fini:
	for (i = 0 ; i < cq_ready_cnt ; i++)
		hl_cq_fini(hdev, &hdev->completion_queue[i]);
	kfree(hdev->completion_queue);
hw_queues_destroy:
	hl_hw_queues_destroy(hdev);
sw_fini:
	hdev->asic_funcs->sw_fini(hdev);
user_interrupts_fini:
	kfree(hdev->user_interrupt);
early_fini:
	device_early_fini(hdev);
free_dev_ctrl:
	put_device(hdev->dev_ctrl);
free_dev:
	put_device(hdev->dev);
out_disabled:
	hdev->disabled = true;
	if (add_cdev_sysfs_on_err)
		device_cdev_sysfs_add(hdev);
	if (hdev->pdev)
		dev_err(&hdev->pdev->dev,
			"Failed to initialize hl%d. Device is NOT usable !\n",
			hdev->id / 2);
	else
		pr_err("Failed to initialize hl%d. Device is NOT usable !\n",
			hdev->id / 2);

	return rc;
}