static void gaudi_handle_eqe()

in habanalabs/gaudi/gaudi.c [8065:8320]


static void gaudi_handle_eqe(struct hl_device *hdev,
				struct hl_eq_entry *eq_entry)
{
	struct gaudi_device *gaudi = hdev->asic_specific;
	u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
	u32 fw_fatal_err_flag = 0;
	u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
			>> EQ_CTL_EVENT_TYPE_SHIFT);
	bool reset_required;
	u8 cause;
	int rc;

	if (event_type >= GAUDI_EVENT_SIZE) {
		dev_err(hdev->dev, "Event type %u exceeds maximum of %u",
				event_type, GAUDI_EVENT_SIZE - 1);
		return;
	}

	gaudi->events_stat[event_type]++;
	gaudi->events_stat_aggregate[event_type]++;

	switch (event_type) {
	case GAUDI_EVENT_PCIE_CORE_DERR:
	case GAUDI_EVENT_PCIE_IF_DERR:
	case GAUDI_EVENT_PCIE_PHY_DERR:
	case GAUDI_EVENT_TPC0_DERR ... GAUDI_EVENT_TPC7_DERR:
	case GAUDI_EVENT_MME0_ACC_DERR:
	case GAUDI_EVENT_MME0_SBAB_DERR:
	case GAUDI_EVENT_MME1_ACC_DERR:
	case GAUDI_EVENT_MME1_SBAB_DERR:
	case GAUDI_EVENT_MME2_ACC_DERR:
	case GAUDI_EVENT_MME2_SBAB_DERR:
	case GAUDI_EVENT_MME3_ACC_DERR:
	case GAUDI_EVENT_MME3_SBAB_DERR:
	case GAUDI_EVENT_DMA0_DERR_ECC ... GAUDI_EVENT_DMA7_DERR_ECC:
		fallthrough;
	case GAUDI_EVENT_CPU_IF_ECC_DERR:
	case GAUDI_EVENT_PSOC_MEM_DERR:
	case GAUDI_EVENT_PSOC_CORESIGHT_DERR:
	case GAUDI_EVENT_SRAM0_DERR ... GAUDI_EVENT_SRAM28_DERR:
	case GAUDI_EVENT_DMA_IF0_DERR ... GAUDI_EVENT_DMA_IF3_DERR:
	case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR:
	case GAUDI_EVENT_MMU_DERR:
	case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR:
		gaudi_print_irq_info(hdev, event_type, true);
		gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
		fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
		goto reset_device;

	case GAUDI_EVENT_GIC500:
	case GAUDI_EVENT_AXI_ECC:
	case GAUDI_EVENT_L2_RAM_ECC:
	case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
		gaudi_print_irq_info(hdev, event_type, false);
		fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
		goto reset_device;

	case GAUDI_EVENT_HBM0_SPI_0:
	case GAUDI_EVENT_HBM1_SPI_0:
	case GAUDI_EVENT_HBM2_SPI_0:
	case GAUDI_EVENT_HBM3_SPI_0:
		gaudi_print_irq_info(hdev, event_type, false);
		gaudi_hbm_read_interrupts(hdev,
				gaudi_hbm_event_to_dev(event_type),
				&eq_entry->hbm_ecc_data);
		fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
		goto reset_device;

	case GAUDI_EVENT_HBM0_SPI_1:
	case GAUDI_EVENT_HBM1_SPI_1:
	case GAUDI_EVENT_HBM2_SPI_1:
	case GAUDI_EVENT_HBM3_SPI_1:
		gaudi_print_irq_info(hdev, event_type, false);
		gaudi_hbm_read_interrupts(hdev,
				gaudi_hbm_event_to_dev(event_type),
				&eq_entry->hbm_ecc_data);
		hl_fw_unmask_irq(hdev, event_type);
		break;

	case GAUDI_EVENT_TPC0_DEC:
	case GAUDI_EVENT_TPC1_DEC:
	case GAUDI_EVENT_TPC2_DEC:
	case GAUDI_EVENT_TPC3_DEC:
	case GAUDI_EVENT_TPC4_DEC:
	case GAUDI_EVENT_TPC5_DEC:
	case GAUDI_EVENT_TPC6_DEC:
	case GAUDI_EVENT_TPC7_DEC:
		gaudi_print_irq_info(hdev, event_type, true);
		reset_required = gaudi_tpc_read_interrupts(hdev,
					tpc_dec_event_to_tpc_id(event_type),
					"AXI_SLV_DEC_Error");
		if (reset_required) {
			dev_err(hdev->dev, "reset required due to %s\n",
				gaudi_irq_map_table[event_type].name);

			hl_device_reset(hdev, 0);
		} else {
			hl_fw_unmask_irq(hdev, event_type);
		}
		break;

	case GAUDI_EVENT_TPC0_KRN_ERR:
	case GAUDI_EVENT_TPC1_KRN_ERR:
	case GAUDI_EVENT_TPC2_KRN_ERR:
	case GAUDI_EVENT_TPC3_KRN_ERR:
	case GAUDI_EVENT_TPC4_KRN_ERR:
	case GAUDI_EVENT_TPC5_KRN_ERR:
	case GAUDI_EVENT_TPC6_KRN_ERR:
	case GAUDI_EVENT_TPC7_KRN_ERR:
		gaudi_print_irq_info(hdev, event_type, true);
		reset_required = gaudi_tpc_read_interrupts(hdev,
					tpc_krn_event_to_tpc_id(event_type),
					"KRN_ERR");
		if (reset_required) {
			dev_err(hdev->dev, "reset required due to %s\n",
				gaudi_irq_map_table[event_type].name);

			hl_device_reset(hdev, 0);
		} else {
			hl_fw_unmask_irq(hdev, event_type);
		}
		break;

	case GAUDI_EVENT_PCIE_CORE_SERR:
	case GAUDI_EVENT_PCIE_IF_SERR:
	case GAUDI_EVENT_PCIE_PHY_SERR:
	case GAUDI_EVENT_TPC0_SERR ... GAUDI_EVENT_TPC7_SERR:
	case GAUDI_EVENT_MME0_ACC_SERR:
	case GAUDI_EVENT_MME0_SBAB_SERR:
	case GAUDI_EVENT_MME1_ACC_SERR:
	case GAUDI_EVENT_MME1_SBAB_SERR:
	case GAUDI_EVENT_MME2_ACC_SERR:
	case GAUDI_EVENT_MME2_SBAB_SERR:
	case GAUDI_EVENT_MME3_ACC_SERR:
	case GAUDI_EVENT_MME3_SBAB_SERR:
	case GAUDI_EVENT_DMA0_SERR_ECC ... GAUDI_EVENT_DMA7_SERR_ECC:
	case GAUDI_EVENT_CPU_IF_ECC_SERR:
	case GAUDI_EVENT_PSOC_MEM_SERR:
	case GAUDI_EVENT_PSOC_CORESIGHT_SERR:
	case GAUDI_EVENT_SRAM0_SERR ... GAUDI_EVENT_SRAM28_SERR:
	case GAUDI_EVENT_DMA_IF0_SERR ... GAUDI_EVENT_DMA_IF3_SERR:
	case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR:
		fallthrough;
	case GAUDI_EVENT_MMU_SERR:
		gaudi_print_irq_info(hdev, event_type, true);
		gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
		hl_fw_unmask_irq(hdev, event_type);
		break;

	case GAUDI_EVENT_PCIE_DEC:
	case GAUDI_EVENT_MME0_WBC_RSP:
	case GAUDI_EVENT_MME0_SBAB0_RSP:
	case GAUDI_EVENT_MME1_WBC_RSP:
	case GAUDI_EVENT_MME1_SBAB0_RSP:
	case GAUDI_EVENT_MME2_WBC_RSP:
	case GAUDI_EVENT_MME2_SBAB0_RSP:
	case GAUDI_EVENT_MME3_WBC_RSP:
	case GAUDI_EVENT_MME3_SBAB0_RSP:
	case GAUDI_EVENT_CPU_AXI_SPLITTER:
	case GAUDI_EVENT_PSOC_AXI_DEC:
	case GAUDI_EVENT_PSOC_PRSTN_FALL:
	case GAUDI_EVENT_MMU_PAGE_FAULT:
	case GAUDI_EVENT_MMU_WR_PERM:
	case GAUDI_EVENT_RAZWI_OR_ADC:
	case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
	case GAUDI_EVENT_MME0_QM ... GAUDI_EVENT_MME2_QM:
	case GAUDI_EVENT_DMA0_QM ... GAUDI_EVENT_DMA7_QM:
		fallthrough;
	case GAUDI_EVENT_NIC0_QM0:
	case GAUDI_EVENT_NIC0_QM1:
	case GAUDI_EVENT_NIC1_QM0:
	case GAUDI_EVENT_NIC1_QM1:
	case GAUDI_EVENT_NIC2_QM0:
	case GAUDI_EVENT_NIC2_QM1:
	case GAUDI_EVENT_NIC3_QM0:
	case GAUDI_EVENT_NIC3_QM1:
	case GAUDI_EVENT_NIC4_QM0:
	case GAUDI_EVENT_NIC4_QM1:
	case GAUDI_EVENT_DMA0_CORE ... GAUDI_EVENT_DMA7_CORE:
		gaudi_print_irq_info(hdev, event_type, true);
		gaudi_handle_qman_err(hdev, event_type);
		hl_fw_unmask_irq(hdev, event_type);
		break;

	case GAUDI_EVENT_RAZWI_OR_ADC_SW:
		gaudi_print_irq_info(hdev, event_type, true);
		goto reset_device;

	case GAUDI_EVENT_TPC0_BMON_SPMU:
	case GAUDI_EVENT_TPC1_BMON_SPMU:
	case GAUDI_EVENT_TPC2_BMON_SPMU:
	case GAUDI_EVENT_TPC3_BMON_SPMU:
	case GAUDI_EVENT_TPC4_BMON_SPMU:
	case GAUDI_EVENT_TPC5_BMON_SPMU:
	case GAUDI_EVENT_TPC6_BMON_SPMU:
	case GAUDI_EVENT_TPC7_BMON_SPMU:
	case GAUDI_EVENT_DMA_BM_CH0 ... GAUDI_EVENT_DMA_BM_CH7:
		gaudi_print_irq_info(hdev, event_type, false);
		hl_fw_unmask_irq(hdev, event_type);
		break;

	case GAUDI_EVENT_DMA_IF_SEI_0 ... GAUDI_EVENT_DMA_IF_SEI_3:
		gaudi_print_irq_info(hdev, event_type, false);
		gaudi_print_sm_sei_info(hdev, event_type,
					&eq_entry->sm_sei_data);
		rc = hl_state_dump(hdev);
		if (rc)
			dev_err(hdev->dev,
				"Error during system state dump %d\n", rc);
		hl_fw_unmask_irq(hdev, event_type);
		break;

	case GAUDI_EVENT_FIX_POWER_ENV_S ... GAUDI_EVENT_FIX_THERMAL_ENV_E:
		gaudi_print_clk_change_info(hdev, event_type);
		hl_fw_unmask_irq(hdev, event_type);
		break;

	case GAUDI_EVENT_PSOC_GPIO_U16_0:
		cause = le64_to_cpu(eq_entry->data[0]) & 0xFF;
		dev_err(hdev->dev,
			"Received high temp H/W interrupt %d (cause %d)\n",
			event_type, cause);
		break;

	case GAUDI_EVENT_DEV_RESET_REQ:
		gaudi_print_irq_info(hdev, event_type, false);
		goto reset_device;

	case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC:
		gaudi_print_irq_info(hdev, event_type, false);
		gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
		goto reset_device;

	case GAUDI_EVENT_FW_ALIVE_S:
		gaudi_print_irq_info(hdev, event_type, false);
		gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive);
		goto reset_device;

	default:
		dev_err(hdev->dev, "Received invalid H/W interrupt %d\n",
				event_type);
		break;
	}

	return;

reset_device:
	if (hdev->asic_prop.fw_security_enabled)
		hl_device_reset(hdev, HL_DRV_RESET_HARD
					| HL_DRV_RESET_BYPASS_REQ_TO_FW
					| fw_fatal_err_flag);
	else if (hdev->hard_reset_on_fw_events)
		hl_device_reset(hdev, HL_DRV_RESET_HARD | fw_fatal_err_flag);
	else
		hl_fw_unmask_irq(hdev, event_type);
}