in habanalabs/gaudi/gaudi.c [8065:8320]
static void gaudi_handle_eqe(struct hl_device *hdev,
struct hl_eq_entry *eq_entry)
{
struct gaudi_device *gaudi = hdev->asic_specific;
u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
u32 fw_fatal_err_flag = 0;
u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
>> EQ_CTL_EVENT_TYPE_SHIFT);
bool reset_required;
u8 cause;
int rc;
if (event_type >= GAUDI_EVENT_SIZE) {
dev_err(hdev->dev, "Event type %u exceeds maximum of %u",
event_type, GAUDI_EVENT_SIZE - 1);
return;
}
gaudi->events_stat[event_type]++;
gaudi->events_stat_aggregate[event_type]++;
switch (event_type) {
case GAUDI_EVENT_PCIE_CORE_DERR:
case GAUDI_EVENT_PCIE_IF_DERR:
case GAUDI_EVENT_PCIE_PHY_DERR:
case GAUDI_EVENT_TPC0_DERR ... GAUDI_EVENT_TPC7_DERR:
case GAUDI_EVENT_MME0_ACC_DERR:
case GAUDI_EVENT_MME0_SBAB_DERR:
case GAUDI_EVENT_MME1_ACC_DERR:
case GAUDI_EVENT_MME1_SBAB_DERR:
case GAUDI_EVENT_MME2_ACC_DERR:
case GAUDI_EVENT_MME2_SBAB_DERR:
case GAUDI_EVENT_MME3_ACC_DERR:
case GAUDI_EVENT_MME3_SBAB_DERR:
case GAUDI_EVENT_DMA0_DERR_ECC ... GAUDI_EVENT_DMA7_DERR_ECC:
fallthrough;
case GAUDI_EVENT_CPU_IF_ECC_DERR:
case GAUDI_EVENT_PSOC_MEM_DERR:
case GAUDI_EVENT_PSOC_CORESIGHT_DERR:
case GAUDI_EVENT_SRAM0_DERR ... GAUDI_EVENT_SRAM28_DERR:
case GAUDI_EVENT_DMA_IF0_DERR ... GAUDI_EVENT_DMA_IF3_DERR:
case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR:
case GAUDI_EVENT_MMU_DERR:
case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
goto reset_device;
case GAUDI_EVENT_GIC500:
case GAUDI_EVENT_AXI_ECC:
case GAUDI_EVENT_L2_RAM_ECC:
case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
gaudi_print_irq_info(hdev, event_type, false);
fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
goto reset_device;
case GAUDI_EVENT_HBM0_SPI_0:
case GAUDI_EVENT_HBM1_SPI_0:
case GAUDI_EVENT_HBM2_SPI_0:
case GAUDI_EVENT_HBM3_SPI_0:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_hbm_read_interrupts(hdev,
gaudi_hbm_event_to_dev(event_type),
&eq_entry->hbm_ecc_data);
fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
goto reset_device;
case GAUDI_EVENT_HBM0_SPI_1:
case GAUDI_EVENT_HBM1_SPI_1:
case GAUDI_EVENT_HBM2_SPI_1:
case GAUDI_EVENT_HBM3_SPI_1:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_hbm_read_interrupts(hdev,
gaudi_hbm_event_to_dev(event_type),
&eq_entry->hbm_ecc_data);
hl_fw_unmask_irq(hdev, event_type);
break;
case GAUDI_EVENT_TPC0_DEC:
case GAUDI_EVENT_TPC1_DEC:
case GAUDI_EVENT_TPC2_DEC:
case GAUDI_EVENT_TPC3_DEC:
case GAUDI_EVENT_TPC4_DEC:
case GAUDI_EVENT_TPC5_DEC:
case GAUDI_EVENT_TPC6_DEC:
case GAUDI_EVENT_TPC7_DEC:
gaudi_print_irq_info(hdev, event_type, true);
reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_dec_event_to_tpc_id(event_type),
"AXI_SLV_DEC_Error");
if (reset_required) {
dev_err(hdev->dev, "reset required due to %s\n",
gaudi_irq_map_table[event_type].name);
hl_device_reset(hdev, 0);
} else {
hl_fw_unmask_irq(hdev, event_type);
}
break;
case GAUDI_EVENT_TPC0_KRN_ERR:
case GAUDI_EVENT_TPC1_KRN_ERR:
case GAUDI_EVENT_TPC2_KRN_ERR:
case GAUDI_EVENT_TPC3_KRN_ERR:
case GAUDI_EVENT_TPC4_KRN_ERR:
case GAUDI_EVENT_TPC5_KRN_ERR:
case GAUDI_EVENT_TPC6_KRN_ERR:
case GAUDI_EVENT_TPC7_KRN_ERR:
gaudi_print_irq_info(hdev, event_type, true);
reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_krn_event_to_tpc_id(event_type),
"KRN_ERR");
if (reset_required) {
dev_err(hdev->dev, "reset required due to %s\n",
gaudi_irq_map_table[event_type].name);
hl_device_reset(hdev, 0);
} else {
hl_fw_unmask_irq(hdev, event_type);
}
break;
case GAUDI_EVENT_PCIE_CORE_SERR:
case GAUDI_EVENT_PCIE_IF_SERR:
case GAUDI_EVENT_PCIE_PHY_SERR:
case GAUDI_EVENT_TPC0_SERR ... GAUDI_EVENT_TPC7_SERR:
case GAUDI_EVENT_MME0_ACC_SERR:
case GAUDI_EVENT_MME0_SBAB_SERR:
case GAUDI_EVENT_MME1_ACC_SERR:
case GAUDI_EVENT_MME1_SBAB_SERR:
case GAUDI_EVENT_MME2_ACC_SERR:
case GAUDI_EVENT_MME2_SBAB_SERR:
case GAUDI_EVENT_MME3_ACC_SERR:
case GAUDI_EVENT_MME3_SBAB_SERR:
case GAUDI_EVENT_DMA0_SERR_ECC ... GAUDI_EVENT_DMA7_SERR_ECC:
case GAUDI_EVENT_CPU_IF_ECC_SERR:
case GAUDI_EVENT_PSOC_MEM_SERR:
case GAUDI_EVENT_PSOC_CORESIGHT_SERR:
case GAUDI_EVENT_SRAM0_SERR ... GAUDI_EVENT_SRAM28_SERR:
case GAUDI_EVENT_DMA_IF0_SERR ... GAUDI_EVENT_DMA_IF3_SERR:
case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR:
fallthrough;
case GAUDI_EVENT_MMU_SERR:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
hl_fw_unmask_irq(hdev, event_type);
break;
case GAUDI_EVENT_PCIE_DEC:
case GAUDI_EVENT_MME0_WBC_RSP:
case GAUDI_EVENT_MME0_SBAB0_RSP:
case GAUDI_EVENT_MME1_WBC_RSP:
case GAUDI_EVENT_MME1_SBAB0_RSP:
case GAUDI_EVENT_MME2_WBC_RSP:
case GAUDI_EVENT_MME2_SBAB0_RSP:
case GAUDI_EVENT_MME3_WBC_RSP:
case GAUDI_EVENT_MME3_SBAB0_RSP:
case GAUDI_EVENT_CPU_AXI_SPLITTER:
case GAUDI_EVENT_PSOC_AXI_DEC:
case GAUDI_EVENT_PSOC_PRSTN_FALL:
case GAUDI_EVENT_MMU_PAGE_FAULT:
case GAUDI_EVENT_MMU_WR_PERM:
case GAUDI_EVENT_RAZWI_OR_ADC:
case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
case GAUDI_EVENT_MME0_QM ... GAUDI_EVENT_MME2_QM:
case GAUDI_EVENT_DMA0_QM ... GAUDI_EVENT_DMA7_QM:
fallthrough;
case GAUDI_EVENT_NIC0_QM0:
case GAUDI_EVENT_NIC0_QM1:
case GAUDI_EVENT_NIC1_QM0:
case GAUDI_EVENT_NIC1_QM1:
case GAUDI_EVENT_NIC2_QM0:
case GAUDI_EVENT_NIC2_QM1:
case GAUDI_EVENT_NIC3_QM0:
case GAUDI_EVENT_NIC3_QM1:
case GAUDI_EVENT_NIC4_QM0:
case GAUDI_EVENT_NIC4_QM1:
case GAUDI_EVENT_DMA0_CORE ... GAUDI_EVENT_DMA7_CORE:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_handle_qman_err(hdev, event_type);
hl_fw_unmask_irq(hdev, event_type);
break;
case GAUDI_EVENT_RAZWI_OR_ADC_SW:
gaudi_print_irq_info(hdev, event_type, true);
goto reset_device;
case GAUDI_EVENT_TPC0_BMON_SPMU:
case GAUDI_EVENT_TPC1_BMON_SPMU:
case GAUDI_EVENT_TPC2_BMON_SPMU:
case GAUDI_EVENT_TPC3_BMON_SPMU:
case GAUDI_EVENT_TPC4_BMON_SPMU:
case GAUDI_EVENT_TPC5_BMON_SPMU:
case GAUDI_EVENT_TPC6_BMON_SPMU:
case GAUDI_EVENT_TPC7_BMON_SPMU:
case GAUDI_EVENT_DMA_BM_CH0 ... GAUDI_EVENT_DMA_BM_CH7:
gaudi_print_irq_info(hdev, event_type, false);
hl_fw_unmask_irq(hdev, event_type);
break;
case GAUDI_EVENT_DMA_IF_SEI_0 ... GAUDI_EVENT_DMA_IF_SEI_3:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_print_sm_sei_info(hdev, event_type,
&eq_entry->sm_sei_data);
rc = hl_state_dump(hdev);
if (rc)
dev_err(hdev->dev,
"Error during system state dump %d\n", rc);
hl_fw_unmask_irq(hdev, event_type);
break;
case GAUDI_EVENT_FIX_POWER_ENV_S ... GAUDI_EVENT_FIX_THERMAL_ENV_E:
gaudi_print_clk_change_info(hdev, event_type);
hl_fw_unmask_irq(hdev, event_type);
break;
case GAUDI_EVENT_PSOC_GPIO_U16_0:
cause = le64_to_cpu(eq_entry->data[0]) & 0xFF;
dev_err(hdev->dev,
"Received high temp H/W interrupt %d (cause %d)\n",
event_type, cause);
break;
case GAUDI_EVENT_DEV_RESET_REQ:
gaudi_print_irq_info(hdev, event_type, false);
goto reset_device;
case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
goto reset_device;
case GAUDI_EVENT_FW_ALIVE_S:
gaudi_print_irq_info(hdev, event_type, false);
gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive);
goto reset_device;
default:
dev_err(hdev->dev, "Received invalid H/W interrupt %d\n",
event_type);
break;
}
return;
reset_device:
if (hdev->asic_prop.fw_security_enabled)
hl_device_reset(hdev, HL_DRV_RESET_HARD
| HL_DRV_RESET_BYPASS_REQ_TO_FW
| fw_fatal_err_flag);
else if (hdev->hard_reset_on_fw_events)
hl_device_reset(hdev, HL_DRV_RESET_HARD | fw_fatal_err_flag);
else
hl_fw_unmask_irq(hdev, event_type);
}