in kernel/eeh_driver.c [836:1109]
/**
 * eeh_handle_normal_event - Try to recover a frozen PE, or remove it
 * @pe: EEH PE the error event was reported against
 *
 * Drives the devices under @pe through a staged recovery, tracking the
 * combined driver verdict in a single pci_ers_result value:
 *
 *  1. Bail out if no PCI bus can be found for @pe, or skip straight to
 *     the final cleanup if none of its devices is still present.
 *  2. Log the event, bump pe->freeze_count and give up once it exceeds
 *     eeh_max_freezes.
 *  3. Report "error_detected(IO frozen)" to the drivers and wait for the
 *     PE state to settle.
 *  4. Depending on the verdict: reset with hotplug (PCI_ERS_RESULT_NONE),
 *     thaw MMIO and then DMA (PCI_ERS_RESULT_CAN_RECOVER), or reset
 *     without hotplug (PCI_ERS_RESULT_NEED_RESET).
 *  5. On PCI_ERS_RESULT_RECOVERED or PCI_ERS_RESULT_NONE, re-add the VFs
 *     collected during the reset and report "resume" to the drivers.
 *  6. Any other verdict, or any failing step, ends up at recover_failed:
 *     the PE is marked EEH_PE_REMOVED and its devices are removed.
 *
 * The final cleanup at "out" runs for the empty-PE case, for a successful
 * recovery and for a failed VF PE. A failed non-VF PE returns right after
 * its devices were removed from the bus, without running that cleanup.
 */
void eeh_handle_normal_event(struct eeh_pe *pe)
{
struct pci_bus *bus;
struct eeh_dev *edev, *tmp;
struct eeh_pe *tmp_pe;
int rc = 0;
/* Combined verdict of the driver callbacks; updated by eeh_pe_report(). */
enum pci_ers_result result = PCI_ERS_RESULT_NONE;
/*
 * On-stack bookkeeping handed to the "reset without hotplug" step; its
 * removed_vf_list is walked after a successful recovery to add VFs back.
 */
struct eeh_rmv_data rmv_data =
{LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
/* Number of devices under @pe that passed the slot presence check. */
int devices = 0;
bus = eeh_pe_bus_get(pe);
if (!bus) {
pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
__func__, pe->phb->global_number, pe->addr);
/* Plain return: none of the cleanup at "out" is run on this path. */
return;
}
/*
 * When devices are hot-removed we might get an EEH due to
 * a driver attempting to touch the MMIO space of a removed
 * device. In this case we don't have a device to recover
 * so suppress the event if we can't find any present devices.
 *
 * The hotplug driver should take care of tearing down the
 * device itself.
 */
eeh_for_each_pe(pe, tmp_pe)
eeh_pe_for_each_dev(tmp_pe, edev, tmp)
if (eeh_slot_presence_check(edev->pdev))
devices++;
if (!devices) {
pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
pe->phb->global_number, pe->addr);
goto out; /* nothing to recover */
}
/* Log the event */
if (pe->type & EEH_PE_PHB) {
pr_err("EEH: Recovering PHB#%x, location: %s\n",
pe->phb->global_number, eeh_pe_loc_get(pe));
} else {
/* For a non-PHB PE, also print the location of its parent PHB. */
struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);
pr_err("EEH: Recovering PHB#%x-PE#%x\n",
pe->phb->global_number, pe->addr);
pr_err("EEH: PE location: %s, PHB location: %s\n",
eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
}
#ifdef CONFIG_STACKTRACE
/*
 * Print the saved stack trace now that we've verified there's
 * something to recover.
 */
if (pe->trace_entries) {
void **ptrs = (void **) pe->stack_trace;
int i;
pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
pe->phb->global_number, pe->addr);
/* FIXME: Use the same format as dump_stack() */
pr_err("EEH: Call Trace:\n");
for (i = 0; i < pe->trace_entries; i++)
pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]);
/* Zero the count so the same trace is not printed again. */
pe->trace_entries = 0;
}
#endif /* CONFIG_STACKTRACE */
/* Clear the EEH_DEV_NO_HANDLER flag on every device before reporting. */
eeh_for_each_pe(pe, tmp_pe)
eeh_pe_for_each_dev(tmp_pe, edev, tmp)
edev->mode &= ~EEH_DEV_NO_HANDLER;
/*
 * Account for this freeze. Once the count exceeds eeh_max_freezes no
 * recovery is attempted and the PE goes straight to permanent failure.
 */
eeh_pe_update_time_stamp(pe);
pe->freeze_count++;
if (pe->freeze_count > eeh_max_freezes) {
pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
pe->phb->global_number, pe->addr,
pe->freeze_count);
goto recover_failed;
}
/* Walk the various device drivers attached to this slot through
 * a reset sequence, giving each an opportunity to do what it needs
 * to accomplish the reset. Each child gets a report of the
 * status ... if any child can't handle the reset, then the entire
 * slot is dlpar removed and added.
 *
 * When the PHB is fenced, we have to issue a reset to recover from
 * the error. Override the result if necessary to have partially
 * hotplug for this case.
 */
pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
pe->freeze_count, eeh_max_freezes);
pr_info("EEH: Notify device drivers to shutdown\n");
/*
 * Mark the channel frozen and set the IRQ state to false before the
 * drivers are told about the error.
 * NOTE(review): presumably "false" disables the devices' interrupts;
 * confirm against eeh_set_irq_state().
 */
eeh_set_channel_state(pe, pci_channel_io_frozen);
eeh_set_irq_state(pe, false);
eeh_pe_report("error_detected(IO frozen)", pe,
eeh_report_error, &result);
/* A disconnect verdict means recovery is abandoned right away. */
if (result == PCI_ERS_RESULT_DISCONNECT)
goto recover_failed;
/*
 * Errors logged on a PHB are always fences which need a full
 * PHB reset to clear so force that to happen.
 *
 * PCI_ERS_RESULT_NONE is left alone so that the "reset with hotplug"
 * path below is still taken in that case.
 */
if ((pe->type & EEH_PE_PHB) && result != PCI_ERS_RESULT_NONE)
result = PCI_ERS_RESULT_NEED_RESET;
/* Get the current PCI slot state. This can take a long time,
 * sometimes over 300 seconds for certain systems.
 */
rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY * 1000);
if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
pr_warn("EEH: Permanent failure\n");
goto recover_failed;
}
/* Since rtas may enable MMIO when posting the error log,
 * don't post the error log until after all dev drivers
 * have been informed.
 */
pr_info("EEH: Collect temporary log\n");
eeh_slot_error_detail(pe, EEH_LOG_TEMP);
/* If all device drivers were EEH-unaware, then shut
 * down all of the device drivers, and hope they
 * go down willingly, without panicking the system.
 *
 * result is left at PCI_ERS_RESULT_NONE on success, so the thaw and
 * "reset without hotplug" steps below are skipped and control reaches
 * the recovery-successful block.
 */
if (result == PCI_ERS_RESULT_NONE) {
pr_info("EEH: Reset with hotplug activity\n");
rc = eeh_reset_device(pe, bus, NULL, false);
if (rc) {
pr_warn("%s: Unable to reset, err=%d\n", __func__, rc);
goto recover_failed;
}
}
/* If all devices reported they can proceed, then re-enable MMIO */
if (result == PCI_ERS_RESULT_CAN_RECOVER) {
pr_info("EEH: Enable I/O for affected devices\n");
rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
/* Negative rc: give up. Positive rc: escalate to a reset instead. */
if (rc < 0)
goto recover_failed;
if (rc) {
result = PCI_ERS_RESULT_NEED_RESET;
} else {
pr_info("EEH: Notify device drivers to resume I/O\n");
/* The drivers' answers here may change result again. */
eeh_pe_report("mmio_enabled", pe,
eeh_report_mmio_enabled, &result);
}
}
/* If the verdict is still "can recover", then re-enable DMA as well */
if (result == PCI_ERS_RESULT_CAN_RECOVER) {
pr_info("EEH: Enabled DMA for affected devices\n");
rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
/* Same rc convention as for the MMIO thaw above. */
if (rc < 0)
goto recover_failed;
if (rc) {
result = PCI_ERS_RESULT_NEED_RESET;
} else {
/*
 * We didn't do PE reset for the case. The PE
 * is still in frozen state. Clear it before
 * resuming the PE.
 */
eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
result = PCI_ERS_RESULT_RECOVERED;
}
}
/* If any device called out for a reset, then reset the slot */
if (result == PCI_ERS_RESULT_NEED_RESET) {
pr_info("EEH: Reset without hotplug activity\n");
rc = eeh_reset_device(pe, bus, &rmv_data, true);
if (rc) {
pr_warn("%s: Cannot reset, err=%d\n", __func__, rc);
goto recover_failed;
}
/*
 * Start from a clean verdict so that only the answers to the
 * "slot_reset" report decide whether recovery succeeded.
 */
result = PCI_ERS_RESULT_NONE;
eeh_set_channel_state(pe, pci_channel_io_normal);
eeh_set_irq_state(pe, true);
eeh_pe_report("slot_reset", pe, eeh_report_reset,
&result);
}
/*
 * Recovery succeeded if the drivers report "recovered" or expressed no
 * opinion at all. Any other verdict falls through to recover_failed.
 */
if ((result == PCI_ERS_RESULT_RECOVERED) ||
(result == PCI_ERS_RESULT_NONE)) {
/*
 * For those hot removed VFs, we should add back them after PF
 * get recovered properly.
 */
list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list,
rmv_entry) {
eeh_add_virt_device(edev);
list_del(&edev->rmv_entry);
}
/* Tell all device drivers that they can resume operations */
pr_info("EEH: Notify device driver to resume\n");
eeh_set_channel_state(pe, pci_channel_io_normal);
eeh_set_irq_state(pe, true);
eeh_pe_report("resume", pe, eeh_report_resume, NULL);
/* Drop the per-device error markers now that the PE is usable again. */
eeh_for_each_pe(pe, tmp_pe) {
eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
edev->mode &= ~EEH_DEV_NO_HANDLER;
edev->in_error = false;
}
}
pr_info("EEH: Recovery successful.\n");
goto out;
}
recover_failed:
/*
 * About 90% of all real-life EEH failures in the field
 * are due to poorly seated PCI cards. Only 10% or so are
 * due to actual, failed cards.
 */
pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
"Please try reseating or replacing it\n",
pe->phb->global_number, pe->addr);
eeh_slot_error_detail(pe, EEH_LOG_PERM);
/* Notify all devices that they're about to go down. */
eeh_set_channel_state(pe, pci_channel_io_perm_failure);
eeh_set_irq_state(pe, false);
eeh_pe_report("error_detected(permanent failure)", pe,
eeh_report_failure, NULL);
/* Mark the PE to be removed permanently */
eeh_pe_state_mark(pe, EEH_PE_REMOVED);
/*
 * Shut down the device drivers for good. We mark
 * all removed devices correctly to avoid accessing
 * their PCI config any more.
 */
if (pe->type & EEH_PE_VF) {
/*
 * VF PE: remove each device through eeh_rmv_device(), then fall
 * through to the common cleanup at "out".
 */
eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
} else {
eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
/* Device removal from the bus is done under the rescan/remove lock. */
pci_lock_rescan_remove();
pci_hp_remove_devices(bus);
pci_unlock_rescan_remove();
/* The passed PE should no longer be used */
return;
}
out:
/*
 * Clean up any PEs without devices. While marked as EEH_PE_RECOVERING
 * we don't want to modify the PE tree structure so we do it here.
 */
eeh_pe_cleanup(pe);
/* clear the slot attention LED for all recovered devices */
eeh_for_each_pe(pe, tmp_pe)
eeh_pe_for_each_dev(tmp_pe, edev, tmp)
eeh_clear_slot_attention(edev->pdev);
/* Finally drop the EEH_PE_RECOVERING state from the PE. */
eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
}