void eeh_handle_normal_event()

in kernel/eeh_driver.c [836:1109]


void eeh_handle_normal_event(struct eeh_pe *pe)
{
	struct pci_bus *bus;
	struct eeh_dev *edev, *tmp;
	struct eeh_pe *tmp_pe;
	int rc = 0;
	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
	struct eeh_rmv_data rmv_data =
		{LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
	int devices = 0;

	bus = eeh_pe_bus_get(pe);
	if (!bus) {
		pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
			__func__, pe->phb->global_number, pe->addr);
		return;
	}

	/*
	 * When devices are hot-removed we might get an EEH due to
	 * a driver attempting to touch the MMIO space of a removed
	 * device. In this case we don't have a device to recover
	 * so suppress the event if we can't find any present devices.
	 *
	 * The hotplug driver should take care of tearing down the
	 * device itself.
	 */
	eeh_for_each_pe(pe, tmp_pe)
		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
			if (eeh_slot_presence_check(edev->pdev))
				devices++;

	if (!devices) {
		pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
			pe->phb->global_number, pe->addr);
		goto out; /* nothing to recover */
	}

	/* Log the event */
	if (pe->type & EEH_PE_PHB) {
		pr_err("EEH: Recovering PHB#%x, location: %s\n",
			pe->phb->global_number, eeh_pe_loc_get(pe));
	} else {
		struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);

		pr_err("EEH: Recovering PHB#%x-PE#%x\n",
		       pe->phb->global_number, pe->addr);
		pr_err("EEH: PE location: %s, PHB location: %s\n",
		       eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
	}

#ifdef CONFIG_STACKTRACE
	/*
	 * Print the saved stack trace now that we've verified there's
	 * something to recover.
	 */
	if (pe->trace_entries) {
		void **ptrs = (void **) pe->stack_trace;
		int i;

		pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
		       pe->phb->global_number, pe->addr);

		/* FIXME: Use the same format as dump_stack() */
		pr_err("EEH: Call Trace:\n");
		for (i = 0; i < pe->trace_entries; i++)
			pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]);

		pe->trace_entries = 0;
	}
#endif /* CONFIG_STACKTRACE */

	eeh_for_each_pe(pe, tmp_pe)
		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
			edev->mode &= ~EEH_DEV_NO_HANDLER;

	eeh_pe_update_time_stamp(pe);
	pe->freeze_count++;
	if (pe->freeze_count > eeh_max_freezes) {
		pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
		       pe->phb->global_number, pe->addr,
		       pe->freeze_count);

		goto recover_failed;
	}

	/* Walk the various device drivers attached to this slot through
	 * a reset sequence, giving each an opportunity to do what it needs
	 * to accomplish the reset.  Each child gets a report of the
	 * status ... if any child can't handle the reset, then the entire
	 * slot is dlpar removed and added.
	 *
	 * When the PHB is fenced, we have to issue a reset to recover from
	 * the error. Override the result if necessary to have partially
	 * hotplug for this case.
	 */
	pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
		pe->freeze_count, eeh_max_freezes);
	pr_info("EEH: Notify device drivers to shutdown\n");
	eeh_set_channel_state(pe, pci_channel_io_frozen);
	eeh_set_irq_state(pe, false);
	eeh_pe_report("error_detected(IO frozen)", pe,
		      eeh_report_error, &result);
	if (result == PCI_ERS_RESULT_DISCONNECT)
		goto recover_failed;

	/*
	 * Error logged on a PHB are always fences which need a full
	 * PHB reset to clear so force that to happen.
	 */
	if ((pe->type & EEH_PE_PHB) && result != PCI_ERS_RESULT_NONE)
		result = PCI_ERS_RESULT_NEED_RESET;

	/* Get the current PCI slot state. This can take a long time,
	 * sometimes over 300 seconds for certain systems.
	 */
	rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY * 1000);
	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
		pr_warn("EEH: Permanent failure\n");
		goto recover_failed;
	}

	/* Since rtas may enable MMIO when posting the error log,
	 * don't post the error log until after all dev drivers
	 * have been informed.
	 */
	pr_info("EEH: Collect temporary log\n");
	eeh_slot_error_detail(pe, EEH_LOG_TEMP);

	/* If all device drivers were EEH-unaware, then shut
	 * down all of the device drivers, and hope they
	 * go down willingly, without panicing the system.
	 */
	if (result == PCI_ERS_RESULT_NONE) {
		pr_info("EEH: Reset with hotplug activity\n");
		rc = eeh_reset_device(pe, bus, NULL, false);
		if (rc) {
			pr_warn("%s: Unable to reset, err=%d\n", __func__, rc);
			goto recover_failed;
		}
	}

	/* If all devices reported they can proceed, then re-enable MMIO */
	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
		pr_info("EEH: Enable I/O for affected devices\n");
		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
		if (rc < 0)
			goto recover_failed;

		if (rc) {
			result = PCI_ERS_RESULT_NEED_RESET;
		} else {
			pr_info("EEH: Notify device drivers to resume I/O\n");
			eeh_pe_report("mmio_enabled", pe,
				      eeh_report_mmio_enabled, &result);
		}
	}
	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
		pr_info("EEH: Enabled DMA for affected devices\n");
		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
		if (rc < 0)
			goto recover_failed;

		if (rc) {
			result = PCI_ERS_RESULT_NEED_RESET;
		} else {
			/*
			 * We didn't do PE reset for the case. The PE
			 * is still in frozen state. Clear it before
			 * resuming the PE.
			 */
			eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
			result = PCI_ERS_RESULT_RECOVERED;
		}
	}

	/* If any device called out for a reset, then reset the slot */
	if (result == PCI_ERS_RESULT_NEED_RESET) {
		pr_info("EEH: Reset without hotplug activity\n");
		rc = eeh_reset_device(pe, bus, &rmv_data, true);
		if (rc) {
			pr_warn("%s: Cannot reset, err=%d\n", __func__, rc);
			goto recover_failed;
		}

		result = PCI_ERS_RESULT_NONE;
		eeh_set_channel_state(pe, pci_channel_io_normal);
		eeh_set_irq_state(pe, true);
		eeh_pe_report("slot_reset", pe, eeh_report_reset,
			      &result);
	}

	if ((result == PCI_ERS_RESULT_RECOVERED) ||
	    (result == PCI_ERS_RESULT_NONE)) {
		/*
		 * For those hot removed VFs, we should add back them after PF
		 * get recovered properly.
		 */
		list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list,
					 rmv_entry) {
			eeh_add_virt_device(edev);
			list_del(&edev->rmv_entry);
		}

		/* Tell all device drivers that they can resume operations */
		pr_info("EEH: Notify device driver to resume\n");
		eeh_set_channel_state(pe, pci_channel_io_normal);
		eeh_set_irq_state(pe, true);
		eeh_pe_report("resume", pe, eeh_report_resume, NULL);
		eeh_for_each_pe(pe, tmp_pe) {
			eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
				edev->mode &= ~EEH_DEV_NO_HANDLER;
				edev->in_error = false;
			}
		}

		pr_info("EEH: Recovery successful.\n");
		goto out;
	}

recover_failed:
	/*
	 * About 90% of all real-life EEH failures in the field
	 * are due to poorly seated PCI cards. Only 10% or so are
	 * due to actual, failed cards.
	 */
	pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
		"Please try reseating or replacing it\n",
		pe->phb->global_number, pe->addr);

	eeh_slot_error_detail(pe, EEH_LOG_PERM);

	/* Notify all devices that they're about to go down. */
	eeh_set_channel_state(pe, pci_channel_io_perm_failure);
	eeh_set_irq_state(pe, false);
	eeh_pe_report("error_detected(permanent failure)", pe,
		      eeh_report_failure, NULL);

	/* Mark the PE to be removed permanently */
	eeh_pe_state_mark(pe, EEH_PE_REMOVED);

	/*
	 * Shut down the device drivers for good. We mark
	 * all removed devices correctly to avoid access
	 * the their PCI config any more.
	 */
	if (pe->type & EEH_PE_VF) {
		eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
		eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
	} else {
		eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
		eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);

		pci_lock_rescan_remove();
		pci_hp_remove_devices(bus);
		pci_unlock_rescan_remove();
		/* The passed PE should no longer be used */
		return;
	}

out:
	/*
	 * Clean up any PEs without devices. While marked as EEH_PE_RECOVERYING
	 * we don't want to modify the PE tree structure so we do it here.
	 */
	eeh_pe_cleanup(pe);

	/* clear the slot attention LED for all recovered devices */
	eeh_for_each_pe(pe, tmp_pe)
		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
			eeh_clear_slot_attention(edev->pdev);

	eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
}