int kvmppc_book3s_hv_page_fault()

in kvm/book3s_64_mmu_hv.c [483:731]
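
Handles a hypervisor page fault for an HPT (hashed-page-table) guest:
re-validates the HPTE found by the real-mode handler, translates the
faulting guest physical address, faults in the backing host page if
necessary, and rewrites the HPTE to point at it; accesses with no
backing memslot are handed off to MMIO emulation.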

int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
				unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long hpte[3], r;
	unsigned long hnow_v, hnow_r;
	__be64 *hptep;
	unsigned long mmu_seq, psize, pte_size;
	unsigned long gpa_base, gfn_base;
	unsigned long gpa, gfn, hva, pfn, hpa;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	struct revmap_entry *rev;
	struct page *page;
	long index, ret;
	bool is_ci;
	bool writing, write_ok;
	unsigned int shift;
	unsigned long rcbits;
	long mmio_update;
	pte_t pte, *ptep;

	if (kvm_is_radix(kvm))
		return kvmppc_book3s_radix_page_fault(vcpu, ea, dsisr);

	/*
	 * Real-mode code has already searched the HPT and found the
	 * entry we're interested in.  Lock the entry and check that
	 * it hasn't changed.  If it has, just return and re-execute the
	 * instruction.
	 */
	if (ea != vcpu->arch.pgfault_addr)
		return RESUME_GUEST;

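	/*
	 * Fast path: if this address was previously found to map emulated
	 * MMIO and the cached translation is still current (the MMIO
	 * generation count hasn't changed), go straight to MMIO emulation.
	 */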
	if (vcpu->arch.pgfault_cache) {
		mmio_update = atomic64_read(&kvm->arch.mmio_update);
		if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
			r = vcpu->arch.pgfault_cache->rpte;
			psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0],
						   r);
			gpa_base = r & HPTE_R_RPN & ~(psize - 1);
			gfn_base = gpa_base >> PAGE_SHIFT;
			gpa = gpa_base | (ea & (psize - 1));
			return kvmppc_hv_emulate_mmio(vcpu, gpa, ea,
						dsisr & DSISR_ISSTORE);
		}
	}
	index = vcpu->arch.pgfault_index;
	hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
	rev = &kvm->arch.hpt.rev[index];
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
	hpte[1] = be64_to_cpu(hptep[1]);
	hpte[2] = r = rev->guest_rpte;
	unlock_hpte(hptep, hpte[0]);
	preempt_enable();

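	/*
	 * POWER9 (ISA v3.0) uses a new in-memory HPTE layout; convert to
	 * the old layout so the comparison and size calculations below
	 * work the same on all CPUs.
	 */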
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
		hpte[1] = hpte_new_to_old_r(hpte[1]);
	}
	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
	    hpte[1] != vcpu->arch.pgfault_hpte[1])
		return RESUME_GUEST;

	/* Translate the logical address and get the page */
	psize = kvmppc_actual_pgsz(hpte[0], r);
	gpa_base = r & HPTE_R_RPN & ~(psize - 1);
	gfn_base = gpa_base >> PAGE_SHIFT;
	gpa = gpa_base | (ea & (psize - 1));
	gfn = gpa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(kvm, gfn);

	trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea,
					      dsisr & DSISR_ISSTORE);

	/*
	 * This should never happen, because of the slot_is_aligned()
	 * check in kvmppc_do_h_enter().
	 */
	if (gfn_base < memslot->base_gfn)
		return -EFAULT;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
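	/* Order the mmu_notifier_seq read before the page lookup below */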
	smp_rmb();

	ret = -EFAULT;
	page = NULL;
	writing = (dsisr & DSISR_ISSTORE) != 0;
	/* For a write fault, the HPTE must allow writing if we got this far */
	write_ok = writing;
	hva = gfn_to_hva_memslot(memslot, gfn);

	/*
	 * Try a fast gup first, since __gfn_to_pfn_memslot doesn't do
	 * one when called with !atomic && !async, which is how we call it.
	 * We always ask for write permission since the common case
	 * is that the page is writable.
	 */
	if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
		write_ok = true;
	} else {
		/* Call KVM generic code to do the slow-path check */
		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
					   writing, &write_ok, NULL);
		if (is_error_noslot_pfn(pfn))
			return -EFAULT;
		page = NULL;
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageReserved(page))
				page = NULL;
		}
	}

	/*
	 * Read the PTE from the process' radix tree and use that
	 * so we get the shift and attribute bits.
	 */
	spin_lock(&kvm->mmu_lock);
	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
	pte = __pte(0);
	if (ptep)
		pte = READ_ONCE(*ptep);
	spin_unlock(&kvm->mmu_lock);
	/*
	 * If the PTE disappeared temporarily due to a THP
	 * collapse, just return and let the guest try again.
	 */
	if (!pte_present(pte)) {
		if (page)
			put_page(page);
		return RESUME_GUEST;
	}
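	/* Compute the host physical address and host page size from the PTE */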
	hpa = pte_pfn(pte) << PAGE_SHIFT;
	pte_size = PAGE_SIZE;
	if (shift)
		pte_size = 1ul << shift;
	is_ci = pte_ci(pte);

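	/*
	 * The host page must be at least as large as the guest page;
	 * if it is larger, pick the correct sub-page within it.
	 */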
	if (psize > pte_size)
		goto out_put;
	if (pte_size > psize)
		hpa |= hva & (pte_size - psize);

	/* Check WIMG vs. the actual page we're accessing */
	if (!hpte_cache_flags_ok(r, is_ci)) {
		if (is_ci)
			goto out_put;
		/*
		 * Allow guest to map emulated device memory as
		 * uncacheable, but actually make it cacheable.
		 */
		r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
	}

	/*
	 * Set the HPTE to point to hpa.
	 * Since the hpa is at PAGE_SIZE granularity, make sure we
	 * don't mask out lower-order bits if psize < PAGE_SIZE.
	 */
	if (psize < PAGE_SIZE)
		psize = PAGE_SIZE;
	r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | hpa;
	if (hpte_is_writable(r) && !write_ok)
		r = hpte_make_readonly(r);
	ret = RESUME_GUEST;
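	/* Re-lock the HPTE and re-read it to check for concurrent changes */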
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	hnow_v = be64_to_cpu(hptep[0]);
	hnow_r = be64_to_cpu(hptep[1]);
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		hnow_v = hpte_new_to_old_v(hnow_v, hnow_r);
		hnow_r = hpte_new_to_old_r(hnow_r);
	}

	/*
	 * If the HPT is being resized, don't update the HPTE,
	 * instead let the guest retry after the resize operation is complete.
	 * The synchronization for mmu_ready test vs. set is provided
	 * by the HPTE lock.
	 */
	if (!kvm->arch.mmu_ready)
		goto out_unlock;

	if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
	    rev->guest_rpte != hpte[2])
		/* HPTE has been changed under us; let the guest retry */
		goto out_unlock;
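	/* All checks passed: mark the HPTE valid rather than absent */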
	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;

	/* Always put the HPTE in the rmap chain for the page base address */
	rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
	lock_rmap(rmap);

	/* Check if we might have been invalidated; let the guest retry if so */
	ret = RESUME_GUEST;
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
		unlock_rmap(rmap);
		goto out_unlock;
	}

	/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
	rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
	r &= rcbits | ~(HPTE_R_R | HPTE_R_C);

	if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
		/* HPTE was previously valid, so we need to invalidate it */
		unlock_rmap(rmap);
		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
		kvmppc_invalidate_hpte(kvm, hptep, index);
		/* don't lose previous R and C bits */
		r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
	} else {
		kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
	}

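	/* Convert back to the new HPTE layout on POWER9 before writing */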
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		r = hpte_old_to_new_r(hpte[0], r);
		hpte[0] = hpte_old_to_new_v(hpte[0]);
	}
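	/*
	 * Store the second doubleword, then unlock, which publishes the
	 * first doubleword; eieio/ptesync order the HPTE update against
	 * subsequent accesses.
	 */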
	hptep[1] = cpu_to_be64(r);
	eieio();
	__unlock_hpte(hptep, hpte[0]);
	asm volatile("ptesync" : : : "memory");
	preempt_enable();
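	/* If the mapping is writable, the guest may dirty the page */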
	if (page && hpte_is_writable(r))
		set_page_dirty_lock(page);

 out_put:
	trace_kvm_page_fault_exit(vcpu, hpte, ret);

	if (page)
		put_page(page);
	return ret;

 out_unlock:
	__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
	preempt_enable();
	goto out_put;
}