in kvm/book3s_64_mmu_hv.c [483:731]
int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
unsigned long ea, unsigned long dsisr)
{
struct kvm *kvm = vcpu->kvm;
unsigned long hpte[3], r;
unsigned long hnow_v, hnow_r;
__be64 *hptep;
unsigned long mmu_seq, psize, pte_size;
unsigned long gpa_base, gfn_base;
unsigned long gpa, gfn, hva, pfn, hpa;
struct kvm_memory_slot *memslot;
unsigned long *rmap;
struct revmap_entry *rev;
struct page *page;
long index, ret;
bool is_ci;
bool writing, write_ok;
unsigned int shift;
unsigned long rcbits;
long mmio_update;
pte_t pte, *ptep;
if (kvm_is_radix(kvm))
return kvmppc_book3s_radix_page_fault(vcpu, ea, dsisr);
/*
* Real-mode code has already searched the HPT and found the
* entry we're interested in. Lock the entry and check that
* it hasn't changed. If it has, just return and re-execute the
* instruction.
*/
if (ea != vcpu->arch.pgfault_addr)
return RESUME_GUEST;
if (vcpu->arch.pgfault_cache) {
mmio_update = atomic64_read(&kvm->arch.mmio_update);
if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
r = vcpu->arch.pgfault_cache->rpte;
psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0],
r);
gpa_base = r & HPTE_R_RPN & ~(psize - 1);
gfn_base = gpa_base >> PAGE_SHIFT;
gpa = gpa_base | (ea & (psize - 1));
return kvmppc_hv_emulate_mmio(vcpu, gpa, ea,
dsisr & DSISR_ISSTORE);
}
}
index = vcpu->arch.pgfault_index;
hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
rev = &kvm->arch.hpt.rev[index];
preempt_disable();
while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
cpu_relax();
hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
hpte[1] = be64_to_cpu(hptep[1]);
hpte[2] = r = rev->guest_rpte;
unlock_hpte(hptep, hpte[0]);
preempt_enable();
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
hpte[1] = hpte_new_to_old_r(hpte[1]);
}
if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
hpte[1] != vcpu->arch.pgfault_hpte[1])
return RESUME_GUEST;
/* Translate the logical address and get the page */
psize = kvmppc_actual_pgsz(hpte[0], r);
gpa_base = r & HPTE_R_RPN & ~(psize - 1);
gfn_base = gpa_base >> PAGE_SHIFT;
gpa = gpa_base | (ea & (psize - 1));
gfn = gpa >> PAGE_SHIFT;
memslot = gfn_to_memslot(kvm, gfn);
trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);
/* No memslot means it's an emulated MMIO region */
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
return kvmppc_hv_emulate_mmio(vcpu, gpa, ea,
dsisr & DSISR_ISSTORE);
/*
* This should never happen, because of the slot_is_aligned()
* check in kvmppc_do_h_enter().
*/
if (gfn_base < memslot->base_gfn)
return -EFAULT;
/* used to check for invalidations in progress */
mmu_seq = kvm->mmu_notifier_seq;
smp_rmb();
ret = -EFAULT;
page = NULL;
writing = (dsisr & DSISR_ISSTORE) != 0;
/* If writing != 0, then the HPTE must allow writing, if we get here */
write_ok = writing;
hva = gfn_to_hva_memslot(memslot, gfn);
/*
* Do a fast check first, since __gfn_to_pfn_memslot doesn't
* do it with !atomic && !async, which is how we call it.
* We always ask for write permission since the common case
* is that the page is writable.
*/
if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
write_ok = true;
} else {
/* Call KVM generic code to do the slow-path check */
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
writing, &write_ok, NULL);
if (is_error_noslot_pfn(pfn))
return -EFAULT;
page = NULL;
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
if (PageReserved(page))
page = NULL;
}
}
/*
* Read the PTE from the process' radix tree and use that
* so we get the shift and attribute bits.
*/
spin_lock(&kvm->mmu_lock);
ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
pte = __pte(0);
if (ptep)
pte = READ_ONCE(*ptep);
spin_unlock(&kvm->mmu_lock);
/*
* If the PTE disappeared temporarily due to a THP
* collapse, just return and let the guest try again.
*/
if (!pte_present(pte)) {
if (page)
put_page(page);
return RESUME_GUEST;
}
hpa = pte_pfn(pte) << PAGE_SHIFT;
pte_size = PAGE_SIZE;
if (shift)
pte_size = 1ul << shift;
is_ci = pte_ci(pte);
if (psize > pte_size)
goto out_put;
if (pte_size > psize)
hpa |= hva & (pte_size - psize);
/* Check WIMG vs. the actual page we're accessing */
if (!hpte_cache_flags_ok(r, is_ci)) {
if (is_ci)
goto out_put;
/*
* Allow guest to map emulated device memory as
* uncacheable, but actually make it cacheable.
*/
r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
}
/*
* Set the HPTE to point to hpa.
* Since the hpa is at PAGE_SIZE granularity, make sure we
* don't mask out lower-order bits if psize < PAGE_SIZE.
*/
if (psize < PAGE_SIZE)
psize = PAGE_SIZE;
r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | hpa;
if (hpte_is_writable(r) && !write_ok)
r = hpte_make_readonly(r);
ret = RESUME_GUEST;
preempt_disable();
while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
cpu_relax();
hnow_v = be64_to_cpu(hptep[0]);
hnow_r = be64_to_cpu(hptep[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
hnow_v = hpte_new_to_old_v(hnow_v, hnow_r);
hnow_r = hpte_new_to_old_r(hnow_r);
}
/*
* If the HPT is being resized, don't update the HPTE,
* instead let the guest retry after the resize operation is complete.
* The synchronization for mmu_ready test vs. set is provided
* by the HPTE lock.
*/
if (!kvm->arch.mmu_ready)
goto out_unlock;
if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
rev->guest_rpte != hpte[2])
/* HPTE has been changed under us; let the guest retry */
goto out_unlock;
hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
/* Always put the HPTE in the rmap chain for the page base address */
rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
lock_rmap(rmap);
/* Check if we might have been invalidated; let the guest retry if so */
ret = RESUME_GUEST;
if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
unlock_rmap(rmap);
goto out_unlock;
}
/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
/* HPTE was previously valid, so we need to invalidate it */
unlock_rmap(rmap);
hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
kvmppc_invalidate_hpte(kvm, hptep, index);
/* don't lose previous R and C bits */
r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
} else {
kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
}
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
r = hpte_old_to_new_r(hpte[0], r);
hpte[0] = hpte_old_to_new_v(hpte[0]);
}
hptep[1] = cpu_to_be64(r);
eieio();
__unlock_hpte(hptep, hpte[0]);
asm volatile("ptesync" : : : "memory");
preempt_enable();
if (page && hpte_is_writable(r))
set_page_dirty_lock(page);
out_put:
trace_kvm_page_fault_exit(vcpu, hpte, ret);
if (page)
put_page(page);
return ret;
out_unlock:
__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
preempt_enable();
goto out_put;
}