in vfio_iommu_type1.c [1550:1680]
/*
 * Service a DMA map request, described by @map, against container @iommu.
 *
 * @map->flags selects one of two mutually exclusive operations:
 *  - VFIO_DMA_MAP_FLAG_READ and/or VFIO_DMA_MAP_FLAG_WRITE: create a new
 *    vfio_dma for the requested iova/vaddr/size and, when the container's
 *    domain_list is not empty, pin and map it.
 *  - VFIO_DMA_MAP_FLAG_VADDR: install a new vaddr on an existing vfio_dma
 *    whose vaddr is currently marked invalid, then wake any waiters on
 *    iommu->vaddr_wait.
 *
 * All work past the initial argument checks runs under iommu->lock.
 *
 * Returns 0 on success, otherwise a negative errno:
 *  -EINVAL  a field of @map does not fit its native type, the flag
 *           combination is invalid, size is zero, size/iova/vaddr are not
 *           aligned to the smallest supported IOMMU page size, the range
 *           wraps, vfio_iommu_iova_dma_valid() rejects the range, or a
 *           vaddr update does not exactly match an invalidated vfio_dma;
 *  -ENOENT  vaddr update requested but vfio_find_dma() found nothing;
 *  -EEXIST  new mapping requested but vfio_find_dma() found an entry;
 *  -ENOSPC  iommu->dma_avail is exhausted;
 *  -ENOMEM  the vfio_dma could not be allocated;
 *  or whatever vfio_pin_map_dma() / vfio_dma_bitmap_alloc() report.
 */
static int vfio_dma_do_map(struct vfio_iommu *iommu,
			   struct vfio_iommu_type1_dma_map *map)
{
	bool vaddr_only = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
	unsigned long vaddr = map->vaddr;
	dma_addr_t iova = map->iova;
	size_t size = map->size;
	struct vfio_dma *dma;
	size_t pgsize;
	int access = 0;
	int rc = 0;

	/* The __u64 request fields must survive narrowing to native types */
	if (map->iova != iova || map->vaddr != vaddr || map->size != size)
		return -EINVAL;

	/* Access rights are expressed from the device's point of view */
	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
		access |= IOMMU_READ;
	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
		access |= IOMMU_WRITE;

	/*
	 * Exactly one of "access flags given" and "vaddr update" must hold:
	 * a vaddr update carries no access flags, a new mapping needs some.
	 */
	if ((access != 0) == vaddr_only)
		return -EINVAL;

	mutex_lock(&iommu->lock);

	/* Smallest page size advertised in the container's bitmap */
	pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);

	/* Sanity check: pgsize - 1 is expected to have no bits in PAGE_MASK */
	WARN_ON((pgsize - 1) & PAGE_MASK);

	if (!size) {
		rc = -EINVAL;
		goto unlock;
	}

	/* size, iova and vaddr must all be multiples of pgsize */
	if ((size | iova | vaddr) & (pgsize - 1)) {
		rc = -EINVAL;
		goto unlock;
	}

	/* Reject ranges whose last byte wraps, in IOVA or in virtual space */
	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
		rc = -EINVAL;
		goto unlock;
	}

	dma = vfio_find_dma(iommu, iova, size);

	if (vaddr_only) {
		/* A vaddr update needs an existing entry to act on */
		if (!dma) {
			rc = -ENOENT;
			goto unlock;
		}

		/*
		 * Only an entry whose vaddr is currently invalid and whose
		 * iova/size match the request exactly may be updated.
		 */
		if (!(dma->vaddr_invalid && dma->iova == iova &&
		      dma->size == size)) {
			rc = -EINVAL;
			goto unlock;
		}

		dma->vaddr = vaddr;
		dma->vaddr_invalid = false;
		iommu->vaddr_invalid_count--;
		wake_up_all(&iommu->vaddr_wait);
		goto unlock;
	}

	/* Creating a mapping: the range must not already be present */
	if (dma) {
		rc = -EEXIST;
		goto unlock;
	}

	/* Out of mapping entries for this container */
	if (!iommu->dma_avail) {
		rc = -ENOSPC;
		goto unlock;
	}

	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
		rc = -EINVAL;
		goto unlock;
	}

	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
	if (!dma) {
		rc = -ENOMEM;
		goto unlock;
	}

	iommu->dma_avail--;
	dma->iova = iova;
	dma->vaddr = vaddr;
	dma->prot = access;

	/*
	 * Locked-memory accounting needs a task_struct (for RLIMIT_MEMLOCK)
	 * and an mm_struct (for the VM locked page count), and it may happen
	 * outside of this call path because mdev devices can pin
	 * asynchronously through the external interfaces.  Holding an mm
	 * reference indefinitely is not recommended, so only a task
	 * reference is kept.
	 *
	 * The thread group leader is used instead of current: QEMU reaches
	 * this path from vCPU threads, which can be killed, leaving a NULL mm
	 * and a failing unmap when that is later issued from another thread.
	 * Threads within the same group require both CLONE_THREAD and
	 * CLONE_VM, so they all use the leader's mm_struct.
	 *
	 * CAP_IPC_LOCK is sampled here and cached in the vfio_dma.  The task
	 * used to be consulted at pin/accounting time, but has_capability()
	 * relies on real_cred, a copy-on-write field, so it is not guaranteed
	 * to match group_leader, nor to stay unchanged until evaluated.
	 * Recording it at MAP_DMA time also means a process that later drops
	 * CAP_IPC_LOCK sees the same result for an iommu_mapped vfio_dma as
	 * for an externally mapped one.
	 */
	get_task_struct(current->group_leader);
	dma->task = current->group_leader;
	dma->lock_cap = capable(CAP_IPC_LOCK);

	dma->pfn_list = RB_ROOT;

	/* Link it while still zero-sized; it grows as chunks get mapped */
	vfio_link_dma(iommu, dma);

	/*
	 * Pin and map only when the container has an IOMMU capable domain;
	 * otherwise just record the requested size.
	 * NOTE(review): on failure vfio_pin_map_dma() is presumably
	 * responsible for undoing the link above - confirm in that helper.
	 */
	if (!list_empty(&iommu->domain_list))
		rc = vfio_pin_map_dma(iommu, dma, size);
	else
		dma->size = size;

	/* Nothing more to do on failure or when dirty tracking is off */
	if (rc || !iommu->dirty_page_tracking)
		goto unlock;

	/* Dirty tracking is active: the new mapping needs its own bitmap */
	rc = vfio_dma_bitmap_alloc(dma, pgsize);
	if (rc)
		vfio_remove_dma(iommu, dma);

unlock:
	mutex_unlock(&iommu->lock);
	return rc;
}