Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 92cd1440 authored by Dmitry Osipenko, committed by Mauro Carvalho Chehab
Browse files

media: staging: media: tegra-vde: Defer dmabuf's unmapping



Frequent IOMMU remappings take about 50% of CPU usage because there is
quite a lot to remap. Defer dmabuf's unmapping by 5 seconds in order to
mitigate the mapping overhead, which then goes away completely, so the
driver works as fast as it does with the IOMMU disabled. The case of a
disabled IOMMU should also benefit a little from the caching, since the CPU
cache maintenance that happens on dmabuf attach takes some resources.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
parent b301f8de
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0
tegra-vde-y := vde.o iommu.o
tegra-vde-y := vde.o iommu.o dmabuf-cache.o
obj-$(CONFIG_TEGRA_VDE)	+= tegra-vde.o
+226 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0+
/*
 * NVIDIA Tegra Video decoder driver
 *
 * Copyright (C) 2016-2019 GRATE-DRIVER project
 */

#include <linux/dma-buf.h>
#include <linux/iova.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

#include "vde.h"

/*
 * One cached dmabuf mapping. Entries are linked into tegra_vde::map_list and
 * are created, looked up and destroyed with tegra_vde::map_lock held.
 */
struct tegra_vde_cache_entry {
	/*
	 * Direction passed to dma_buf_unmap_attachment() on release. Starts as
	 * the direction of the first mapping and is widened to
	 * DMA_BIDIRECTIONAL if the entry is reused with a different direction.
	 */
	enum dma_data_direction dma_dir;
	/* Attachment returned by dma_buf_attach(); also the lookup key on unmap */
	struct dma_buf_attachment *a;
	/* Deferred teardown, runs tegra_vde_delayed_unmap() */
	struct delayed_work dwork;
	/* Device that owns this entry */
	struct tegra_vde *vde;
	/* Node in vde->map_list */
	struct list_head list;
	/* Scatter-gather table returned by dma_buf_map_attachment() */
	struct sg_table *sgt;
	/* IOVA allocation of the mapping; NULL when the device has no IOMMU domain */
	struct iova *iova;
	/* Number of current users; incremented on map, decremented on unmap */
	unsigned int refcnt;
};

static void tegra_vde_release_entry(struct tegra_vde_cache_entry *entry)
{
	struct dma_buf *dmabuf = entry->a->dmabuf;

	WARN_ON_ONCE(entry->refcnt);

	if (entry->vde->domain)
		tegra_vde_iommu_unmap(entry->vde, entry->iova);

	dma_buf_unmap_attachment(entry->a, entry->sgt, entry->dma_dir);
	dma_buf_detach(dmabuf, entry->a);
	dma_buf_put(dmabuf);

	list_del(&entry->list);
	kfree(entry);
}

/*
 * Delayed-work handler: releases the cache entry that embeds @work once its
 * deferral period has elapsed.
 */
static void tegra_vde_delayed_unmap(struct work_struct *work)
{
	struct tegra_vde_cache_entry *expired =
		container_of(work, struct tegra_vde_cache_entry, dwork.work);
	/* Remember the device up front: the entry is freed under the lock */
	struct tegra_vde *owner = expired->vde;

	mutex_lock(&owner->map_lock);
	tegra_vde_release_entry(expired);
	mutex_unlock(&owner->map_lock);
}

/*
 * Map a dmabuf for DMA by the VDE, reusing a cached mapping when one exists.
 *
 * @vde:     device that owns the mapping cache
 * @dmabuf:  buffer to map; the caller passes in a reference it holds
 * @dma_dir: requested DMA direction
 * @ap:      set to the dmabuf attachment on success; this is the handle to
 *           pass to tegra_vde_dmabuf_cache_unmap()
 * @addrp:   set on success to the device address of the start of the buffer
 *           (IOVA when the device has an IOMMU domain, otherwise the DMA
 *           address of the first scatterlist entry)
 *
 * dmabuf reference ownership: on success the caller's reference is consumed.
 * On a cache hit it is dropped here, because the entry already holds one.
 * On a miss it is kept by the new entry and dropped when the entry is
 * released. On failure the reference is left untouched and remains the
 * caller's to put.
 *
 * Returns 0 on success or a negative errno.
 */
int tegra_vde_dmabuf_cache_map(struct tegra_vde *vde,
			       struct dma_buf *dmabuf,
			       enum dma_data_direction dma_dir,
			       struct dma_buf_attachment **ap,
			       dma_addr_t *addrp)
{
	struct device *dev = vde->miscdev.parent;
	struct dma_buf_attachment *attachment;
	struct tegra_vde_cache_entry *entry;
	struct sg_table *sgt;
	struct iova *iova;
	int err;

	mutex_lock(&vde->map_lock);

	list_for_each_entry(entry, &vde->map_list, list) {
		if (entry->a->dmabuf != dmabuf)
			continue;

		/*
		 * The delayed unmap is only queued once an entry's refcnt has
		 * dropped to zero, so a successful cancel means the entry is
		 * idle and can be taken over. If the cancel fails, the entry
		 * is either still in use or its unmap work has already
		 * started; skip it and keep looking / create a fresh mapping.
		 */
		if (!cancel_delayed_work(&entry->dwork))
			continue;

		/*
		 * NOTE(review): the existing mapping is reused as-is; only the
		 * direction later handed to dma_buf_unmap_attachment() is
		 * widened. Confirm exporters tolerate the map/unmap direction
		 * mismatch.
		 */
		if (entry->dma_dir != dma_dir)
			entry->dma_dir = DMA_BIDIRECTIONAL;

		/* The entry already owns a reference; drop the caller's */
		dma_buf_put(dmabuf);

		if (vde->domain)
			*addrp = iova_dma_addr(&vde->iova, entry->iova);
		else
			*addrp = sg_dma_address(entry->sgt->sgl);

		goto ref;
	}

	attachment = dma_buf_attach(dmabuf, dev);
	if (IS_ERR(attachment)) {
		dev_err(dev, "Failed to attach dmabuf\n");
		err = PTR_ERR(attachment);
		goto err_unlock;
	}

	sgt = dma_buf_map_attachment(attachment, dma_dir);
	if (IS_ERR(sgt)) {
		dev_err(dev, "Failed to get dmabufs sg_table\n");
		err = PTR_ERR(sgt);
		goto err_detach;
	}

	/* Without an IOMMU the hardware is given a single contiguous address */
	if (!vde->domain && sgt->nents > 1) {
		dev_err(dev, "Sparse DMA region is unsupported, please enable IOMMU\n");
		err = -EINVAL;
		goto err_unmap;
	}

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry) {
		err = -ENOMEM;
		goto err_unmap;
	}

	if (vde->domain) {
		err = tegra_vde_iommu_map(vde, sgt, &iova, dmabuf->size);
		if (err) {
			/*
			 * Report the failure like the other mapping errors in
			 * this function, so it does not fail silently.
			 */
			dev_err(dev, "IOMMU mapping failed: %d\n", err);
			goto err_free;
		}

		*addrp = iova_dma_addr(&vde->iova, iova);
	} else {
		*addrp = sg_dma_address(sgt->sgl);
		iova = NULL;
	}

	INIT_DELAYED_WORK(&entry->dwork, tegra_vde_delayed_unmap);
	list_add(&entry->list, &vde->map_list);

	entry->dma_dir = dma_dir;
	entry->iova = iova;
	entry->vde = vde;
	entry->sgt = sgt;
	entry->a = attachment;
ref:
	/* Reached for both a reused entry and a freshly created one */
	entry->refcnt++;

	*ap = entry->a;

	mutex_unlock(&vde->map_lock);

	return 0;

err_free:
	kfree(entry);
err_unmap:
	dma_buf_unmap_attachment(attachment, sgt, dma_dir);
err_detach:
	dma_buf_detach(dmabuf, attachment);
err_unlock:
	mutex_unlock(&vde->map_lock);

	return err;
}

/*
 * Drop one user of the cached mapping identified by attachment @a.
 *
 * When the last user goes away the entry is either torn down immediately
 * (@release is true) or kept around with its teardown deferred by 5 seconds
 * (@release is false). An attachment that is not in the cache is ignored.
 */
void tegra_vde_dmabuf_cache_unmap(struct tegra_vde *vde,
				  struct dma_buf_attachment *a,
				  bool release)
{
	struct tegra_vde_cache_entry *entry;

	mutex_lock(&vde->map_lock);

	list_for_each_entry(entry, &vde->map_list, list) {
		if (entry->a != a)
			continue;

		WARN_ON_ONCE(!entry->refcnt);

		entry->refcnt--;
		if (entry->refcnt != 0)
			break;

		if (!release)
			schedule_delayed_work(&entry->dwork, 5 * HZ);
		else
			tegra_vde_release_entry(entry);

		/* Must stop here: the entry may have just been freed */
		break;
	}

	mutex_unlock(&vde->map_lock);
}

/*
 * Immediately release every idle cache entry whose deferred unmap is still
 * pending. Entries that are in use, or whose unmap work could not be
 * cancelled, are left alone.
 */
void tegra_vde_dmabuf_cache_unmap_sync(struct tegra_vde *vde)
{
	struct tegra_vde_cache_entry *entry, *next;

	mutex_lock(&vde->map_lock);

	list_for_each_entry_safe(entry, next, &vde->map_list, list) {
		/* Only try to cancel the work of entries nobody is using */
		if (!entry->refcnt && cancel_delayed_work(&entry->dwork))
			tegra_vde_release_entry(entry);
	}

	mutex_unlock(&vde->map_lock);
}

/*
 * Empty the mapping cache, looping until the list has no entries left.
 *
 * Entries whose deferred unmap can be cancelled are released right here. For
 * the rest, the lock is dropped and the CPU yielded so that an unmap work
 * item that needs map_lock gets a chance to run and release its entry.
 *
 * NOTE(review): an entry that is still referenced has no pending work, so
 * this presumably relies on all users being gone before it is called —
 * confirm against the caller.
 */
void tegra_vde_dmabuf_cache_unmap_all(struct tegra_vde *vde)
{
	struct tegra_vde_cache_entry *entry, *next;

	mutex_lock(&vde->map_lock);

	while (!list_empty(&vde->map_list)) {
		list_for_each_entry_safe(entry, next, &vde->map_list, list) {
			if (cancel_delayed_work(&entry->dwork))
				tegra_vde_release_entry(entry);
		}

		mutex_unlock(&vde->map_lock);
		schedule();
		mutex_lock(&vde->map_lock);
	}

	mutex_unlock(&vde->map_lock);
}
+0 −2
Original line number Diff line number Diff line
@@ -19,7 +19,6 @@
int tegra_vde_iommu_map(struct tegra_vde *vde,
			struct sg_table *sgt,
			struct iova **iovap,
			dma_addr_t *addrp,
			size_t size)
{
	struct iova *iova;
@@ -45,7 +44,6 @@ int tegra_vde_iommu_map(struct tegra_vde *vde,
	}

	*iovap = iova;
	*addrp = addr;

	return 0;
}
+32 −111
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@
#include <linux/genalloc.h>
#include <linux/interrupt.h>
#include <linux/iopoll.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/of_device.h>
@@ -37,18 +38,10 @@
#define BSE_DMA_BUSY		BIT(23)

struct video_frame {
	struct iova *y_iova;
	struct iova *cb_iova;
	struct iova *cr_iova;
	struct iova *aux_iova;
	struct dma_buf_attachment *y_dmabuf_attachment;
	struct dma_buf_attachment *cb_dmabuf_attachment;
	struct dma_buf_attachment *cr_dmabuf_attachment;
	struct dma_buf_attachment *aux_dmabuf_attachment;
	struct sg_table *y_sgt;
	struct sg_table *cb_sgt;
	struct sg_table *cr_sgt;
	struct sg_table *aux_sgt;
	dma_addr_t y_addr;
	dma_addr_t cb_addr;
	dma_addr_t cr_addr;
@@ -494,22 +487,6 @@ static void tegra_vde_decode_frame(struct tegra_vde *vde,
			 vde->sxe, 0x00);
}

static void tegra_vde_detach_and_put_dmabuf(struct tegra_vde *vde,
					    enum dma_data_direction dma_dir,
					    struct dma_buf_attachment *a,
					    struct sg_table *sgt,
					    struct iova *iova)
{
	struct dma_buf *dmabuf = a->dmabuf;

	if (vde->domain)
		tegra_vde_iommu_unmap(vde, iova);

	dma_buf_unmap_attachment(a, sgt, dma_dir);
	dma_buf_detach(dmabuf, a);
	dma_buf_put(dmabuf);
}

static int tegra_vde_attach_dmabuf(struct tegra_vde *vde,
				   int fd,
				   unsigned long offset,
@@ -517,15 +494,11 @@ static int tegra_vde_attach_dmabuf(struct tegra_vde *vde,
				   size_t align_size,
				   struct dma_buf_attachment **a,
				   dma_addr_t *addrp,
				   struct sg_table **s,
				   struct iova **iovap,
				   size_t *size,
				   enum dma_data_direction dma_dir)
{
	struct device *dev = vde->miscdev.parent;
	struct dma_buf_attachment *attachment;
	struct dma_buf *dmabuf;
	struct sg_table *sgt;
	int err;

	dmabuf = dma_buf_get(fd);
@@ -546,49 +519,17 @@ static int tegra_vde_attach_dmabuf(struct tegra_vde *vde,
		return -EINVAL;
	}

	attachment = dma_buf_attach(dmabuf, dev);
	if (IS_ERR(attachment)) {
		dev_err(dev, "Failed to attach dmabuf\n");
		err = PTR_ERR(attachment);
	err = tegra_vde_dmabuf_cache_map(vde, dmabuf, dma_dir, a, addrp);
	if (err)
		goto err_put;
	}

	sgt = dma_buf_map_attachment(attachment, dma_dir);
	if (IS_ERR(sgt)) {
		dev_err(dev, "Failed to get dmabufs sg_table\n");
		err = PTR_ERR(sgt);
		goto err_detach;
	}

	if (!vde->domain && sgt->nents > 1) {
		dev_err(dev, "Sparse DMA region is unsupported, please enable IOMMU\n");
		err = -EINVAL;
		goto err_unmap;
	}

	if (vde->domain) {
		err = tegra_vde_iommu_map(vde, sgt, iovap, addrp, dmabuf->size);
		if (err) {
			dev_err(dev, "IOMMU mapping failed: %d\n", err);
			goto err_unmap;
		}
	} else {
		*addrp = sg_dma_address(sgt->sgl);
	}

	*addrp = *addrp + offset;
	*a = attachment;
	*s = sgt;

	if (size)
		*size = dmabuf->size - offset;

	return 0;

err_unmap:
	dma_buf_unmap_attachment(attachment, sgt, dma_dir);
err_detach:
	dma_buf_detach(dmabuf, attachment);
err_put:
	dma_buf_put(dmabuf);

@@ -608,8 +549,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
				      src->y_offset, lsize, SZ_256,
				      &frame->y_dmabuf_attachment,
				      &frame->y_addr,
				      &frame->y_sgt,
				      &frame->y_iova,
				      NULL, dma_dir);
	if (err)
		return err;
@@ -618,8 +557,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
				      src->cb_offset, csize, SZ_256,
				      &frame->cb_dmabuf_attachment,
				      &frame->cb_addr,
				      &frame->cb_sgt,
				      &frame->cb_iova,
				      NULL, dma_dir);
	if (err)
		goto err_release_y;
@@ -628,8 +565,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
				      src->cr_offset, csize, SZ_256,
				      &frame->cr_dmabuf_attachment,
				      &frame->cr_addr,
				      &frame->cr_sgt,
				      &frame->cr_iova,
				      NULL, dma_dir);
	if (err)
		goto err_release_cb;
@@ -643,8 +578,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
				      src->aux_offset, csize, SZ_256,
				      &frame->aux_dmabuf_attachment,
				      &frame->aux_addr,
				      &frame->aux_sgt,
				      &frame->aux_iova,
				      NULL, dma_dir);
	if (err)
		goto err_release_cr;
@@ -652,20 +585,11 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
	return 0;

err_release_cr:
	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
					frame->cr_dmabuf_attachment,
					frame->cr_sgt,
					frame->cr_iova);
	tegra_vde_dmabuf_cache_unmap(vde, frame->cr_dmabuf_attachment, true);
err_release_cb:
	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
					frame->cb_dmabuf_attachment,
					frame->cb_sgt,
					frame->cb_iova);
	tegra_vde_dmabuf_cache_unmap(vde, frame->cb_dmabuf_attachment, true);
err_release_y:
	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
					frame->y_dmabuf_attachment,
					frame->y_sgt,
					frame->y_iova);
	tegra_vde_dmabuf_cache_unmap(vde, frame->y_dmabuf_attachment, true);

	return err;
}
@@ -673,28 +597,16 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
static void tegra_vde_release_frame_dmabufs(struct tegra_vde *vde,
					    struct video_frame *frame,
					    enum dma_data_direction dma_dir,
					    bool baseline_profile)
					    bool baseline_profile,
					    bool release)
{
	if (!baseline_profile)
		tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
						frame->aux_dmabuf_attachment,
						frame->aux_sgt,
						frame->aux_iova);
		tegra_vde_dmabuf_cache_unmap(vde, frame->aux_dmabuf_attachment,
					     release);

	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
					frame->cr_dmabuf_attachment,
					frame->cr_sgt,
					frame->cr_iova);

	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
					frame->cb_dmabuf_attachment,
					frame->cb_sgt,
					frame->cb_iova);

	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
					frame->y_dmabuf_attachment,
					frame->y_sgt,
					frame->y_iova);
	tegra_vde_dmabuf_cache_unmap(vde, frame->cr_dmabuf_attachment, release);
	tegra_vde_dmabuf_cache_unmap(vde, frame->cb_dmabuf_attachment, release);
	tegra_vde_dmabuf_cache_unmap(vde, frame->y_dmabuf_attachment, release);
}

static int tegra_vde_validate_frame(struct device *dev,
@@ -786,8 +698,6 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
	struct tegra_vde_h264_frame __user *frames_user;
	struct video_frame *dpb_frames;
	struct dma_buf_attachment *bitstream_data_dmabuf_attachment;
	struct sg_table *bitstream_sgt;
	struct iova *bitstream_iova;
	enum dma_data_direction dma_dir;
	dma_addr_t bitstream_data_addr;
	dma_addr_t bsev_ptr;
@@ -812,8 +722,6 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
				      SZ_16K, SZ_16K,
				      &bitstream_data_dmabuf_attachment,
				      &bitstream_data_addr,
				      &bitstream_sgt,
				      &bitstream_iova,
				      &bitstream_data_size,
				      DMA_TO_DEVICE);
	if (ret)
@@ -944,7 +852,7 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
		dma_dir = (i == 0) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;

		tegra_vde_release_frame_dmabufs(vde, &dpb_frames[i], dma_dir,
						ctx.baseline_profile);
						ctx.baseline_profile, ret != 0);
	}

free_dpb_frames:
@@ -954,10 +862,8 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
	kfree(frames);

release_bitstream_dmabuf:
	tegra_vde_detach_and_put_dmabuf(vde, DMA_TO_DEVICE,
					bitstream_data_dmabuf_attachment,
					bitstream_sgt,
					bitstream_iova);
	tegra_vde_dmabuf_cache_unmap(vde, bitstream_data_dmabuf_attachment,
				     ret != 0);

	return ret;
}
@@ -979,9 +885,21 @@ static long tegra_vde_unlocked_ioctl(struct file *filp,
	return -ENOTTY;
}

/*
 * File release hook: flushes idle cached dmabuf mappings of the device the
 * file belongs to. Always returns 0.
 */
static int tegra_vde_release_file(struct inode *inode, struct file *filp)
{
	struct miscdevice *misc = filp->private_data;
	struct tegra_vde *vde;

	/* Recover the driver state from the miscdevice embedded in it */
	vde = container_of(misc, struct tegra_vde, miscdev);
	tegra_vde_dmabuf_cache_unmap_sync(vde);

	return 0;
}

/* File operations of the VDE misc device: ioctl entry point and release hook */
static const struct file_operations tegra_vde_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= tegra_vde_unlocked_ioctl,
	/* Releases idle cached dmabuf mappings when the file is closed */
	.release	= tegra_vde_release_file,
};

static irqreturn_t tegra_vde_isr(int irq, void *data)
@@ -1159,6 +1077,8 @@ static int tegra_vde_probe(struct platform_device *pdev)
		return -ENOMEM;
	}

	INIT_LIST_HEAD(&vde->map_list);
	mutex_init(&vde->map_lock);
	mutex_init(&vde->lock);
	init_completion(&vde->decode_completion);

@@ -1221,6 +1141,7 @@ static int tegra_vde_remove(struct platform_device *pdev)

	misc_deregister(&vde->miscdev);

	tegra_vde_dmabuf_cache_unmap_all(vde);
	tegra_vde_iommu_deinit(vde);

	gen_pool_free(vde->iram_pool, (unsigned long)vde->iram,
+17 −1
Original line number Diff line number Diff line
@@ -9,16 +9,20 @@
#define TEGRA_VDE_H

#include <linux/completion.h>
#include <linux/dma-direction.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/types.h>
#include <linux/iova.h>

struct clk;
struct dma_buf;
struct gen_pool;
struct iommu_group;
struct iommu_domain;
struct reset_control;
struct dma_buf_attachment;

struct tegra_vde {
	void __iomem *sxe;
@@ -31,6 +35,8 @@ struct tegra_vde {
	void __iomem *vdma;
	void __iomem *frameid;
	struct mutex lock;
	struct mutex map_lock;
	struct list_head map_list;
	struct miscdevice miscdev;
	struct reset_control *rst;
	struct reset_control *rst_mc;
@@ -51,10 +57,20 @@ void tegra_vde_iommu_deinit(struct tegra_vde *vde);
int tegra_vde_iommu_map(struct tegra_vde *vde,
			struct sg_table *sgt,
			struct iova **iovap,
			dma_addr_t *addrp,
			size_t size);
void tegra_vde_iommu_unmap(struct tegra_vde *vde, struct iova *iova);

/*
 * Map @dmabuf for the VDE, reusing a cached mapping when available. On
 * success returns 0, stores the attachment in @ap and the device address of
 * the buffer start in @addrp; otherwise returns a negative errno.
 */
int tegra_vde_dmabuf_cache_map(struct tegra_vde *vde,
			       struct dma_buf *dmabuf,
			       enum dma_data_direction dma_dir,
			       struct dma_buf_attachment **ap,
			       dma_addr_t *addrp);
/*
 * Drop one user of the cached mapping for attachment @a. When the last user
 * is gone the mapping is torn down at once if @release is true, otherwise its
 * teardown is deferred.
 */
void tegra_vde_dmabuf_cache_unmap(struct tegra_vde *vde,
				  struct dma_buf_attachment *a,
				  bool release);
/* Release all unused cached mappings whose deferred unmap is still pending */
void tegra_vde_dmabuf_cache_unmap_sync(struct tegra_vde *vde);
/* Release every cached mapping; loops until the cache list is empty */
void tegra_vde_dmabuf_cache_unmap_all(struct tegra_vde *vde);

static __maybe_unused char const *
tegra_vde_reg_base_name(struct tegra_vde *vde, void __iomem *base)
{