Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux-6.6/linux-yocto-6.6/0001-ae4dma-Initial-ae4dma-controller-driver-with-multi-c.patch')
-rw-r--r-- meta-amd-bsp/recipes-kernel/linux-6.6/linux-yocto-6.6/0001-ae4dma-Initial-ae4dma-controller-driver-with-multi-c.patch | 1587
1 file changed, 1587 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux-6.6/linux-yocto-6.6/0001-ae4dma-Initial-ae4dma-controller-driver-with-multi-c.patch b/meta-amd-bsp/recipes-kernel/linux-6.6/linux-yocto-6.6/0001-ae4dma-Initial-ae4dma-controller-driver-with-multi-c.patch
new file mode 100644
index 00000000..1d0a08ec
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux-6.6/linux-yocto-6.6/0001-ae4dma-Initial-ae4dma-controller-driver-with-multi-c.patch
@@ -0,0 +1,1587 @@
+From 3d6d0e4175d9d78c4bd2e338accedfe06275e898 Mon Sep 17 00:00:00 2001
+From: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
+Date: Fri, 9 Feb 2024 18:25:37 +0530
+Subject: [PATCH 1/5] ae4dma: Initial ae4dma controller driver with
+ multi-channel
+
+Add support for the AMD AE4DMA controller. It performs high-bandwidth
+memory-to-memory and IO copy operations. Device commands are managed
+via a circular queue of 'descriptors', each of which specifies source
+and destination addresses for copying a single buffer of data.
+
+Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
+---
+ drivers/dma/Kconfig | 2 +
+ drivers/dma/Makefile | 1 +
+ drivers/dma/ae4dma/Kconfig | 13 +
+ drivers/dma/ae4dma/Makefile | 10 +
+ drivers/dma/ae4dma/ae4dma-dev.c | 387 ++++++++++++++++++++++++
+ drivers/dma/ae4dma/ae4dma-dmaengine.c | 417 ++++++++++++++++++++++++++
+ drivers/dma/ae4dma/ae4dma-pci.c | 251 ++++++++++++++++
+ drivers/dma/ae4dma/ae4dma.h | 416 +++++++++++++++++++++++++
+ 8 files changed, 1497 insertions(+)
+ create mode 100644 drivers/dma/ae4dma/Kconfig
+ create mode 100644 drivers/dma/ae4dma/Makefile
+ create mode 100644 drivers/dma/ae4dma/ae4dma-dev.c
+ create mode 100644 drivers/dma/ae4dma/ae4dma-dmaengine.c
+ create mode 100644 drivers/dma/ae4dma/ae4dma-pci.c
+ create mode 100644 drivers/dma/ae4dma/ae4dma.h
+
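+For reference only (an editor's sketch, not part of this patch): the controller is exposed through the generic dmaengine framework, so a consumer never touches the descriptor ring directly. The minimal client below shows how a memcpy channel registered by a driver like this one might be exercised, using only standard dmaengine calls (dma_request_channel(), dmaengine_prep_dma_memcpy(), dmaengine_submit(), dma_async_issue_pending(), dma_sync_wait()). The module name, buffer size and trimmed error handling are illustrative assumptions.
+
+// SPDX-License-Identifier: GPL-2.0-only
+/* ae4dma_memcpy_demo.c - illustrative dmaengine memcpy client (not part of the patch) */
+#include <linux/dmaengine.h>
+#include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+static int __init ae4dma_memcpy_demo_init(void)
+{
+	struct dma_async_tx_descriptor *tx;
+	dma_addr_t src_dma, dst_dma;
+	struct dma_chan *chan;
+	dma_cap_mask_t mask;
+	dma_cookie_t cookie;
+	size_t len = 4096;
+	void *src, *dst;
+	int ret = 0;
+
+	/* Ask the framework for any memcpy-capable (possibly DMA_PRIVATE) channel */
+	dma_cap_zero(mask);
+	dma_cap_set(DMA_MEMCPY, mask);
+	chan = dma_request_channel(mask, NULL, NULL);
+	if (!chan)
+		return -ENODEV;
+
+	src = kmalloc(len, GFP_KERNEL);
+	dst = kzalloc(len, GFP_KERNEL);
+	if (!src || !dst) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	memset(src, 0xA5, len);
+
+	/* Streaming mappings against the DMA device backing the channel */
+	src_dma = dma_map_single(chan->device->dev, src, len, DMA_TO_DEVICE);
+	dst_dma = dma_map_single(chan->device->dev, dst, len, DMA_FROM_DEVICE);
+
+	tx = dmaengine_prep_dma_memcpy(chan, dst_dma, src_dma, len,
+				       DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	if (!tx) {
+		ret = -EIO;
+		goto unmap;
+	}
+
+	cookie = dmaengine_submit(tx);		/* queue one descriptor */
+	dma_async_issue_pending(chan);		/* kick the channel */
+
+	if (dma_sync_wait(chan, cookie) != DMA_COMPLETE)
+		ret = -ETIMEDOUT;
+
+unmap:
+	dma_unmap_single(chan->device->dev, src_dma, len, DMA_TO_DEVICE);
+	dma_unmap_single(chan->device->dev, dst_dma, len, DMA_FROM_DEVICE);
+out:
+	kfree(src);
+	kfree(dst);
+	dma_release_channel(chan);
+	return ret;
+}
+module_init(ae4dma_memcpy_demo_init);
+
+static void __exit ae4dma_memcpy_demo_exit(void) { }
+module_exit(ae4dma_memcpy_demo_exit);
+
+MODULE_LICENSE("GPL");
+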
+diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
+index e928f2ca0f1e..34344fc2454c 100644
+--- a/drivers/dma/Kconfig
++++ b/drivers/dma/Kconfig
+@@ -772,6 +772,8 @@ source "drivers/dma/fsl-dpaa2-qdma/Kconfig"
+
+ source "drivers/dma/lgm/Kconfig"
+
++source "drivers/dma/ae4dma/Kconfig"
++
+ # clients
+ comment "DMA Clients"
+ depends on DMA_ENGINE
+diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
+index dfd40d14e408..9df21ec28966 100644
+--- a/drivers/dma/Makefile
++++ b/drivers/dma/Makefile
+@@ -83,6 +83,7 @@ obj-$(CONFIG_XGENE_DMA) += xgene-dma.o
+ obj-$(CONFIG_ST_FDMA) += st_fdma.o
+ obj-$(CONFIG_FSL_DPAA2_QDMA) += fsl-dpaa2-qdma/
+ obj-$(CONFIG_INTEL_LDMA) += lgm/
++obj-$(CONFIG_AMD_AE4DMA) += ae4dma/
+
+ obj-y += mediatek/
+ obj-y += qcom/
+diff --git a/drivers/dma/ae4dma/Kconfig b/drivers/dma/ae4dma/Kconfig
+new file mode 100644
+index 000000000000..50a69f1b984d
+--- /dev/null
++++ b/drivers/dma/ae4dma/Kconfig
+@@ -0,0 +1,13 @@
++# SPDX-License-Identifier: GPL-2.0-only
++config AMD_AE4DMA
++ tristate "AMD AE4DMA Engine"
++ depends on X86_64 && PCI
++ select DMA_ENGINE
++ select DMA_VIRTUAL_CHANNELS
++ help
++ Enable support for the AMD AE4DMA controller. This controller
++ provides DMA capabilities to perform high-bandwidth memory-to-memory
++ and IO copy operations. It performs DMA transfers through
++ queue-based descriptor management. This DMA controller is intended
++ to be used with AMD Non-Transparent Bridge devices and not for
++ general-purpose peripheral DMA.
+diff --git a/drivers/dma/ae4dma/Makefile b/drivers/dma/ae4dma/Makefile
+new file mode 100644
+index 000000000000..b1e431842d18
+--- /dev/null
++++ b/drivers/dma/ae4dma/Makefile
+@@ -0,0 +1,10 @@
++# SPDX-License-Identifier: GPL-2.0-only
++#
++# AMD AE4DMA driver
++#
++
++obj-$(CONFIG_AMD_AE4DMA) += ae4dma.o
++
++ae4dma-objs := ae4dma-dev.o ae4dma-dmaengine.o
++
++ae4dma-$(CONFIG_PCI) += ae4dma-pci.o
+diff --git a/drivers/dma/ae4dma/ae4dma-dev.c b/drivers/dma/ae4dma/ae4dma-dev.c
+new file mode 100644
+index 000000000000..9163327a8fc4
+--- /dev/null
++++ b/drivers/dma/ae4dma/ae4dma-dev.c
+@@ -0,0 +1,387 @@
++// SPDX-License-Identifier: GPL-2.0-only
++
++/*
++ * AMD AE4DMA device driver
++ * -- Based on the PTDMA driver
++ *
++ * Copyright (C) 2024 Advanced Micro Devices, Inc.
++ *
++ * Author: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
++
++ */
++
++#include <linux/bitfield.h>
++#include <linux/dma-mapping.h>
++#include <linux/interrupt.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/pci.h>
++#include <linux/delay.h>
++#include <linux/time.h>
++
++#include "ae4dma.h"
++#include "../dmaengine.h"
++#include "../virt-dma.h"
++
++static unsigned int max_hw_q = 2;
++module_param(max_hw_q, uint, 0444);
++MODULE_PARM_DESC(max_hw_q, "Max hw queues supported by engine (any non-zero value, default: 2)");
++
++static inline struct pt_dma_chan *to_pt_chan(struct dma_chan *dma_chan)
++{
++ return container_of(dma_chan, struct pt_dma_chan, vc.chan);
++}
++/* Human-readable error strings */
++static char *pt_error_codes[] = {
++ "",
++ "ERR 01: INVALID HEADER DW0",
++ "ERR 02: INVALID STATUS",
++ "ERR 03: INVALID LENGHT - 4 BYTE ALIGNMENT",
++ "ERR 04: INVALID SRC ADDR - 4 BYTE ALIGNMENT",
++ "ERR 05: INVALID DST ADDR - 4 BYTE ALIGNMENT",
++ "ERR 06: INVALID ALIGNMENT",
++ "ERR 07: INVALID DESCRIPTOR",
++};
++
++static void pt_log_error(struct pt_device *d, int e)
++{
++ if (e <= 7)
++ dev_info(d->dev, "AE4DMA error: %s (0x%x)\n", pt_error_codes[e], e);
++ else if (e <= 63)
++ dev_info(d->dev, "AE4DMA error: %s (0x%x)\n", "INVALID DESCRIPTOR", e);
++ else if (e <= 255)
++ dev_info(d->dev, "AE4DMA error: %s (0x%x)\n", "PTE ERROR", e);
++}
++
++void pt_start_queue(struct pt_cmd_queue *cmd_q)
++{
++ u32 status = readl(cmd_q->reg_control);
++ status |= (cmd_q->qcontrol | CMD_Q_RUN);
++ /* Turn on the run bit */
++ writel(status, cmd_q->reg_control);
++}
++
++void pt_stop_queue(struct pt_cmd_queue *cmd_q)
++{
++ /* Turn off the run bit */
++ writel(cmd_q->qcontrol & ~CMD_Q_RUN, cmd_q->reg_control);
++}
++
++static u16 pt_check_status_error(struct pt_cmd_queue *cmd_q, int idx)
++{
++ struct pt_device *pt = cmd_q->pt;
++ struct device *dev = pt->dev;
++ struct ptdma_desc desc;
++ u8 status;
++
++ do {
++ /* Sync the descriptor for CPU access before reading it back */
++ dma_sync_single_for_cpu(dev, cmd_q->qbase_dma + (idx * sizeof(struct ptdma_desc)),
++ sizeof(struct ptdma_desc), DMA_FROM_DEVICE);
++ memcpy(&desc, &cmd_q->qbase[idx], sizeof(struct ptdma_desc));
++ dma_sync_single_for_device(dev, cmd_q->qbase_dma + (idx * sizeof(struct ptdma_desc)),
++ sizeof(struct ptdma_desc), DMA_FROM_DEVICE);
++ mb();
++
++ status = desc.dw1.status;
++ if (status) {
++ if (status != 0x3) {
++ /* On error, only save the first error value */
++ cmd_q->cmd_error = desc.dw1.err_code;
++ if (cmd_q->cmd_error) {
++ /*
++ * Log the error and flush the queue by
++ * moving the head pointer
++ */
++ pt_log_error(cmd_q->pt, cmd_q->cmd_error);
++ }
++ }
++ }
++ } while (status == 0);
++
++ return desc.dwouv.dws.timestamp;
++}
++
++static int pt_core_execute_cmd(struct ptdma_desc *desc, struct pt_cmd_queue *cmd_q)
++{
++ struct pt_device *pt = cmd_q->pt;
++ struct device *dev = pt->dev;
++ unsigned long flags;
++
++ bool soc = FIELD_GET(DWORD0_SOC, desc->dwouv.dw0);
++
++ if (soc) {
++ desc->dwouv.dw0 |= FIELD_PREP(DWORD0_IOC, desc->dwouv.dw0);
++ desc->dwouv.dw0 &= ~DWORD0_SOC;
++ }
++
++ mutex_lock(&cmd_q->q_mutex);
++ spin_lock_irqsave(&cmd_q->cmd_lock, flags);
++
++ desc->dwouv.dws.timestamp = cmd_q->desc_id_counter++;
++
++ volatile u32 tail_wi = atomic_read(&cmd_q->tail_wi);
++
++ dma_sync_single_for_device(dev, (cmd_q->qbase_dma + (tail_wi * sizeof(struct ptdma_desc))),
++ sizeof(struct ptdma_desc), DMA_TO_DEVICE);
++ memcpy(&cmd_q->qbase[tail_wi], desc, sizeof(struct ptdma_desc));
++ dma_sync_single_for_device(dev, (cmd_q->qbase_dma + (tail_wi * sizeof(struct ptdma_desc))),
++ sizeof(struct ptdma_desc), DMA_TO_DEVICE);
++ cmd_q->qidx = (cmd_q->qidx + 1) % CMD_Q_LEN;
++
++ atomic64_inc(&cmd_q->q_cmd_count);
++ tail_wi = (tail_wi + 1) % CMD_Q_LEN;
++ atomic_set(&cmd_q->tail_wi, tail_wi);
++ mb();
++ writel(tail_wi, cmd_q->reg_control + 0x10);
++ mb();
++ spin_unlock_irqrestore(&cmd_q->cmd_lock, flags);
++ mutex_unlock(&cmd_q->q_mutex);
++
++ return 0;
++}
++
++int pt_core_perform_passthru(struct pt_cmd_queue *cmd_q,
++ struct pt_passthru_engine *pt_engine)
++{
++ struct ptdma_desc desc;
++
++ cmd_q->cmd_error = 0;
++ cmd_q->total_pt_ops++;
++ memset(&desc, 0, sizeof(desc));
++ desc.dwouv.dws.byte0 = CMD_DESC_DW0_VAL;
++
++ desc.dw1.status = 0;
++ desc.dw1.err_code = 0;
++ desc.dw1.desc_id = 0;
++
++ desc.length = pt_engine->src_len;
++
++ desc.src_lo = upper_32_bits(pt_engine->src_dma);
++ desc.src_hi = lower_32_bits(pt_engine->src_dma);
++ desc.dst_lo = upper_32_bits(pt_engine->dst_dma);
++ desc.dst_hi = lower_32_bits(pt_engine->dst_dma);
++
++ return pt_core_execute_cmd(&desc, cmd_q);
++}
++
++static irqreturn_t pt_core_irq_handler(int irq, void *data)
++{
++ struct pt_cmd_queue *cmd_q = data;
++ struct pt_device *pt = cmd_q->pt;
++ u32 status = readl(cmd_q->reg_control + 0x4);
++ u8 q_intr_type = (status>>24) & 0xf;
++ unsigned long flags;
++ struct pt_cmd *cmd;
++
++ pt->total_interrupts++;
++
++ if (q_intr_type == 0x4)
++ dev_info(pt->dev, "AE4DMA INTR: %s (0x%x)\n", "queue desc error", q_intr_type);
++ else if (q_intr_type == 0x2)
++ dev_info(pt->dev, "AE4DMA INTR: %s (0x%x)\n", "queue stopped", q_intr_type);
++ else if (q_intr_type == 0x1)
++ dev_info(pt->dev, "AE4DMA INTR: %s (0x%x)\n", "queue empty", q_intr_type);
++ else if (q_intr_type != 0x3)
++ dev_info(pt->dev, "AE4DMA INTR: %s (0x%x)\n", "unknown error", q_intr_type);
++
++ spin_lock_irqsave(&cmd_q->cmd_lock, flags);
++ volatile u32 crdi = readl(cmd_q->reg_control + 0x0C);
++ volatile u32 dridx = atomic_read(&cmd_q->dridx);
++ while (dridx != crdi) {
++ if (list_empty(&cmd_q->cmd))
++ break;
++ cmd = list_first_entry(&cmd_q->cmd, struct pt_cmd, entry);
++ list_del(&cmd->entry);
++ pt_check_status_error(cmd_q, dridx);
++ cmd->pt_cmd_callback(cmd->data, cmd->ret);
++ atomic64_dec(&cmd_q->q_cmd_count);
++ dridx = (dridx + 1) % CMD_Q_LEN;
++ atomic_set(&cmd_q->dridx, dridx);
++ mb();
++ }
++ spin_unlock_irqrestore(&cmd_q->cmd_lock, flags);
++
++ status = readl(cmd_q->reg_control + 0x14);
++ if (status & 1) {
++ status = status & ~1;
++ writel(status, cmd_q->reg_control + 0x14);
++ }
++
++ return IRQ_HANDLED;
++}
++
++int pt_core_init(struct pt_device *pt)
++{
++ char dma_pool_name[MAX_DMAPOOL_NAME_LEN];
++ struct pt_cmd_queue *cmd_q;
++ u32 dma_addr_lo, dma_addr_hi;
++ struct device *dev = pt->dev;
++ struct dma_pool *dma_pool;
++ unsigned int i;
++ int ret;
++ u32 q_per_eng = max_hw_q;
++
++ /* Update the device registers with queue information. */
++ writel(q_per_eng, pt->io_regs);
++
++ q_per_eng = readl(pt->io_regs);
++
++ for (i = 0; i < q_per_eng; i++) {
++
++ /* Allocate a dma pool for the queue */
++ snprintf(dma_pool_name, sizeof(dma_pool_name), "%s_q%d", dev_name(pt->dev), i);
++
++ dma_pool = dma_pool_create(dma_pool_name, dev,
++ PT_DMAPOOL_MAX_SIZE,
++ PT_DMAPOOL_ALIGN, 0);
++ if (!dma_pool)
++ return -ENOMEM;
++
++ /* ae4dma core initialisation */
++ cmd_q = &pt->cmd_q[i];
++ cmd_q->id = pt->cmd_q_count;
++ pt->cmd_q_count++;
++
++ cmd_q->pt = pt;
++ cmd_q->dma_pool = dma_pool;
++ mutex_init(&cmd_q->q_mutex);
++ spin_lock_init(&cmd_q->q_lock);
++
++ /* Preset some register values (each queue's register block is 32 bytes (0x20)) */
++ cmd_q->reg_control = pt->io_regs + ((i + 1) * 0x20);
++
++ /* Page alignment satisfies our needs for N <= 128 */
++ cmd_q->qsize = Q_SIZE(Q_DESC_SIZE);
++
++ cmd_q->qbase = kmalloc(cmd_q->qsize, GFP_KERNEL);
++ if (!cmd_q->qbase) {
++ ret = -ENOMEM;
++ goto e_destroy_pool;
++ }
++
++ cmd_q->qbase_dma = dma_map_single(dev, cmd_q->qbase, cmd_q->qsize, DMA_BIDIRECTIONAL);
++ if (dma_mapping_error(dev, cmd_q->qbase_dma)) {
++ dev_err(dev, "dma mapping error\n");
++ kfree(cmd_q->qbase);
++ ret = -ENOMEM;
++ goto e_destroy_pool;
++ }
++
++ cmd_q->qidx = 0;
++ atomic64_set(&cmd_q->q_cmd_count, 0);
++ atomic_set(&cmd_q->dridx, 0);
++ cmd_q->q_space_available = 0;
++
++ atomic_set(&cmd_q->tail_wi, readl(cmd_q->reg_control + 0x10));
++
++ init_waitqueue_head(&cmd_q->int_queue);
++ init_waitqueue_head(&cmd_q->q_space);
++
++ dev_dbg(dev, "queue #%u available\n", i);
++ }
++
++ if (pt->cmd_q_count == 0) {
++ dev_notice(dev, "no command queues available\n");
++ ret = -EIO;
++ goto e_free_dma;
++ }
++
++ dev_info(dev, "BB1.0011 AE4DMA\n");
++ for (i = 0; i < pt->cmd_q_count; i++) {
++ cmd_q = &pt->cmd_q[i];
++
++ cmd_q->qcontrol = 0; /* Start with nothing */
++
++ /* Request an irq */
++ ret = request_irq(pt->pt_irq[i], pt_core_irq_handler, 0, dev_name(pt->dev), cmd_q);
++ if (ret) {
++ dev_err(dev, "unable to allocate an IRQ\n");
++ goto e_free_dma;
++ }
++
++ /* Update the device registers with queue information. */
++ writel(CMD_Q_LEN, cmd_q->reg_control + 0x08); /* Max Index (cmd queue length) */
++
++ cmd_q->qdma_tail = cmd_q->qbase_dma;
++
++ dma_addr_lo = lower_32_bits(cmd_q->qdma_tail);
++ writel((u32)dma_addr_lo, cmd_q->reg_control + 0x18);
++
++ dma_addr_lo = readl(cmd_q->reg_control + 0x18);
++
++ dma_addr_hi = upper_32_bits(cmd_q->qdma_tail);
++ writel((u32)dma_addr_hi, cmd_q->reg_control + 0x1C);
++
++ dma_addr_hi = readl(cmd_q->reg_control + 0x1C);
++
++ pt_core_enable_queue_interrupts(pt, cmd_q);
++
++ INIT_LIST_HEAD(&cmd_q->cmd);
++ }
++
++ /* Register the DMA engine support */
++ ret = pt_dmaengine_register(pt);
++ if (ret)
++ goto e_free_irq;
++
++ return 0;
++
++e_free_irq:
++ for (i = 0; i < pt->cmd_q_count; i++)
++ free_irq(pt->pt_irq[i], &pt->cmd_q[i]);
++
++e_free_dma:
++ for (i = 0; i < pt->cmd_q_count; i++) {
++ cmd_q = &pt->cmd_q[i];
++ dma_unmap_single(dev, cmd_q->qbase_dma, cmd_q->qsize, DMA_BIDIRECTIONAL);
++ kfree(cmd_q->qbase);
++ }
++
++e_destroy_pool:
++ for (i = 0; i < pt->cmd_q_count; i++)
++ dma_pool_destroy(pt->cmd_q[i].dma_pool);
++
++ kmem_cache_destroy(pt->dma_desc_cache);
++ return ret;
++}
++
++void pt_core_destroy(struct pt_device *pt)
++{
++ struct device *dev = pt->dev;
++ struct pt_cmd_queue *cmd_q;
++ struct pt_cmd *cmd;
++ unsigned int i;
++
++ /* Unregister the DMA engine */
++ pt_dmaengine_unregister(pt);
++
++ for (i = 0; i < pt->cmd_q_count; i++) {
++ cmd_q = &pt->cmd_q[i];
++
++ wake_up_all(&cmd_q->q_space);
++ wake_up_all(&cmd_q->int_queue);
++
++ /* Disable and clear interrupts */
++ pt_core_disable_queue_interrupts(pt, cmd_q);
++
++ /* Turn off the run bit */
++ pt_stop_queue(cmd_q);
++
++ free_irq(pt->pt_irq[i], cmd_q);
++
++ dma_unmap_single(dev, cmd_q->qbase_dma, cmd_q->qsize, DMA_BIDIRECTIONAL);
++ kfree(cmd_q->qbase);
++ }
++
++ /* Flush the cmd queue */
++ while (!list_empty(&pt->cmd)) {
++ /* Invoke the callback directly with an error code */
++ cmd = list_first_entry(&pt->cmd, struct pt_cmd, entry);
++ list_del(&cmd->entry);
++ cmd->pt_cmd_callback(cmd->data, -ENODEV);
++ }
++
++ kmem_cache_destroy(pt->dma_desc_cache);
++}
+diff --git a/drivers/dma/ae4dma/ae4dma-dmaengine.c b/drivers/dma/ae4dma/ae4dma-dmaengine.c
+new file mode 100644
+index 000000000000..03f28eb10ad6
+--- /dev/null
++++ b/drivers/dma/ae4dma/ae4dma-dmaengine.c
+@@ -0,0 +1,417 @@
++// SPDX-License-Identifier: GPL-2.0-only
++/*
++ * AMD AE4DMA device driver
++ * -- Based on the PTDMA driver
++ *
++ * Copyright (C) 2024 Advanced Micro Devices, Inc.
++ *
++ */
++#include <linux/delay.h>
++#include "ae4dma.h"
++#include "../dmaengine.h"
++#include "../virt-dma.h"
++
++static inline struct pt_dma_chan *to_pt_chan(struct dma_chan *dma_chan)
++{
++ return container_of(dma_chan, struct pt_dma_chan, vc.chan);
++}
++
++static inline struct pt_dma_desc *to_pt_desc(struct virt_dma_desc *vd)
++{
++ return container_of(vd, struct pt_dma_desc, vd);
++}
++
++static void pt_free_chan_resources(struct dma_chan *dma_chan)
++{
++ struct pt_dma_chan *chan = to_pt_chan(dma_chan);
++
++ vchan_free_chan_resources(&chan->vc);
++}
++
++static void pt_synchronize(struct dma_chan *dma_chan)
++{
++ struct pt_dma_chan *chan = to_pt_chan(dma_chan);
++
++ vchan_synchronize(&chan->vc);
++}
++
++static void pt_do_cleanup(struct virt_dma_desc *vd)
++{
++ struct pt_dma_desc *desc = to_pt_desc(vd);
++ struct pt_device *pt = desc->pt;
++
++ kmem_cache_free(pt->dma_desc_cache, desc);
++}
++
++static int pt_dma_start_desc(struct pt_dma_desc *desc, struct pt_dma_chan *chan)
++{
++ struct pt_passthru_engine *pt_engine;
++ struct pt_device *pt;
++ struct pt_cmd *pt_cmd;
++ struct pt_cmd_queue *cmd_q;
++
++ desc->issued_to_hw = 1;
++ list_del(&desc->vd.node);
++
++ pt_cmd = &desc->pt_cmd;
++ pt = pt_cmd->pt;
++ cmd_q = chan->cmd_q;
++ pt_engine = &pt_cmd->passthru;
++
++ pt_cmd->qid = cmd_q->qidx;
++ cmd_q->tdata.cmd = pt_cmd;
++
++ /* Execute the command */
++ pt_cmd->ret = pt_core_perform_passthru(cmd_q, pt_engine);
++
++ return 0;
++}
++
++static struct pt_dma_desc *pt_next_dma_desc(struct pt_dma_chan *chan)
++{
++ struct virt_dma_desc *vd = vchan_next_desc(&chan->vc);
++
++ return vd ? to_pt_desc(vd) : NULL;
++}
++
++static void pt_cmd_callback_tasklet(void *data, int err)
++{
++ struct pt_dma_desc *desc = data;
++ struct dma_chan *dma_chan;
++ struct pt_dma_chan *chan;
++ struct dma_async_tx_descriptor *tx_desc;
++ struct virt_dma_desc *vd;
++ unsigned long flags;
++
++ dma_chan = desc->vd.tx.chan;
++ chan = to_pt_chan(dma_chan);
++
++ if (err == -EINPROGRESS)
++ return;
++
++ tx_desc = &desc->vd.tx;
++ vd = &desc->vd;
++
++ if (err)
++ desc->status = DMA_ERROR;
++
++ spin_lock_irqsave(&chan->vc.lock, flags);
++ if (desc) {
++ if (desc->status != DMA_COMPLETE) {
++ if (desc->status != DMA_ERROR)
++ desc->status = DMA_COMPLETE;
++
++ dma_cookie_complete(tx_desc);
++ dma_descriptor_unmap(tx_desc);
++ } else {
++ /* Don't handle it twice */
++ tx_desc = NULL;
++ }
++ }
++ spin_unlock_irqrestore(&chan->vc.lock, flags);
++
++ if (tx_desc) {
++ dmaengine_desc_get_callback_invoke(tx_desc, NULL);
++ dma_run_dependencies(tx_desc);
++ vchan_vdesc_fini(vd);
++ }
++}
++
++static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
++ struct pt_dma_desc *desc)
++{
++ struct dma_async_tx_descriptor *tx_desc;
++ struct virt_dma_desc *vd;
++ unsigned long flags;
++
++ /* Loop over descriptors until one is found with commands */
++ do {
++ if (desc) {
++ if (!desc->issued_to_hw) {
++ /* No errors, keep going */
++ if (desc->status != DMA_ERROR)
++ return desc;
++ }
++ tx_desc = &desc->vd.tx;
++ vd = &desc->vd;
++ } else {
++ tx_desc = NULL;
++ }
++ spin_lock_irqsave(&chan->vc.lock, flags);
++ desc = pt_next_dma_desc(chan);
++ spin_unlock_irqrestore(&chan->vc.lock, flags);
++ } while (desc);
++
++ return NULL;
++}
++
++static void pt_cmd_callback(void *data, int err)
++{
++ struct pt_dma_desc *desc = data;
++ struct dma_chan *dma_chan;
++ struct pt_dma_chan *chan;
++ struct pt_device *pt;
++ int ret;
++
++ if (err == -EINPROGRESS)
++ return;
++
++ dma_chan = desc->vd.tx.chan;
++ chan = to_pt_chan(dma_chan);
++ pt = chan->pt;
++
++ if (err)
++ desc->status = DMA_ERROR;
++
++ while (true) {
++ /* If the queue is full, don't submit to it; busy-wait instead */
++ if (atomic64_read(&chan->cmd_q->q_cmd_count) >= (CMD_Q_LEN - 1) ||
++ pt_core_queue_full(pt, chan->cmd_q)) {
++ cpu_relax();
++ continue;
++ }
++
++ /* Check for DMA descriptor completion */
++ desc = pt_handle_active_desc(chan, desc);
++
++ /* Don't submit cmd if no descriptor or DMA is paused */
++ if (!desc)
++ break;
++
++ ret = pt_dma_start_desc(desc, chan);
++ if (!ret)
++ break;
++
++ desc->status = DMA_ERROR;
++ }
++}
++
++static struct pt_dma_desc *pt_alloc_dma_desc(struct pt_dma_chan *chan,
++ unsigned long flags)
++{
++ struct pt_dma_desc *desc;
++ struct pt_cmd_queue *cmd_q = chan->cmd_q;
++
++ desc = kmem_cache_zalloc(chan->pt->dma_desc_cache, GFP_NOWAIT);
++ if (!desc)
++ return NULL;
++
++ vchan_tx_prep(&chan->vc, &desc->vd, flags);
++
++ desc->pt = chan->pt;
++ cmd_q->int_en = !!(flags & DMA_PREP_INTERRUPT);
++ desc->issued_to_hw = 0;
++ desc->status = DMA_IN_PROGRESS;
++
++ return desc;
++}
++
++static struct pt_dma_desc *pt_create_desc(struct dma_chan *dma_chan,
++ dma_addr_t dst,
++ dma_addr_t src,
++ unsigned int len,
++ unsigned long flags)
++{
++ struct pt_dma_chan *chan = to_pt_chan(dma_chan);
++ struct pt_cmd_queue *cmd_q = chan->cmd_q;
++ struct pt_passthru_engine *pt_engine;
++ struct pt_dma_desc *desc;
++ struct pt_cmd *pt_cmd;
++
++ desc = pt_alloc_dma_desc(chan, flags);
++ if (!desc)
++ return NULL;
++
++ pt_cmd = &desc->pt_cmd;
++ pt_cmd->pt = chan->pt;
++ pt_engine = &pt_cmd->passthru;
++ pt_cmd->engine = PT_ENGINE_PASSTHRU;
++ pt_engine->src_dma = src;
++ pt_engine->dst_dma = dst;
++ pt_engine->src_len = len;
++ pt_cmd->pt_cmd_callback = pt_cmd_callback_tasklet;
++ pt_cmd->data = desc;
++
++ desc->len = len;
++
++ spin_lock_irqsave(&cmd_q->cmd_lock, flags);
++ list_add_tail(&pt_cmd->entry, &cmd_q->cmd);
++ spin_unlock_irqrestore(&cmd_q->cmd_lock, flags);
++
++ return desc;
++}
++
++static struct dma_async_tx_descriptor *
++pt_prep_dma_memcpy(struct dma_chan *dma_chan, dma_addr_t dst,
++ dma_addr_t src, size_t len, unsigned long flags)
++{
++ struct pt_dma_desc *desc;
++
++ desc = pt_create_desc(dma_chan, dst, src, len, flags);
++ if (!desc)
++ return NULL;
++
++ return &desc->vd.tx;
++}
++
++static struct dma_async_tx_descriptor *
++pt_prep_dma_interrupt(struct dma_chan *dma_chan, unsigned long flags)
++{
++ struct pt_dma_chan *chan = to_pt_chan(dma_chan);
++ struct pt_dma_desc *desc;
++
++ desc = pt_alloc_dma_desc(chan, flags);
++ if (!desc)
++ return NULL;
++
++ return &desc->vd.tx;
++}
++
++static void pt_issue_pending(struct dma_chan *dma_chan)
++{
++ struct pt_dma_chan *chan = to_pt_chan(dma_chan);
++ struct pt_dma_desc *desc;
++ unsigned long flags;
++
++ spin_lock_irqsave(&chan->vc.lock, flags);
++ vchan_issue_pending(&chan->vc);
++ desc = pt_next_dma_desc(chan);
++ spin_unlock_irqrestore(&chan->vc.lock, flags);
++
++ pt_cmd_callback(desc, 0);
++}
++
++static int pt_pause(struct dma_chan *dma_chan)
++{
++ return 0;
++}
++
++static int pt_resume(struct dma_chan *dma_chan)
++{
++ struct pt_dma_chan *chan = to_pt_chan(dma_chan);
++ struct pt_dma_desc *desc = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&chan->vc.lock, flags);
++ pt_start_queue(chan->cmd_q);
++ desc = pt_next_dma_desc(chan);
++ spin_unlock_irqrestore(&chan->vc.lock, flags);
++
++ /* If there was something active, re-start */
++ if (desc)
++ pt_cmd_callback(desc, 0);
++
++ return 0;
++}
++
++static int pt_terminate_all(struct dma_chan *dma_chan)
++{
++ struct pt_dma_chan *chan = to_pt_chan(dma_chan);
++ unsigned long flags;
++ LIST_HEAD(head);
++
++ spin_lock_irqsave(&chan->vc.lock, flags);
++ vchan_get_all_descriptors(&chan->vc, &head);
++ spin_unlock_irqrestore(&chan->vc.lock, flags);
++
++ vchan_dma_desc_free_list(&chan->vc, &head);
++ vchan_free_chan_resources(&chan->vc);
++
++ return 0;
++}
++
++int pt_dmaengine_register(struct pt_device *pt)
++{
++ struct pt_dma_chan *chan;
++ struct pt_cmd_queue *cmd_q;
++ struct dma_device *dma_dev = &pt->dma_dev;
++ char *cmd_cache_name;
++ char *desc_cache_name;
++ unsigned int i;
++ int ret;
++
++ pt->pt_dma_chan = devm_kcalloc(pt->dev, pt->cmd_q_count, sizeof(*pt->pt_dma_chan),
++ GFP_KERNEL);
++ if (!pt->pt_dma_chan)
++ return -ENOMEM;
++
++ cmd_cache_name = devm_kasprintf(pt->dev, GFP_KERNEL,
++ "%s-dmaengine-cmd-cache",
++ dev_name(pt->dev));
++ if (!cmd_cache_name)
++ return -ENOMEM;
++
++ desc_cache_name = devm_kasprintf(pt->dev, GFP_KERNEL,
++ "%s-dmaengine-desc-cache",
++ dev_name(pt->dev));
++ if (!desc_cache_name) {
++ ret = -ENOMEM;
++ goto err_cache;
++ }
++
++ pt->dma_desc_cache = kmem_cache_create(desc_cache_name,
++ sizeof(struct pt_dma_desc), 0,
++ SLAB_HWCACHE_ALIGN, NULL);
++ if (!pt->dma_desc_cache) {
++ ret = -ENOMEM;
++ goto err_cache;
++ }
++
++ dma_dev->dev = pt->dev;
++ dma_dev->src_addr_widths = DMA_SLAVE_BUSWIDTH_64_BYTES;
++ dma_dev->dst_addr_widths = DMA_SLAVE_BUSWIDTH_64_BYTES;
++ dma_dev->directions = DMA_MEM_TO_MEM;
++ dma_dev->residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR;
++ dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask);
++ dma_cap_set(DMA_INTERRUPT, dma_dev->cap_mask);
++
++ /*
++ * PTDMA is intended to be used with the AMD NTB devices, hence
++ * marking it as DMA_PRIVATE.
++ */
++ dma_cap_set(DMA_PRIVATE, dma_dev->cap_mask);
++
++ /* Set base and prep routines */
++ dma_dev->device_free_chan_resources = pt_free_chan_resources;
++ dma_dev->device_prep_dma_memcpy = pt_prep_dma_memcpy;
++ dma_dev->device_prep_dma_interrupt = pt_prep_dma_interrupt;
++ dma_dev->device_issue_pending = pt_issue_pending;
++ dma_dev->device_tx_status = dma_cookie_status;
++ dma_dev->device_pause = pt_pause;
++ dma_dev->device_resume = pt_resume;
++ dma_dev->device_terminate_all = pt_terminate_all;
++ dma_dev->device_synchronize = pt_synchronize;
++
++ INIT_LIST_HEAD(&dma_dev->channels);
++ for (i = 0; i < pt->cmd_q_count; i++) {
++ chan = pt->pt_dma_chan + i;
++ cmd_q = &pt->cmd_q[i];
++ chan->cmd_q = cmd_q;
++ chan->id = cmd_q->id;
++ chan->pt = pt;
++ chan->vc.desc_free = pt_do_cleanup;
++ vchan_init(&chan->vc, dma_dev);
++ }
++
++ ret = dma_async_device_register(dma_dev);
++ if (ret)
++ goto err_reg;
++
++ return 0;
++
++err_reg:
++ kmem_cache_destroy(pt->dma_desc_cache);
++
++err_cache:
++ kmem_cache_destroy(pt->dma_cmd_cache);
++
++ return ret;
++}
++
++void pt_dmaengine_unregister(struct pt_device *pt)
++{
++ struct dma_device *dma_dev = &pt->dma_dev;
++
++ dma_async_device_unregister(dma_dev);
++ kmem_cache_destroy(pt->dma_cmd_cache);
++}
+diff --git a/drivers/dma/ae4dma/ae4dma-pci.c b/drivers/dma/ae4dma/ae4dma-pci.c
+new file mode 100644
+index 000000000000..bd1170d05081
+--- /dev/null
++++ b/drivers/dma/ae4dma/ae4dma-pci.c
+@@ -0,0 +1,251 @@
++// SPDX-License-Identifier: GPL-2.0-only
++/*
++ * AMD AE4DMA device driver
++ * -- Based on the PTDMA driver
++ *
++ * Copyright (C) 2024 Advanced Micro Devices, Inc.
++ *
++ */
++
++#include <linux/device.h>
++#include <linux/dma-mapping.h>
++#include <linux/delay.h>
++#include <linux/interrupt.h>
++#include <linux/kernel.h>
++#include <linux/kthread.h>
++#include <linux/module.h>
++#include <linux/pci_ids.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++
++#include "ae4dma.h"
++
++static char test_device[32];
++module_param_string(device, test_device, sizeof(test_device), 0644);
++MODULE_PARM_DESC(device, "Bus ID of the DMA Engine to test (default: any)");
++
++struct pt_msix {
++ int msix_count;
++ struct msix_entry msix_entry[MAX_HW_QUEUES];
++};
++
++/*
++ * pt_alloc_struct - allocate and initialize the pt_device struct
++ *
++ * @dev: device struct of the PTDMA
++ */
++static struct pt_device *pt_alloc_struct(struct device *dev)
++{
++ struct pt_device *pt;
++
++ pt = devm_kzalloc(dev, sizeof(*pt), GFP_KERNEL);
++
++ if (!pt)
++ return NULL;
++ pt->dev = dev;
++
++ INIT_LIST_HEAD(&pt->cmd);
++
++ return pt;
++}
++
++static int pt_get_msix_irqs(struct pt_device *pt)
++{
++ struct pt_msix *pt_msix = pt->pt_msix;
++ struct device *dev = pt->dev;
++ struct pci_dev *pdev = to_pci_dev(dev);
++ int v, i, ret;
++
++ for (v = 0; v < ARRAY_SIZE(pt_msix->msix_entry); v++)
++ pt_msix->msix_entry[v].entry = v;
++
++ ret = pci_enable_msix_range(pdev, pt_msix->msix_entry, 1, v);
++ if (ret < 0)
++ return ret;
++
++ pt_msix->msix_count = ret;
++
++ for (i = 0; i < MAX_HW_QUEUES; i++)
++ pt->pt_irq[i] = pt_msix->msix_entry[i].vector;
++
++ return 0;
++}
++
++static int pt_get_msi_irq(struct pt_device *pt)
++{
++ struct device *dev = pt->dev;
++ struct pci_dev *pdev = to_pci_dev(dev);
++ int ret, i;
++
++ ret = pci_enable_msi(pdev);
++ if (ret)
++ return ret;
++
++ for (i = 0; i < MAX_HW_QUEUES; i++)
++ pt->pt_irq[i] = pdev->irq;
++
++ return 0;
++}
++
++static int pt_get_irqs(struct pt_device *pt)
++{
++ struct device *dev = pt->dev;
++ int ret;
++
++ ret = pt_get_msix_irqs(pt);
++ if (!ret)
++ return 0;
++
++ /* Couldn't get MSI-X vectors, try MSI */
++ dev_err(dev, "could not enable MSI-X (%d), trying MSI\n", ret);
++ ret = pt_get_msi_irq(pt);
++ if (!ret)
++ return 0;
++
++ /* Couldn't get MSI interrupt */
++ dev_err(dev, "could not enable MSI (%d)\n", ret);
++
++ return ret;
++}
++
++static void pt_free_irqs(struct pt_device *pt)
++{
++ struct pt_msix *pt_msix = pt->pt_msix;
++ struct device *dev = pt->dev;
++ struct pci_dev *pdev = to_pci_dev(dev);
++ unsigned int i;
++
++ if (pt_msix->msix_count)
++ pci_disable_msix(pdev);
++ else if (pt->pt_irq[0])
++ pci_disable_msi(pdev);
++
++ for (i = 0; i < MAX_HW_QUEUES; i++)
++ pt->pt_irq[i] = 0;
++}
++
++static int pt_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
++{
++ struct pt_device *pt;
++ struct pt_msix *pt_msix;
++ struct device *dev = &pdev->dev;
++ void __iomem * const *iomap_table;
++ int bar_mask;
++ int ret = -ENOMEM;
++
++ pt = pt_alloc_struct(dev);
++ if (!pt)
++ goto e_err;
++
++ pt_msix = devm_kzalloc(dev, sizeof(*pt_msix), GFP_KERNEL);
++ if (!pt_msix)
++ goto e_err;
++
++ pt->pt_msix = pt_msix;
++ pt->dev_vdata = (struct pt_dev_vdata *)id->driver_data;
++ if (!pt->dev_vdata) {
++ ret = -ENODEV;
++ dev_err(dev, "missing driver data\n");
++ goto e_err;
++ }
++
++ ret = pcim_enable_device(pdev);
++ if (ret) {
++ dev_err(dev, "pcim_enable_device failed (%d)\n", ret);
++ goto e_err;
++ }
++
++ bar_mask = pci_select_bars(pdev, IORESOURCE_MEM);
++ ret = pcim_iomap_regions(pdev, bar_mask, "ae4dma");
++ if (ret) {
++ dev_err(dev, "pcim_iomap_regions failed (%d)\n", ret);
++ goto e_err;
++ }
++
++ iomap_table = pcim_iomap_table(pdev);
++ if (!iomap_table) {
++ dev_err(dev, "pcim_iomap_table failed\n");
++ ret = -ENOMEM;
++ goto e_err;
++ }
++
++ pt->io_regs = iomap_table[pt->dev_vdata->bar];
++ if (!pt->io_regs) {
++ dev_err(dev, "ioremap failed\n");
++ ret = -ENOMEM;
++ goto e_err;
++ }
++
++ ret = pt_get_irqs(pt);
++ if (ret)
++ goto e_err;
++
++ pci_set_master(pdev);
++
++ ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(48));
++ if (ret) {
++ ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
++ if (ret) {
++ dev_err(dev, "dma_set_mask_and_coherent failed (%d)\n",
++ ret);
++ goto e_err;
++ }
++ }
++
++ dev_set_drvdata(dev, pt);
++
++ if (pt->dev_vdata)
++ ret = pt_core_init(pt);
++
++ if (ret)
++ goto e_err;
++
++ return 0;
++
++e_err:
++ dev_err(dev, "initialization failed ret = %d\n", ret);
++
++ return ret;
++}
++
++static void pt_pci_remove(struct pci_dev *pdev)
++{
++ struct device *dev = &pdev->dev;
++ struct pt_device *pt = dev_get_drvdata(dev);
++
++ if (!pt)
++ return;
++
++ if (pt->dev_vdata)
++ pt_core_destroy(pt);
++
++ pt_free_irqs(pt);
++}
++
++static const struct pt_dev_vdata dev_vdata[] = {
++ {
++ .bar = 0,
++ },
++};
++
++static const struct pci_device_id pt_pci_table[] = {
++ { PCI_VDEVICE(AMD, 0x14C8), (kernel_ulong_t)&dev_vdata[0] },
++ { PCI_VDEVICE(AMD, 0x14DC), (kernel_ulong_t)&dev_vdata[0] },
++ { PCI_VDEVICE(AMD, 0x149B), (kernel_ulong_t)&dev_vdata[0] },
++ /* Last entry must be zero */
++ { 0, }
++};
++MODULE_DEVICE_TABLE(pci, pt_pci_table);
++
++static struct pci_driver pt_pci_driver = {
++ .name = "ae4dma",
++ .id_table = pt_pci_table,
++ .probe = pt_pci_probe,
++ .remove = pt_pci_remove,
++};
++
++module_pci_driver(pt_pci_driver);
++
++MODULE_AUTHOR("amd");
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("AMD AE4DMA driver");
+diff --git a/drivers/dma/ae4dma/ae4dma.h b/drivers/dma/ae4dma/ae4dma.h
+new file mode 100644
+index 000000000000..30ce1c1ee29c
+--- /dev/null
++++ b/drivers/dma/ae4dma/ae4dma.h
+@@ -0,0 +1,416 @@
++/* SPDX-License-Identifier: GPL-2.0-only */
++/*
++ * AMD AE4DMA device driver
++ *
++ * Copyright (C) 2024 Advanced Micro Devices, Inc.
++ *
++ */
++
++#ifndef __PT_DEV_H__
++#define __PT_DEV_H__
++
++#include <linux/device.h>
++#include <linux/dmaengine.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include <linux/mutex.h>
++#include <linux/list.h>
++#include <linux/wait.h>
++#include <linux/dmapool.h>
++
++#include "../virt-dma.h"
++
++#define MAX_PT_NAME_LEN 16
++#define MAX_DMAPOOL_NAME_LEN 32
++
++#define MAX_HW_QUEUES 16
++#define MAX_CMD_QLEN 32
++
++#define PT_ENGINE_PASSTHRU 5
++
++/* Register Mappings */
++#define IRQ_MASK_REG 0x040
++#define IRQ_STATUS_REG 0x200
++
++#define CMD_Q_ERROR(__qs) ((__qs) & 0x0000003f)
++
++#define CMD_QUEUE_PRIO_OFFSET 0x00
++#define CMD_REQID_CONFIG_OFFSET 0x04
++#define CMD_TIMEOUT_OFFSET 0x08
++#define CMD_PT_VERSION 0x10
++
++#define CMD_Q_CONTROL_BASE 0x0000
++#define CMD_Q_TAIL_LO_BASE 0x0004
++#define CMD_Q_HEAD_LO_BASE 0x0008
++#define CMD_Q_INT_ENABLE_BASE 0x000C
++#define CMD_Q_INTERRUPT_STATUS_BASE 0x0010
++
++#define CMD_Q_STATUS_BASE 0x0100
++#define CMD_Q_INT_STATUS_BASE 0x0104
++#define CMD_Q_DMA_STATUS_BASE 0x0108
++#define CMD_Q_DMA_READ_STATUS_BASE 0x010C
++#define CMD_Q_DMA_WRITE_STATUS_BASE 0x0110
++#define CMD_Q_ABORT_BASE 0x0114
++#define CMD_Q_AX_CACHE_BASE 0x0118
++
++#define CMD_CONFIG_OFFSET 0x1120
++#define CMD_CLK_GATE_CTL_OFFSET 0x6004
++
++#define CMD_DESC_DW0_VAL 0x000002
++
++/* Address offset for virtual queue registers */
++#define CMD_Q_STATUS_INCR 0x1000
++
++/* Bit masks */
++#define CMD_CONFIG_REQID 0
++#define CMD_TIMEOUT_DISABLE 0
++#define CMD_CLK_DYN_GATING_DIS 0
++#define CMD_CLK_SW_GATE_MODE 0
++#define CMD_CLK_GATE_CTL 0
++#define CMD_QUEUE_PRIO GENMASK(2, 1)
++#define CMD_CONFIG_VHB_EN BIT(0)
++#define CMD_CLK_DYN_GATING_EN BIT(0)
++#define CMD_CLK_HW_GATE_MODE BIT(0)
++#define CMD_CLK_GATE_ON_DELAY BIT(12)
++#define CMD_CLK_GATE_OFF_DELAY BIT(12)
++
++#define CMD_CLK_GATE_CONFIG (CMD_CLK_GATE_CTL | \
++ CMD_CLK_HW_GATE_MODE | \
++ CMD_CLK_GATE_ON_DELAY | \
++ CMD_CLK_DYN_GATING_EN | \
++ CMD_CLK_GATE_OFF_DELAY)
++
++#define CMD_Q_LEN 32
++#define CMD_Q_RUN BIT(0)
++#define CMD_Q_HALT BIT(1)
++#define CMD_Q_MEM_LOCATION BIT(2)
++#define CMD_Q_SIZE_MASK GENMASK(4, 0)
++#define CMD_Q_SIZE GENMASK(7, 3)
++#define CMD_Q_SHIFT GENMASK(1, 0)
++#define QUEUE_SIZE_VAL ((ffs(CMD_Q_LEN) - 2) & \
++ CMD_Q_SIZE_MASK)
++#define Q_PTR_MASK (2 << (QUEUE_SIZE_VAL + 5) - 1)
++#define Q_DESC_SIZE sizeof(struct ptdma_desc)
++#define Q_SIZE(n) (CMD_Q_LEN * (n))
++
++#define INT_DESC_VALIDATED BIT(1)
++#define INT_DESC_PROCESSED BIT(2)
++#define INT_COMPLETION BIT(3)
++#define INT_ERROR BIT(4)
++
++#define SUPPORTED_INTERRUPTS (INT_COMPLETION | INT_ERROR)
++
++/****** Local Storage Block ******/
++#define LSB_START 0
++#define LSB_END 127
++#define LSB_COUNT (LSB_END - LSB_START + 1)
++
++#define PT_DMAPOOL_MAX_SIZE 64
++#define PT_DMAPOOL_ALIGN BIT(5)
++
++#define PT_PASSTHRU_BLOCKSIZE 512
++
++struct pt_device;
++
++struct pt_tasklet_data {
++ struct completion completion;
++ struct pt_cmd *cmd;
++};
++
++/*
++ * struct pt_passthru_engine - pass-through operation
++ * without performing DMA mapping
++ * @mask: mask to be applied to data
++ * @mask_len: length in bytes of mask
++ * @src_dma: data to be used for this operation
++ * @dst_dma: data produced by this operation
++ * @src_len: length in bytes of data used for this operation
++ *
++ * Variables required to be set when calling pt_enqueue_cmd():
++ * - bit_mod, byte_swap, src, dst, src_len
++ * - mask, mask_len if bit_mod is not PT_PASSTHRU_BITWISE_NOOP
++ */
++struct pt_passthru_engine {
++ dma_addr_t mask;
++ u32 mask_len; /* In bytes */
++
++ dma_addr_t src_dma, dst_dma;
++ u64 src_len; /* In bytes */
++};
++
++/*
++ * struct pt_cmd - PTDMA operation request
++ * @entry: list element
++ * @work: work element used for callbacks
++ * @pt: PT device to be run on
++ * @ret: operation return code
++ * @flags: cmd processing flags
++ * @engine: PTDMA operation to perform (passthru)
++ * @engine_error: PT engine return code
++ * @passthru: engine specific structures, refer to specific engine struct below
++ * @callback: operation completion callback function
++ * @data: parameter value to be supplied to the callback function
++ *
++ * Variables required to be set when calling pt_enqueue_cmd():
++ * - engine, callback
++ * - See the operation structures below for what is required for each
++ * operation.
++ */
++struct pt_cmd {
++ struct list_head entry;
++ struct work_struct work;
++ struct pt_device *pt;
++ int ret;
++ u32 engine;
++ u32 engine_error;
++ struct pt_passthru_engine passthru;
++ /* Completion callback support */
++ void (*pt_cmd_callback)(void *data, int err);
++ void *data;
++ u8 qid;
++};
++
++struct pt_dma_desc {
++ struct virt_dma_desc vd;
++ struct pt_device *pt;
++ enum dma_status status;
++ size_t len;
++ bool issued_to_hw;
++ struct pt_cmd pt_cmd;
++};
++
++struct pt_dma_chan {
++ struct virt_dma_chan vc;
++ struct pt_device *pt;
++ struct pt_cmd_queue *cmd_q;
++ u32 id;
++};
++
++struct pt_cmd_queue {
++ struct pt_device *pt;
++
++ /* Queue identifier */
++ u32 id;
++
++ /* Queue dma pool */
++ struct dma_pool *dma_pool;
++
++ /* Queue base address (not necessarily aligned) */
++ struct ptdma_desc *qbase;
++
++ /* Aligned queue start address (per requirement) */
++ struct mutex q_mutex ____cacheline_aligned;
++ spinlock_t q_lock ____cacheline_aligned;
++ volatile unsigned long qidx;
++ volatile unsigned long ridx;
++
++
++ unsigned int qsize;
++ dma_addr_t qbase_dma;
++ dma_addr_t qdma_tail;
++
++ unsigned int active;
++ unsigned int suspended;
++
++ /* Interrupt flag */
++ bool int_en;
++
++ /* Register addresses for queue */
++ void __iomem *reg_control;
++ u32 qcontrol; /* Cached control register */
++
++ /* Status values from job */
++ u32 int_status;
++ u32 q_status;
++ u32 q_int_status;
++ u32 cmd_error;
++ atomic_t dridx;
++ /* Interrupt wait queue */
++ wait_queue_head_t int_queue;
++ unsigned int int_rcvd;
++
++ wait_queue_head_t q_space;
++ unsigned int q_space_available;
++
++ /* Queue Statistics */
++ unsigned long total_pt_ops;
++ atomic64_t q_cmd_count;
++ atomic_t tail_wi;
++ volatile unsigned long desc_id_counter;
++ struct pt_tasklet_data tdata;
++
++ struct list_head cmd;
++ spinlock_t cmd_lock ____cacheline_aligned;
++ spinlock_t cmd_control ____cacheline_aligned;
++ struct mutex cmd_mutex;
++} ____cacheline_aligned;
++
++struct pt_device {
++ struct list_head entry;
++
++ unsigned int ord;
++ char name[MAX_PT_NAME_LEN];
++
++ struct device *dev;
++
++ /* Bus specific device information */
++ struct pt_msix *pt_msix;
++
++ struct pt_dev_vdata *dev_vdata;
++
++ unsigned int pt_irq[MAX_HW_QUEUES];
++
++ /* I/O area used for device communication */
++ void __iomem *io_regs;
++
++ spinlock_t cmd_lock ____cacheline_aligned;
++ unsigned int cmd_count;
++ struct list_head cmd;
++
++ /*
++ * The command queue. This represent the queue available on the
++ * PTDMA that are available for processing cmds
++ */
++ struct pt_cmd_queue cmd_q[MAX_HW_QUEUES];
++ unsigned int cmd_q_count;
++
++ /* Support for the DMA Engine capabilities */
++ struct dma_device dma_dev;
++ struct pt_dma_chan *pt_dma_chan;
++ struct kmem_cache *dma_cmd_cache;
++ struct kmem_cache *dma_desc_cache;
++
++ wait_queue_head_t lsb_queue;
++
++ /* Device Statistics */
++ volatile unsigned long current_interrupts;
++ volatile unsigned long total_interrupts;
++
++};
++
++/*
++ * descriptor for PTDMA commands
++ * 8 32-bit words:
++ * word 0: function; engine; control bits
++ * word 1: length of source data
++ * word 2: low 32 bits of source pointer
++ * word 3: upper 16 bits of source pointer; source memory type
++ * word 4: low 32 bits of destination pointer
++ * word 5: upper 16 bits of destination pointer; destination memory type
++ * word 6: reserved 32 bits
++ * word 7: reserved 32 bits
++ */
++
++#define DWORD0_SOC BIT(0)
++#define DWORD0_IOC BIT(1)
++#define DWORD0_SOM BIT(3)
++#define DWORD0_EOM BIT(4)
++#define DWORD0_DMT GENMASK(5, 4)
++#define DWORD0_SMT GENMASK(7, 6)
++
++#define DWORD0_DMT_MEM 0x0
++#define DWORD0_DMT_IO 1<<4
++#define DWORD0_SMT_MEM 0x0
++#define DWORD0_SMT_IO 1<<6
++
++union dwou {
++ u32 dw0;
++ struct dword0 {
++ u8 byte0;
++ u8 byte1;
++ u16 timestamp;
++ } dws;
++};
++
++struct dword1 {
++ u8 status;
++ u8 err_code;
++ u16 desc_id;
++};
++
++struct ptdma_desc {
++ union dwou dwouv;
++ struct dword1 dw1;
++ u32 length;
++ struct dword1 uu;
++ u32 src_hi;
++ u32 src_lo;
++ u32 dst_hi;
++ u32 dst_lo;
++};
++
++struct desc_work {
++ struct ptdma_desc *desc;
++ struct pt_cmd_queue *cmd_q;
++ struct list_head entry;
++ bool submitted;
++ bool processed;
++ bool init;
++ struct work_struct work;
++};
++
++/* Structure to hold PT device data */
++struct pt_dev_vdata {
++ const unsigned int bar;
++};
++
++int pt_dmaengine_register(struct pt_device *pt);
++void pt_dmaengine_unregister(struct pt_device *pt);
++
++int pt_core_init(struct pt_device *pt);
++void pt_core_destroy(struct pt_device *pt);
++
++int pt_core_perform_passthru(struct pt_cmd_queue *cmd_q,
++ struct pt_passthru_engine *pt_engine);
++
++void pt_check_status_trans(struct pt_device *pt, struct pt_cmd_queue *cmd_q);
++void pt_start_queue(struct pt_cmd_queue *cmd_q);
++void pt_stop_queue(struct pt_cmd_queue *cmd_q);
++
++static inline void pt_core_disable_queue_interrupts(struct pt_device *pt, struct pt_cmd_queue *cmd_q)
++{
++ u32 status = ioread32(cmd_q->reg_control);
++
++ status &= ~0x7;
++ iowrite32(status, cmd_q->reg_control);
++}
++
++static inline void pt_core_enable_queue_interrupts(struct pt_device *pt, struct pt_cmd_queue *cmd_q)
++{
++ u32 status = ioread32(cmd_q->reg_control);
++
++ status |= 0x7;
++ iowrite32(status, cmd_q->reg_control);
++}
++
++static inline bool pt_core_queue_full(struct pt_device *pt, struct pt_cmd_queue *cmd_q)
++{
++ u32 q_sts = ioread32(cmd_q->reg_control + 0x4) & 0x06;
++
++ u32 rear_ri = ioread32(cmd_q->reg_control + 0x0C);
++ u32 front_wi = ioread32(cmd_q->reg_control + 0x10);
++
++ q_sts >>= 1;
++
++ if (((MAX_CMD_QLEN + front_wi - rear_ri) % MAX_CMD_QLEN) >= (MAX_CMD_QLEN - 1))
++ return true;
++
++ return false;
++}
++
++static inline bool pt_core_queue_empty(struct pt_device *pt, struct pt_cmd_queue *cmd_q)
++{
++ u32 rear_ri = ioread32(cmd_q->reg_control + 0x0C);
++ u32 front_wi = ioread32(cmd_q->reg_control + 0x10);
++
++ if (front_wi == rear_ri)
++ return true;
++
++ return false;
++}
++#endif
+--
+2.34.1
+
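Editor's note, for readers following the queue handling in the patch above: pt_core_execute_cmd() advances the write index modulo CMD_Q_LEN, and pt_core_queue_full()/pt_core_queue_empty() in ae4dma.h derive occupancy from the hardware write and read indices. The host-side sketch below (illustrative only, not part of the patch; QLEN, wi and ri are stand-ins for MAX_CMD_QLEN and the registers read at offsets 0x10 and 0x0C) works through that ring arithmetic, including the convention of leaving one slot unused so that full and empty are distinguishable.

/* ring_math_demo.c - standalone illustration of the ae4dma ring-index rule */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define QLEN 32u	/* mirrors MAX_CMD_QLEN in ae4dma.h */

static unsigned int ring_occupancy(uint32_t wi, uint32_t ri)
{
	/* Entries submitted (write index) but not yet consumed (read index) */
	return (QLEN + wi - ri) % QLEN;
}

static bool ring_full(uint32_t wi, uint32_t ri)
{
	/* One slot stays unused, as in pt_core_queue_full() */
	return ring_occupancy(wi, ri) >= QLEN - 1;
}

static bool ring_empty(uint32_t wi, uint32_t ri)
{
	return wi == ri;	/* as in pt_core_queue_empty() */
}

int main(void)
{
	assert(ring_empty(5, 5));
	assert(ring_occupancy(5, 2) == 3);	/* no wrap-around */
	assert(ring_occupancy(1, 30) == 3);	/* write index has wrapped */
	assert(ring_full(31, 0));		/* 31 of 32 slots in flight */
	return 0;
}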