nvdla: add NVDLA driver

Additional update from Prashant Gaikwad <pgaikwad@nvidia.com>
Adapted for Linux 5.13 and the BeagleV Starlight board by
<cybergaszcz@gmail.com>
This commit is contained in:
Farzad Farshchi 2018-09-20 19:08:27 -05:00 committed by Emil Renner Berthing
parent 1aaa011e7e
commit 29e676e7fa
33 changed files with 32588 additions and 0 deletions

View file

@ -236,4 +236,6 @@ source "drivers/interconnect/Kconfig"
source "drivers/counter/Kconfig"
source "drivers/most/Kconfig"
source "drivers/nvdla/Kconfig"
endmenu

View file

@ -189,3 +189,4 @@ obj-$(CONFIG_GNSS) += gnss/
obj-$(CONFIG_INTERCONNECT) += interconnect/
obj-$(CONFIG_COUNTER) += counter/
obj-$(CONFIG_MOST) += most/
obj-$(CONFIG_NVDLA) += nvdla/

5
drivers/nvdla/Kconfig Normal file
View file

@ -0,0 +1,5 @@
# NVDLA: driver for the NVIDIA Deep Learning Accelerator.
# Depends on DRM because buffer management is built on the DRM GEM CMA
# helpers (see the select below and nvdla_gem.o in the Makefile).
config NVDLA
bool "The NVIDIA Deep Learning Accelerator"
default n
depends on DRM
select DRM_GEM_CMA_HELPER

19
drivers/nvdla/Makefile Normal file
View file

@ -0,0 +1,19 @@
# Make the driver's private headers visible to every object below:
# the directory itself and its include/ subdirectory (register and
# interface definitions).
ccflags-$(CONFIG_NVDLA) += -I$(srctree)/$(src)
ccflags-$(CONFIG_NVDLA) += -I$(srctree)/$(src)/include
# Engine/firmware-style core code (per-processor programming units)
# plus the Linux glue objects (nvdla_core_callbacks.o, nvdla_gem.o).
obj-$(CONFIG_NVDLA) += scheduler.o
obj-$(CONFIG_NVDLA) += engine.o
obj-$(CONFIG_NVDLA) += bdma.o
obj-$(CONFIG_NVDLA) += conv.o
obj-$(CONFIG_NVDLA) += sdp.o
obj-$(CONFIG_NVDLA) += cdp.o
obj-$(CONFIG_NVDLA) += pdp.o
obj-$(CONFIG_NVDLA) += rubik.o
obj-$(CONFIG_NVDLA) += cache.o
obj-$(CONFIG_NVDLA) += common.o
obj-$(CONFIG_NVDLA) += engine_data.o
obj-$(CONFIG_NVDLA) += engine_isr.o
obj-$(CONFIG_NVDLA) += engine_debug.o
obj-$(CONFIG_NVDLA) += nvdla_core_callbacks.o
obj-$(CONFIG_NVDLA) += nvdla_gem.o

280
drivers/nvdla/bdma.c Normal file
View file

@ -0,0 +1,280 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "dla_engine_internal.h"
#include "engine_debug.h"
/*
 * Map the interface-level memory-type index (0 = external memory via
 * the MC, 1 = on-chip CVSRAM) to the BDMA_CFG_CMD_0 RAM-type field
 * encoding used by the hardware.
 */
static const uint8_t map_mem[] = {
	FIELD_ENUM(BDMA_CFG_CMD_0, SRC_RAM_TYPE, MC),
	FIELD_ENUM(BDMA_CFG_CMD_0, SRC_RAM_TYPE, CVSRAM),
};
#if STAT_ENABLE
/*
 * Capture BDMA performance counters for the group that just finished:
 * read/write stall counts for the group's register set and the wall
 * clock runtime since the group was enabled.
 */
void
dla_bdma_stat_data(struct dla_processor *processor,
		   struct dla_processor_group *group)
{
	struct dla_bdma_stat_desc *stat =
		&processor->stat_data_desc->bdma_stat;
	uint64_t now = dla_get_time_us();

	if (group->id != (uint32_t)0) {
		stat->read_stall = bdma_reg_read(STATUS_GRP1_READ_STALL);
		stat->write_stall = bdma_reg_read(STATUS_GRP1_WRITE_STALL);
	} else {
		stat->read_stall = bdma_reg_read(STATUS_GRP0_READ_STALL);
		stat->write_stall = bdma_reg_read(STATUS_GRP0_WRITE_STALL);
	}
	stat->runtime = (uint32_t)(now - group->start_time);
}
/* Print the most recently captured BDMA statistics. */
void
dla_bdma_dump_stat(struct dla_processor *processor)
{
	dla_debug_bdma_stats(&processor->stat_data_desc->bdma_stat);
}
#endif /* STAT_ENABLE */
/**
 * BDMA has no producer pointer register: the interrupt pointer decides
 * which outstanding request is used for a BDMA operation, so there is
 * nothing to program here.  The empty implementation keeps the
 * processor-ops interface uniform across sub-engines.
 */
void
dla_bdma_set_producer(int32_t group_id, int32_t rdma_group_id)
{
	/* Explicitly consume the parameters so builds with
	 * -Wunused-parameter stay warning-free. */
	(void)group_id;
	(void)rdma_group_id;
}
/*
 * Kick off a previously programmed BDMA transfer for this group.
 *
 * If nothing was programmed (zero transfers) the operation is reported
 * complete immediately.  Otherwise stall counting is optionally armed
 * for statistics and the launch bit for the group is written.
 * Always returns 0.
 */
int
dla_bdma_enable(struct dla_processor_group *group)
{
	struct dla_engine *engine = dla_get_engine();

	dla_debug("Enter: %s\n", __func__);

	/* Empty transfer list: nothing to launch, signal completion. */
	if (group->surface_desc->bdma_surface.num_transfers == (uint16_t)0) {
		group->events |= ((uint8_t)1 << DLA_EVENT_OP_COMPLETED);
		goto exit;
	}

	if (engine->stat_enable == (uint32_t)1) {
		bdma_reg_write(CFG_STATUS, FIELD_ENUM(BDMA_CFG_STATUS_0,
						STALL_COUNT_EN, YES));
		group->start_time = dla_get_time_us();
	}

	/* Fire the launch bit for whichever group is being enabled. */
	if (group->id != 0)
		bdma_reg_write(CFG_LAUNCH1, FIELD_ENUM(BDMA_CFG_LAUNCH1_0,
						GRP1_LAUNCH, YES));
	else
		bdma_reg_write(CFG_LAUNCH0, FIELD_ENUM(BDMA_CFG_LAUNCH0_0,
						GRP0_LAUNCH, YES));
exit:
	dla_debug("Exit: %s\n", __func__);
	return 0;
}
/* BDMA has no separate RDMA sub-module, so no RDMA group is needed. */
void
dla_bdma_rdma_check(struct dla_processor_group *group)
{
	group->is_rdma_needed = 0;
}
/**
 * Program one BDMA transfer into a free hardware slot.
 *
 * Spin-waits for a free slot, resolves the source and destination DMA
 * addresses from the task descriptor, validates the transfer geometry
 * and writes the per-slot configuration registers, finishing with the
 * CFG_OP enable that commits the slot.
 *
 * Returns 0 on success, ERR(INVALID_INPUT) when a geometry constraint
 * is violated.
 */
static int32_t
processor_bdma_program_slot(struct dla_bdma_surface_desc *bdma_surface,
			struct dla_bdma_transfer_desc *transfer)
{
	int32_t ret = 0;
	uint64_t source_addr = 0;
	uint64_t destination_addr = 0;
	uint32_t high, low, reg;
	uint8_t bdma_free_slots = 0;
	struct dla_engine *engine = dla_get_engine();
	dla_debug("Enter: %s\n", __func__);
	/* make sure there're enough free slots */
	/* NOTE(review): bdma_free_slots starts at 0, so this branch is
	 * always taken and the FREE_SLOT field is re-polled on every
	 * call; the local count is never carried between calls. */
	if (bdma_free_slots <= 0) {
		do {
			reg = bdma_reg_read(STATUS);
			reg = (reg & MASK(BDMA_STATUS_0, FREE_SLOT)) >>
					SHIFT(BDMA_STATUS_0, FREE_SLOT);
		} while (reg == 0);
		bdma_free_slots = (uint8_t)reg;
	}
	/* Translate task-relative addresses into DMA addresses. */
	dla_get_dma_address(engine->driver_context, engine->task->task_data,
			transfer->source_address,
			(void *)&source_addr,
			DESTINATION_DMA);
	dla_get_dma_address(engine->driver_context, engine->task->task_data,
			transfer->destination_address,
			(void *)&destination_addr,
			DESTINATION_DMA);
	/* Geometry limits: repeats fit the hardware counters, line size
	 * is a multiple of 32 bytes, and strides are large enough that
	 * lines/surfaces do not overlap. */
	ASSERT_GOTO((transfer->line_repeat <= 8192),
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO((transfer->surface_repeat <= 8192),
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO((transfer->line_size % 32) == 0,
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO(transfer->source_line >= transfer->line_size,
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO(transfer->destination_line >= transfer->line_size,
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO(transfer->source_surface >=
			(transfer->source_line * transfer->line_repeat),
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO(transfer->destination_surface >=
			(transfer->destination_line * transfer->line_repeat),
			ret, ERR(INVALID_INPUT), exit);
	/* config registers */
	high = HIGH32BITS(source_addr);
	low = LOW32BITS(source_addr);
	bdma_reg_write(CFG_SRC_ADDR_LOW, low);
	bdma_reg_write(CFG_SRC_ADDR_HIGH, high);
	high = HIGH32BITS(destination_addr);
	low = LOW32BITS(destination_addr);
	bdma_reg_write(CFG_DST_ADDR_LOW, low);
	bdma_reg_write(CFG_DST_ADDR_HIGH, high);
	/* Hardware encodes line size in 32-byte units, minus one. */
	bdma_reg_write(CFG_LINE, (transfer->line_size >> 5) - 1);
	reg = (map_mem[bdma_surface->source_type] <<
			SHIFT(BDMA_CFG_CMD_0, SRC_RAM_TYPE)) |
		(map_mem[bdma_surface->destination_type] <<
			SHIFT(BDMA_CFG_CMD_0, DST_RAM_TYPE));
	bdma_reg_write(CFG_CMD, reg);
	/* Repeat counters are programmed minus one as well. */
	bdma_reg_write(CFG_LINE_REPEAT, transfer->line_repeat - 1);
	bdma_reg_write(CFG_SRC_LINE, transfer->source_line);
	bdma_reg_write(CFG_DST_LINE, transfer->destination_line);
	bdma_reg_write(CFG_SURF_REPEAT, transfer->surface_repeat - 1);
	bdma_reg_write(CFG_SRC_SURF, transfer->source_surface);
	bdma_reg_write(CFG_DST_SURF, transfer->destination_surface);
	/* Commit the slot. */
	bdma_reg_write(CFG_OP, FIELD_ENUM(BDMA_CFG_OP_0, EN, ENABLE));
	/* NOTE(review): this trace is skipped on the ASSERT_GOTO error
	 * paths because it sits before the exit label. */
	dla_debug("Exit: %s\n", __func__);
exit:
	RETURN(ret);
}
/*
 * Decide whether this BDMA group may be programmed now.
 *
 * BDMA has no real shadow register copies per group: if the other
 * group has been programmed but not yet enabled, programming now would
 * overwrite it, so wait until the other group becomes active.
 */
int
dla_bdma_is_ready(struct dla_processor *processor,
		  struct dla_processor_group *group)
{
	struct dla_processor_group *other = &processor->groups[!group->id];
	int other_programmed = (processor->group_status &
				(1 << other->id)) != 0;

	if (other_programmed && !other->active)
		return 0;

	return 1;
}
/* Dump the BDMA surface and operation descriptors for this group. */
void
dla_bdma_dump_config(struct dla_processor_group *group)
{
	struct dla_bdma_surface_desc *surf =
		&group->surface_desc->bdma_surface;
	struct dla_bdma_op_desc *op = &group->operation_desc->bdma_op;

	dla_debug_bdma_surface_desc(surf, group->roi_index);
	dla_debug_bdma_op_desc(op, group->roi_index);
}
/*
 * Program every transfer of this group's BDMA surface into hardware
 * slots and unmask the BDMA done interrupts.
 *
 * Returns 0 on success (including the zero-transfer case) or
 * ERR(INVALID_INPUT) when BDMA is unsupported, the transfer count is
 * out of range, or a slot fails to program.
 */
int
dla_bdma_program(struct dla_processor_group *group)
{
	int32_t ret = 0;
	int32_t xfer;
	struct dla_bdma_surface_desc *surface;
	struct dla_engine *engine = dla_get_engine();

	dla_debug("Enter: %s\n", __func__);

	if (!engine->config_data->bdma_enable) {
		dla_error("BDMA is not supported for this configuration\n");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

	surface = &group->surface_desc->bdma_surface;
	dla_debug("Num of transfers %u\n", surface->num_transfers);

	if (surface->num_transfers == (uint16_t)0)
		goto exit;

	if (surface->num_transfers > NUM_MAX_BDMA_OPS) {
		dla_error("Invalid number of transfers\n");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

	for (xfer = 0; xfer < surface->num_transfers; xfer++) {
		ret = processor_bdma_program_slot(surface,
						&surface->transfers[xfer]);
		if (ret)
			goto exit;
	}

	dla_enable_intr(MASK(GLB_S_INTR_MASK_0, BDMA_DONE_MASK1) |
			MASK(GLB_S_INTR_MASK_0, BDMA_DONE_MASK0));
exit:
	dla_debug("Exit: %s\n", __func__);
	RETURN(ret);
}

253
drivers/nvdla/cache.c Normal file
View file

@ -0,0 +1,253 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_engine.h>
#include <dla_interface.h>
#include "dla_engine_internal.h"
/* Number of cache slots per op type; sized from the group count and
 * op count so concurrent groups always find a free slot. */
#define DLA_OP_CACHE_SIZE (DLA_NUM_GROUPS * ((DLA_OP_NUM + 2) * 2))
/* Software cache of common op descriptors read from DRAM, one row per
 * op type.  A slot with index == -1 is free. */
static struct dla_common_op_desc desc_cache[DLA_OP_NUM][DLA_OP_CACHE_SIZE];
/* Reference count for the cache entry at the same [op][slot]. */
static int32_t desc_refcount[DLA_OP_NUM][DLA_OP_CACHE_SIZE];
/*
 * Take an additional reference on an already-cached descriptor.
 * No-ops for NULL descriptors or descriptors without a valid index.
 */
void
dla_get_refcount(struct dla_common_op_desc *op_desc)
{
	int32_t slot;
	struct dla_common_op_desc *entry;

	if (op_desc == NULL || op_desc->index == -1)
		return;

	entry = &desc_cache[op_desc->op_type][0];
	for (slot = 0; slot < DLA_OP_CACHE_SIZE; slot++, entry++) {
		if (entry->index == op_desc->index &&
				entry->roi_index == op_desc->roi_index) {
			desc_refcount[op_desc->op_type][slot]++;
			return;
		}
	}
}
/**
 * Look up (or fault in) the common op descriptor for @index/@roi_index.
 *
 * First scans the per-op-type cache for a matching entry and bumps its
 * refcount on a hit.  On a miss, the first free slot (index == -1) is
 * filled by reading the descriptor from the task's dependency graph in
 * DRAM; the entry's op_type read back from DRAM must match @op_type or
 * the slot is invalidated and NULL is returned.
 *
 * Returns the cached descriptor, or NULL on index == -1, read failure,
 * op_type mismatch, or (implicitly) a full cache.
 */
struct dla_common_op_desc *
dla_get_op_desc(struct dla_task *task, int16_t index,
			uint8_t op_type, uint8_t roi_index)
{
	int32_t i;
	int32_t ret;
	uint64_t op_base;
	uint64_t dep_graph_addr;
	struct dla_common_op_desc *desc = NULL;
	struct dla_engine *engine = dla_get_engine();
	if (index == -1) {
		dla_debug("no desc get due to index==-1\n");
		goto exit;
	}
	/* Byte offset of this ROI's slice of the dependency graph. */
	dep_graph_addr = (sizeof(struct dla_common_op_desc) *
				engine->network->num_operations * roi_index);
	/* Fast path: the descriptor is already cached. */
	desc = &desc_cache[op_type][0];
	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
		if (desc->index == index && desc->roi_index == roi_index) {
			if (desc->op_type != op_type) {
				/* A stale/corrupt entry; keep scanning. */
				dla_error("op_cache[op=%u] contains incorrect "
					"entry of op[%u]\n", op_type,
					desc->op_type);
				continue;
			}
			desc_refcount[op_type][i]++;
			goto exit;
		}
	}
	/* Slow path: fill the first free slot from DRAM. */
	desc = &desc_cache[op_type][0];
	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
		if (desc->index == -1) {
			op_base = dep_graph_addr +
					(sizeof(struct dla_common_op_desc) *
					(uint64_t)index);
			ret = dla_data_read(engine->driver_context,
					task->task_data,
					task->dependency_graph_addr,
					(void *)(desc),
					sizeof(struct dla_common_op_desc),
					op_base);
			if (ret) {
				desc = NULL;
				goto exit;
			}
			if (op_type != desc->op_type) {
				/*
				 * op_type of entry read from DRAM should not
				 * mismatch with given op_type. If they
				 * mismatches, then wrong entry is fetched, so
				 * report this issue by throwing error.
				 */
				dla_error("Fetched [op_type=%u] from DRAM doesn't "
						"match with op_type[%u]\n",
						desc->op_type,
						op_type);
				/* Re-mark the slot free before bailing out. */
				desc->op_type = op_type;
				desc->index = -1;
				desc->roi_index = -1;
				desc = NULL;
				goto exit;
			}
			desc->index = index;
			desc->roi_index = roi_index;
			/**
			 * Refcount must be 0 if we are reading it first time
			 * from DRAM
			 */
			assert(desc_refcount[op_type][i] == 0);
			desc_refcount[op_type][i]++;
			goto exit;
		}
	}
	/* NOTE(review): falling out of both loops (cache full) returns
	 * a non-NULL pointer to the last scanned entry — presumably the
	 * cache is sized so this cannot happen; verify against callers. */
exit:
	return desc;
}
/**
 * Flush a cached op descriptor back to DRAM and release its slot.
 *
 * Called by dla_put_op_desc() when the refcount drops to zero.  The
 * descriptor is written back to its position in the task's dependency
 * graph, then the slot is marked free (index/roi_index = -1).
 *
 * Fix vs. original: the NULL check was performed only AFTER op_desc
 * had already been dereferenced by the entry trace and the
 * op_desc->index test, so it could never prevent a NULL dereference.
 * The guard now runs first.
 */
static void
dla_free_op_desc(struct dla_common_op_desc *op_desc)
{
	uint64_t op_base;
	uint64_t dep_graph_addr;
	struct dla_task *task;
	struct dla_engine *engine = dla_get_engine();

	/* Guard before any dereference of op_desc. */
	if (op_desc == NULL)
		return;

	dla_debug("Enter: %s op desc index %u ROI %d\n", __func__,
			op_desc->index, op_desc->roi_index);

	if (op_desc->index == -1)
		goto exit;

	task = engine->task;
	dep_graph_addr = (sizeof(struct dla_common_op_desc) *
				engine->network->num_operations *
				op_desc->roi_index);

	/**
	 * TODO: keeping the depth value hardcoded as 0 for now,
	 * need to replace it once corresponding implementation is done.
	 */
	op_base = (dep_graph_addr +
			(sizeof(struct dla_common_op_desc) *
			(uint64_t)op_desc->index));

	/* Flush descriptor to DRAM. */
	dla_data_write(engine->driver_context,
			task->task_data,
			(void *)op_desc,
			task->dependency_graph_addr,
			sizeof(struct dla_common_op_desc),
			op_base);

	/* Release the cache slot. */
	op_desc->index = -1;
	op_desc->roi_index = -1;
exit:
	dla_debug("Exit: %s\n", __func__);
}
/*
 * Drop one reference on a cached op descriptor; when the refcount for
 * its cache slot reaches zero the descriptor is flushed back to DRAM
 * and the slot is released.
 */
void
dla_put_op_desc(struct dla_common_op_desc *op_desc)
{
	int32_t slot;
	int32_t type;
	struct dla_common_op_desc *entry;

	if (op_desc == NULL || op_desc->index == -1)
		return;

	type = op_desc->op_type;
	entry = &desc_cache[type][0];
	for (slot = 0; slot < DLA_OP_CACHE_SIZE; slot++, entry++) {
		if (entry->index != op_desc->index ||
				entry->roi_index != op_desc->roi_index)
			continue;

		/* Refcount can't be 0 while someone still holds it. */
		assert(desc_refcount[type][slot] > 0);
		desc_refcount[type][slot]--;

		/* Last reference gone: flush and free the slot. */
		if (desc_refcount[type][slot] == 0)
			dla_free_op_desc(op_desc);
		return;
	}
}
/*
 * Reset the descriptor cache: zero both tables, then mark every slot
 * free (index/roi_index = -1) and stamp it with its row's op type.
 */
void
dla_init_op_cache(struct dla_engine *engine)
{
	int32_t op, slot;

	dla_memset((uint8_t *)&desc_cache[0][0], 0, sizeof(desc_cache));
	dla_memset((uint8_t *)&desc_refcount[0][0], 0, sizeof(desc_refcount));

	for (op = 0; op < DLA_OP_NUM; op++) {
		for (slot = 0; slot < DLA_OP_CACHE_SIZE; slot++) {
			desc_cache[op][slot].index = -1;
			desc_cache[op][slot].roi_index = -1;
			desc_cache[op][slot].op_type = (uint8_t)op;
		}
	}
}

384
drivers/nvdla/cdp.c Normal file
View file

@ -0,0 +1,384 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "common.h"
#include "dla_engine_internal.h"
#include "engine_debug.h"
/* Interface memory-type index (0 = MC/DRAM, 1 = CVSRAM) to the CDP
 * RDMA RAM-type field encoding. */
static const uint8_t map_ram[] = {
	FIELD_ENUM(CDP_RDMA_D_SRC_DMA_CFG_0, SRC_RAM_TYPE, MC),
	FIELD_ENUM(CDP_RDMA_D_SRC_DMA_CFG_0, SRC_RAM_TYPE, CV),
};
/* Interface precision index to the RDMA input-data field encoding. */
static const uint8_t map_precision[] = {
	FIELD_ENUM(CDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA, INT8),
	FIELD_ENUM(CDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA, INT16),
	FIELD_ENUM(CDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA, FP16),
};
/* Boolean (0/1) to the DMA perf-counter enable field encoding. */
static const uint8_t map_perf_dma[] = {
	FIELD_ENUM(CDP_D_PERF_ENABLE_0, DMA_EN, DISABLE),
	FIELD_ENUM(CDP_D_PERF_ENABLE_0, DMA_EN, ENABLE),
};
/* Boolean (0/1) to the LUT perf-counter enable field encoding. */
static const uint8_t map_perf_lut[] = {
	FIELD_ENUM(CDP_D_PERF_ENABLE_0, LUT_EN, DISABLE),
	FIELD_ENUM(CDP_D_PERF_ENABLE_0, LUT_EN, ENABLE),
};
#if STAT_ENABLE
/*
 * Capture CDP performance counters (write stalls and LUT hit/overflow
 * statistics) plus the wall-clock runtime for the finished group.
 */
void
dla_cdp_stat_data(struct dla_processor *processor,
		  struct dla_processor_group *group)
{
	struct dla_cdp_stat_desc *stat =
		&processor->stat_data_desc->cdp_stat;
	uint64_t now = dla_get_time_us();

	stat->write_stall = cdp_reg_read(D_PERF_WRITE_STALL);
	stat->lut_uflow = cdp_reg_read(D_PERF_LUT_UFLOW);
	stat->lut_oflow = cdp_reg_read(D_PERF_LUT_OFLOW);
	stat->lut_hybrid = cdp_reg_read(D_PERF_LUT_HYBRID);
	stat->lut_le_hit = cdp_reg_read(D_PERF_LUT_LE_HIT);
	stat->lut_lo_hit = cdp_reg_read(D_PERF_LUT_LO_HIT);
	stat->runtime = (uint32_t)(now - group->start_time);
}
/* Print the most recently captured CDP statistics. */
void
dla_cdp_dump_stat(struct dla_processor *processor)
{
	dla_debug_cdp_stats(&processor->stat_data_desc->cdp_stat);
}
#endif /* STAT_ENABLE */
/*
 * Convert the LRN local_size parameter (odd values: 3, 5, 7, 9) to the
 * hardware NORMALZ_LEN encoding (0, 1, 2, 3 respectively).
 */
static uint32_t
map_local_size(uint8_t local_size)
{
	int32_t half = ((int32_t)local_size - 1) / 2;

	return (uint32_t)(half - 1);
}
/*
 * Point both CDP sub-modules (core and RDMA) at the register group
 * that is about to be programmed.
 */
void
dla_cdp_set_producer(int32_t group_id, int32_t rdma_group_id)
{
	cdp_reg_write(S_POINTER,
			group_id << SHIFT(CDP_S_POINTER_0, PRODUCER));
	cdp_rdma_reg_write(S_POINTER,
			group_id << SHIFT(CDP_RDMA_S_POINTER_0, PRODUCER));
}
/*
 * Enable the programmed CDP operation: optionally arm the DMA/LUT
 * performance counters, then set the op-enable bit on the RDMA
 * sub-module followed by the CDP core.  Always returns 0.
 */
int
dla_cdp_enable(struct dla_processor_group *group)
{
	struct dla_engine *engine = dla_get_engine();

	dla_debug("Enter: %s\n", __func__);

	if (engine->stat_enable == (uint32_t)1) {
		uint8_t perf = (map_perf_dma[1] <<
				SHIFT(CDP_D_PERF_ENABLE_0, DMA_EN)) |
				(map_perf_lut[1] <<
				SHIFT(CDP_D_PERF_ENABLE_0, LUT_EN));

		cdp_reg_write(D_PERF_ENABLE, perf);
		group->start_time = dla_get_time_us();
	}

	/* RDMA first, then the CDP core itself. */
	cdp_rdma_reg_write(D_OP_ENABLE,
			FIELD_ENUM(CDP_RDMA_D_OP_ENABLE_0, OP_EN, ENABLE));
	cdp_reg_write(D_OP_ENABLE,
			FIELD_ENUM(CDP_D_OP_ENABLE_0, OP_EN, ENABLE));

	dla_debug("Exit: %s\n", __func__);
	RETURN(0);
}
/* CDP always reads its input through its RDMA sub-module. */
void
dla_cdp_rdma_check(struct dla_processor_group *group)
{
	group->is_rdma_needed = 1;
}
/**
 * Program one CDP (cross-channel LRN) operation into the selected
 * register group.
 *
 * Validates the surface/precision arguments, resolves input and output
 * DMA addresses from the task descriptor, optionally loads the LUT,
 * then writes the CDP RDMA registers (input cube geometry, strides,
 * source RAM type, data format) followed by the CDP core registers
 * (output address/strides, LRN length, input/output converters and
 * function bypasses).
 *
 * Returns 0 on success or ERR(INVALID_INPUT) on a bad argument.
 */
static int32_t
processor_cdp_program(struct dla_processor_group *group)
{
	int32_t ret = 0;
	uint32_t reg, high, low;
	uint64_t input_address = 0;
	uint64_t output_address = 0;
	struct dla_lut_param lut;
	struct dla_engine *engine = dla_get_engine();
	struct dla_cdp_op_desc *cdp_op;
	struct dla_cdp_surface_desc *cdp_surface;
	dla_debug("Enter: %s\n", __func__);
	cdp_op = &group->operation_desc->cdp_op;
	cdp_surface = &group->surface_desc->cdp_surface;
	/* Argument check */
	if (cdp_surface->src_data.type == DLA_MEM_HW) {
		dla_error("Invalid source memory type\n");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	if (cdp_surface->dst_data.type == DLA_MEM_HW) {
		dla_error("Invalid destination memory type\n");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	if (cdp_op->in_precision != cdp_op->out_precision) {
		dla_error("CDP does not support precision conversion\n");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	/* get the addresses from task descriptor */
	ret = dla_read_input_address(&cdp_surface->src_data,
						&input_address,
						group->op_desc->index,
						group->roi_index,
						1);
	if (ret)
		goto exit;
	dla_get_dma_cube_address(engine->driver_context,
				engine->task->task_data,
				cdp_surface->dst_data.address,
				cdp_surface->dst_data.offset,
				(void *)&output_address,
				DESTINATION_DMA);
	/* A non-negative lut_index means this op needs a LUT; fetch its
	 * parameters now, program it into hardware further below. */
	if (cdp_op->lut_index >= 0) {
		group->lut_index = cdp_op->lut_index;
		dla_read_lut(engine, cdp_op->lut_index, (void *)&lut);
		dla_debug_lut_params(&lut);
	}
	/* config CDP RDMA registers */
	/* Cube dimensions are programmed minus one. */
	reg = ((cdp_surface->src_data.width - 1)
		<< SHIFT(CDP_RDMA_D_DATA_CUBE_WIDTH_0, WIDTH));
	cdp_rdma_reg_write(D_DATA_CUBE_WIDTH, reg);
	reg = ((cdp_surface->src_data.height - 1)
		<< SHIFT(CDP_RDMA_D_DATA_CUBE_HEIGHT_0, HEIGHT));
	cdp_rdma_reg_write(D_DATA_CUBE_HEIGHT, reg);
	reg = ((cdp_surface->src_data.channel - 1)
		<< SHIFT(CDP_RDMA_D_DATA_CUBE_CHANNEL_0, CHANNEL));
	cdp_rdma_reg_write(D_DATA_CUBE_CHANNEL, reg);
	high = HIGH32BITS(input_address);
	low = LOW32BITS(input_address);
	cdp_rdma_reg_write(D_SRC_BASE_ADDR_LOW, low);
	cdp_rdma_reg_write(D_SRC_BASE_ADDR_HIGH, high);
	cdp_rdma_reg_write(D_SRC_LINE_STRIDE,
			cdp_surface->src_data.line_stride);
	cdp_rdma_reg_write(D_SRC_SURFACE_STRIDE,
			cdp_surface->src_data.surf_stride);
	reg = (map_ram[cdp_surface->src_data.type]
		<< SHIFT(CDP_RDMA_D_SRC_DMA_CFG_0, SRC_RAM_TYPE));
	cdp_rdma_reg_write(D_SRC_DMA_CFG, reg);
	reg = (map_precision[cdp_op->in_precision]
		<< SHIFT(CDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA));
	cdp_rdma_reg_write(D_DATA_FORMAT, reg);
	/* config CDP */
	if (cdp_op->lut_index >= 0)
		update_lut(CDP_S_LUT_ACCESS_CFG_0, &lut, cdp_op->in_precision);
	high = HIGH32BITS(output_address);
	low = LOW32BITS(output_address);
	cdp_reg_write(D_DST_BASE_ADDR_LOW, low);
	cdp_reg_write(D_DST_BASE_ADDR_HIGH, high);
	cdp_reg_write(D_DST_LINE_STRIDE, cdp_surface->dst_data.line_stride);
	cdp_reg_write(D_DST_SURFACE_STRIDE, cdp_surface->dst_data.surf_stride);
	reg = (map_ram[cdp_surface->dst_data.type]
		<< SHIFT(CDP_D_DST_DMA_CFG_0, DST_RAM_TYPE));
	cdp_reg_write(D_DST_DMA_CFG, reg);
	reg = (map_precision[cdp_op->in_precision]
		<< SHIFT(CDP_D_DATA_FORMAT_0, INPUT_DATA_TYPE));
	cdp_reg_write(D_DATA_FORMAT, reg);
	reg = (map_local_size(cdp_op->local_size)
		<< SHIFT(CDP_D_LRN_CFG_0, NORMALZ_LEN));
	cdp_reg_write(D_LRN_CFG, reg);
	/* Input converter: offset, scale, truncate shift. */
	reg = (cdp_op->in_cvt.offset
		<< SHIFT(CDP_D_DATIN_OFFSET_0, DATIN_OFFSET));
	cdp_reg_write(D_DATIN_OFFSET, reg);
	reg = (cdp_op->in_cvt.scale
		<< SHIFT(CDP_D_DATIN_SCALE_0, DATIN_SCALE));
	cdp_reg_write(D_DATIN_SCALE, reg);
	reg = (cdp_op->in_cvt.truncate
		<< SHIFT(CDP_D_DATIN_SHIFTER_0, DATIN_SHIFTER));
	cdp_reg_write(D_DATIN_SHIFTER, reg);
	/* Output converter: offset, scale, truncate shift. */
	reg = (cdp_op->out_cvt.offset
		<< SHIFT(CDP_D_DATOUT_OFFSET_0, DATOUT_OFFSET));
	cdp_reg_write(D_DATOUT_OFFSET, reg);
	reg = (cdp_op->out_cvt.scale
		<< SHIFT(CDP_D_DATOUT_SCALE_0, DATOUT_SCALE));
	cdp_reg_write(D_DATOUT_SCALE, reg);
	reg = (cdp_op->out_cvt.truncate
		<< SHIFT(CDP_D_DATOUT_SHIFTER_0, DATOUT_SHIFTER));
	cdp_reg_write(D_DATOUT_SHIFTER, reg);
	/* Optional bypasses for the square-sum and output-multiply
	 * stages of the LRN function. */
	reg = ((cdp_op->bypass_sqsum ?
		FIELD_ENUM(CDP_D_FUNC_BYPASS_0, SQSUM_BYPASS, ENABLE) :
		FIELD_ENUM(CDP_D_FUNC_BYPASS_0, SQSUM_BYPASS, DISABLE)) <<
		SHIFT(CDP_D_FUNC_BYPASS_0, SQSUM_BYPASS)) |
		((cdp_op->bypass_out_mul ?
		FIELD_ENUM(CDP_D_FUNC_BYPASS_0, MUL_BYPASS, ENABLE) :
		FIELD_ENUM(CDP_D_FUNC_BYPASS_0, MUL_BYPASS, DISABLE)) <<
		SHIFT(CDP_D_FUNC_BYPASS_0, MUL_BYPASS));
	cdp_reg_write(D_FUNC_BYPASS, reg);
exit:
	dla_debug("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Decide whether this CDP group may be programmed now.
 *
 * A single LUT is shared between the two CDP groups, and LUT write
 * access is locked while the CDP sub-engine is active, so programming
 * must be delayed whenever it would require reloading the LUT under
 * the other group's feet.
 */
int
dla_cdp_is_ready(struct dla_processor *processor,
		 struct dla_processor_group *group)
{
	struct dla_processor_group *other = &processor->groups[!group->id];
	struct dla_cdp_op_desc *cdp_op = &group->operation_desc->cdp_op;

	/* No LUT needed: safe to program unconditionally. */
	if (cdp_op->lut_index == -1)
		return 1;

	/*
	 * Both groups use the same LUT: even if the other group is
	 * active and the LUT is locked, it already holds the right
	 * contents.
	 */
	if (other->lut_index == cdp_op->lut_index)
		return 1;

	/*
	 * The other group has some different LUT programmed (we know
	 * ours is not -1 and not equal to it) — do not overwrite it.
	 */
	if (other->lut_index != -1)
		return 0;

	/*
	 * Different LUT than the other group: program only while that
	 * group is idle, otherwise wait for it to finish.
	 */
	return other->active ? 0 : 1;
}
/* Dump the CDP surface and operation descriptors for this group. */
void
dla_cdp_dump_config(struct dla_processor_group *group)
{
	struct dla_cdp_surface_desc *surf =
		&group->surface_desc->cdp_surface;
	struct dla_cdp_op_desc *op = &group->operation_desc->cdp_op;

	dla_debug_cdp_surface_desc(surf, group->roi_index);
	dla_debug_cdp_op_desc(op, group->roi_index);
}
/**
 * Program a CDP operation: unmask the CDP done interrupts, then write
 * the full register configuration for the group.
 *
 * Fixes vs. original: the `if (ret) goto exit;` immediately preceding
 * the exit label was a no-op and has been removed, and the debug
 * format strings were missing the trailing newline used by every
 * other Enter/Exit trace in this driver.
 *
 * Returns the result of processor_cdp_program().
 */
int
dla_cdp_program(struct dla_processor_group *group)
{
	int32_t ret;

	dla_debug("Enter: %s\n", __func__);
	dla_enable_intr(MASK(GLB_S_INTR_MASK_0, CDP_DONE_MASK1) |
			MASK(GLB_S_INTR_MASK_0, CDP_DONE_MASK0));
	ret = processor_cdp_program(group);
	dla_debug("Exit: %s\n", __func__);
	RETURN(ret);
}

324
drivers/nvdla/common.c Normal file
View file

@ -0,0 +1,324 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "common.h"
#include "dla_engine_internal.h"
/* Interface LUT method (0/1) to the LE-function field encoding. */
static const uint8_t map_lut_method[] = {
	FIELD_ENUM(CDP_S_LUT_CFG_0, LUT_LE_FUNCTION, EXPONENT),
	FIELD_ENUM(CDP_S_LUT_CFG_0, LUT_LE_FUNCTION, LINEAR)
};
/* Interface priority (0/1) to the LE/LO table-select encoding. */
static const uint8_t map_lut_out[] = {
	FIELD_ENUM(CDP_S_LUT_CFG_0, LUT_UFLOW_PRIORITY, LE),
	FIELD_ENUM(CDP_S_LUT_CFG_0, LUT_UFLOW_PRIORITY, LO)
};
/*
 * CDP and SDP expose the same LUT register block at different bases
 * and with slightly different register spacing.  Each table below
 * holds the offset of one LUT register relative to the engine's
 * LUT_ACCESS_CFG base; index 0 is CDP, index 1 is SDP (selected by
 * the is_sdp flag computed in update_lut()).
 */
static const uint16_t access_data_offset[] = {
	CDP_S_LUT_ACCESS_DATA_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_ACCESS_DATA_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t lut_cfg_offset[] = {
	CDP_S_LUT_CFG_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_CFG_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t lut_info_offset[] = {
	CDP_S_LUT_INFO_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_INFO_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t le_start_offset[] = {
	CDP_S_LUT_LE_START_LOW_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LE_START_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t le_end_offset[] = {
	CDP_S_LUT_LE_END_LOW_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LE_END_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t lo_start_offset[] = {
	CDP_S_LUT_LO_START_LOW_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LO_START_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t lo_end_offset[] = {
	CDP_S_LUT_LO_END_LOW_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LO_END_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t le_slope_scale_offset[] = {
	CDP_S_LUT_LE_SLOPE_SCALE_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LE_SLOPE_SCALE_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t le_slope_shift_offset[] = {
	CDP_S_LUT_LE_SLOPE_SHIFT_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LE_SLOPE_SHIFT_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t lo_slope_scale_offset[] = {
	CDP_S_LUT_LO_SLOPE_SCALE_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LO_SLOPE_SCALE_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t lo_slope_shift_offset[] = {
	CDP_S_LUT_LO_SLOPE_SHIFT_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LO_SLOPE_SHIFT_0 - SDP_S_LUT_ACCESS_CFG_0,
};
/*
 * update_lut() - program an LE/LO lookup table into the CDP or SDP block.
 * @reg_base:  base address of the target block's S_LUT_ACCESS_CFG_0
 *             register; selecting SDP_S_LUT_ACCESS_CFG_0 switches all
 *             relative register offsets to the SDP variants.
 * @lut:       table entries, index configuration and out-of-range slopes.
 * @precision: PRECISION_FP16 selects the float slope encoding; any other
 *             value uses the integer scale+shift encoding.
 *
 * Streams both raw tables through the access-data window, then programs
 * function/priority configuration, index info, input ranges and the
 * out-of-range slopes.
 *
 * Fix: the first access-config write used the bare reg_write() macro
 * instead of dla_reg_write(engine->driver_context, ...) like every other
 * register access in this function (including the parallel LO-table
 * setup), bypassing the driver-context-aware accessor.
 */
void update_lut(uint32_t reg_base, struct dla_lut_param *lut,
		uint8_t precision)
{
	int32_t i;
	uint32_t reg;
	uint32_t high, low;
	/* One routine serves both blocks; the base address tells them apart
	 * and indexes the per-block offset tables above. */
	int32_t is_sdp = reg_base == SDP_S_LUT_ACCESS_CFG_0;
	struct dla_engine *engine = dla_get_engine();

	/* program raw table (LE) */
	reg = (FIELD_ENUM(CDP_S_LUT_ACCESS_CFG_0, LUT_TABLE_ID, LE)
		<< SHIFT(CDP_S_LUT_ACCESS_CFG_0, LUT_TABLE_ID)) |
		(FIELD_ENUM(CDP_S_LUT_ACCESS_CFG_0, LUT_ACCESS_TYPE, WRITE)
		<< SHIFT(CDP_S_LUT_ACCESS_CFG_0, LUT_ACCESS_TYPE));
	dla_reg_write(engine->driver_context, reg_base, reg);
	for (i = 0; i < (1<<LUT_LINEAR_EXP_TABLE_ENTRY_LOG2)+1; i++) {
		dla_reg_write(engine->driver_context,
				reg_base + access_data_offset[is_sdp],
				lut->linear_exp_table[i]);
	}

	/* program density table (LO) */
	reg = (FIELD_ENUM(CDP_S_LUT_ACCESS_CFG_0, LUT_TABLE_ID, LO)
		<< SHIFT(CDP_S_LUT_ACCESS_CFG_0, LUT_TABLE_ID)) |
		(FIELD_ENUM(CDP_S_LUT_ACCESS_CFG_0, LUT_ACCESS_TYPE, WRITE)
		<< SHIFT(CDP_S_LUT_ACCESS_CFG_0, LUT_ACCESS_TYPE));
	dla_reg_write(engine->driver_context, reg_base, reg);
	for (i = 0; i < (1<<LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2)+1; i++) {
		dla_reg_write(engine->driver_context,
				reg_base + access_data_offset[is_sdp],
				lut->linear_only_table[i]);
	}

	/* program other configurations: LE function and table priorities */
	reg = (map_lut_method[lut->method] <<
			SHIFT(CDP_S_LUT_CFG_0, LUT_LE_FUNCTION)) |
		(map_lut_out[lut->hybrid_priority] <<
			SHIFT(CDP_S_LUT_CFG_0, LUT_HYBRID_PRIORITY)) |
		(map_lut_out[lut->underflow_priority] <<
			SHIFT(CDP_S_LUT_CFG_0, LUT_UFLOW_PRIORITY)) |
		(map_lut_out[lut->overflow_priority] <<
			SHIFT(CDP_S_LUT_CFG_0, LUT_OFLOW_PRIORITY));
	dla_reg_write(engine->driver_context,
			reg_base + lut_cfg_offset[is_sdp], reg);

	/* Index info: an EXPONENT LE function takes an index offset,
	 * otherwise a fractional-bits select is programmed. */
	if (lut->method == FIELD_ENUM(CDP_S_LUT_CFG_0,
					LUT_LE_FUNCTION, EXPONENT)) {
		reg = ((((uint32_t)lut->linear_exp_offset.exp_offset) <<
			SHIFT(CDP_S_LUT_INFO_0, LUT_LE_INDEX_OFFSET))&
			MASK(CDP_S_LUT_INFO_0, LUT_LE_INDEX_OFFSET)) |
			((((uint32_t)lut->linear_only_offset.frac_bits) <<
			SHIFT(CDP_S_LUT_INFO_0, LUT_LO_INDEX_SELECT))&
			MASK(CDP_S_LUT_INFO_0, LUT_LO_INDEX_SELECT));
	} else {
		reg = ((((uint32_t)lut->linear_exp_offset.frac_bits) <<
			SHIFT(CDP_S_LUT_INFO_0, LUT_LE_INDEX_SELECT))&
			MASK(CDP_S_LUT_INFO_0, LUT_LE_INDEX_SELECT)) |
			((((uint32_t)lut->linear_only_offset.frac_bits) <<
			SHIFT(CDP_S_LUT_INFO_0, LUT_LO_INDEX_SELECT))&
			MASK(CDP_S_LUT_INFO_0, LUT_LO_INDEX_SELECT));
	}
	dla_reg_write(engine->driver_context,
			reg_base + lut_info_offset[is_sdp], reg);

	/* Input ranges. The CDP registers are 64-bit LOW/HIGH pairs; the
	 * SDP ones are 32-bit only, so the high word is skipped there. */
	high = HIGH32BITS(lut->linear_exp_start);
	low = LOW32BITS(lut->linear_exp_start);
	dla_reg_write(engine->driver_context,
			reg_base + le_start_offset[is_sdp], low);
	if (!is_sdp)
		dla_reg_write(engine->driver_context,
				reg_base + le_start_offset[is_sdp] + 4, high);

	high = HIGH32BITS(lut->linear_exp_end);
	low = LOW32BITS(lut->linear_exp_end);
	dla_reg_write(engine->driver_context,
			reg_base + le_end_offset[is_sdp], low);
	if (!is_sdp)
		dla_reg_write(engine->driver_context,
				reg_base + le_end_offset[is_sdp] + 4, high);

	high = HIGH32BITS(lut->linear_only_start);
	low = LOW32BITS(lut->linear_only_start);
	dla_reg_write(engine->driver_context,
			reg_base + lo_start_offset[is_sdp], low);
	if (!is_sdp)
		dla_reg_write(engine->driver_context,
				reg_base + lo_start_offset[is_sdp] + 4, high);

	high = HIGH32BITS(lut->linear_only_end);
	low = LOW32BITS(lut->linear_only_end);
	dla_reg_write(engine->driver_context,
			reg_base + lo_end_offset[is_sdp], low);
	if (!is_sdp)
		dla_reg_write(engine->driver_context,
				reg_base + lo_end_offset[is_sdp] + 4, high);

	/* Out-of-range slopes: FP16 packs the raw float bits, everything
	 * else packs integer scale + shifter pairs. */
	if (precision == PRECISION_FP16) {
		reg = (lut->linear_exp_underflow_slope.data_f <<
			SHIFT(CDP_S_LUT_LE_SLOPE_SCALE_0,
				LUT_LE_SLOPE_UFLOW_SCALE)) |
			(lut->linear_exp_overflow_slope.data_f <<
			SHIFT(CDP_S_LUT_LE_SLOPE_SCALE_0,
				LUT_LE_SLOPE_OFLOW_SCALE));
		dla_reg_write(engine->driver_context,
				reg_base + le_slope_scale_offset[is_sdp], reg);
		reg = (lut->linear_only_underflow_slope.data_f <<
			SHIFT(CDP_S_LUT_LO_SLOPE_SCALE_0,
				LUT_LO_SLOPE_UFLOW_SCALE)) |
			(lut->linear_only_overflow_slope.data_f <<
			SHIFT(CDP_S_LUT_LO_SLOPE_SCALE_0,
				LUT_LO_SLOPE_OFLOW_SCALE));
		dla_reg_write(engine->driver_context,
				reg_base + lo_slope_scale_offset[is_sdp], reg);
	} else {
		union dla_slope *oslope;
		union dla_slope *uslope;

		uslope = &lut->linear_exp_underflow_slope;
		oslope = &lut->linear_exp_overflow_slope;
		reg = ((((uint32_t)uslope->data_i.scale)
			<< SHIFT(CDP_S_LUT_LE_SLOPE_SCALE_0,
				LUT_LE_SLOPE_UFLOW_SCALE))&
			MASK(CDP_S_LUT_LE_SLOPE_SCALE_0,
				LUT_LE_SLOPE_UFLOW_SCALE)) |
			((((uint32_t)oslope->data_i.scale)
			<< SHIFT(CDP_S_LUT_LE_SLOPE_SCALE_0,
				LUT_LE_SLOPE_OFLOW_SCALE))&
			MASK(CDP_S_LUT_LE_SLOPE_SCALE_0,
				LUT_LE_SLOPE_OFLOW_SCALE));
		dla_reg_write(engine->driver_context,
				reg_base + le_slope_scale_offset[is_sdp], reg);
		reg = ((((uint32_t)uslope->data_i.shifter) <<
			SHIFT(CDP_S_LUT_LE_SLOPE_SHIFT_0,
				LUT_LE_SLOPE_UFLOW_SHIFT))&
			MASK(CDP_S_LUT_LE_SLOPE_SHIFT_0,
				LUT_LE_SLOPE_UFLOW_SHIFT)) |
			((((uint32_t)oslope->data_i.shifter) <<
			SHIFT(CDP_S_LUT_LE_SLOPE_SHIFT_0,
				LUT_LE_SLOPE_OFLOW_SHIFT))&
			MASK(CDP_S_LUT_LE_SLOPE_SHIFT_0,
				LUT_LE_SLOPE_OFLOW_SHIFT));
		dla_reg_write(engine->driver_context,
				reg_base + le_slope_shift_offset[is_sdp], reg);

		uslope = &lut->linear_only_underflow_slope;
		oslope = &lut->linear_only_overflow_slope;
		reg = ((((uint32_t)uslope->data_i.scale) <<
			SHIFT(CDP_S_LUT_LO_SLOPE_SCALE_0,
				LUT_LO_SLOPE_UFLOW_SCALE))&
			MASK(CDP_S_LUT_LO_SLOPE_SCALE_0,
				LUT_LO_SLOPE_UFLOW_SCALE)) |
			((((uint32_t)oslope->data_i.scale) <<
			SHIFT(CDP_S_LUT_LO_SLOPE_SCALE_0,
				LUT_LO_SLOPE_OFLOW_SCALE))&
			MASK(CDP_S_LUT_LO_SLOPE_SCALE_0,
				LUT_LO_SLOPE_OFLOW_SCALE));
		dla_reg_write(engine->driver_context,
				reg_base + lo_slope_scale_offset[is_sdp], reg);
		reg = ((((uint32_t)uslope->data_i.shifter) <<
			SHIFT(CDP_S_LUT_LO_SLOPE_SHIFT_0,
				LUT_LO_SLOPE_UFLOW_SHIFT))&
			MASK(CDP_S_LUT_LO_SLOPE_SHIFT_0,
				LUT_LO_SLOPE_UFLOW_SHIFT)) |
			((((uint32_t)oslope->data_i.shifter) <<
			SHIFT(CDP_S_LUT_LO_SLOPE_SHIFT_0,
				LUT_LO_SLOPE_OFLOW_SHIFT))&
			MASK(CDP_S_LUT_LO_SLOPE_SHIFT_0,
				LUT_LO_SLOPE_OFLOW_SHIFT));
		dla_reg_write(engine->driver_context,
				reg_base + lo_slope_shift_offset[is_sdp], reg);
	}
}
/*
 * validate_data_cube() - sanity-check a source/destination cube pair.
 * @src_data_cube: source cube (dimensions and memory type are checked).
 * @dst_data_cube: destination cube (same checks).
 * @mem_type:      highest memory type value allowed for either cube.
 *
 * Returns 0 on success or ERR(INVALID_INPUT) when any dimension exceeds
 * the DCUBE_MAX_* limits or a cube's memory type exceeds @mem_type.
 *
 * Fix: error messages spelled "Cube" as "Cude" and were missing the
 * trailing newline used by the other dla_error() calls in this function.
 */
int
validate_data_cube(struct dla_data_cube src_data_cube,
			struct dla_data_cube dst_data_cube,
			uint8_t mem_type)
{
	int32_t ret = 0;

	dla_trace("Enter: %s", __func__);

	if ((src_data_cube.width > DCUBE_MAX_WIDTH) ||
		(src_data_cube.height > DCUBE_MAX_HEIGHT) ||
		(src_data_cube.channel > DCUBE_MAX_CHANNEL)) {
		dla_error("Invalid SrcInput Cube[W: %u, H: %u, C: %u]\n",
				src_data_cube.width, src_data_cube.height,
				src_data_cube.channel);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

	if ((dst_data_cube.width > DCUBE_MAX_WIDTH) ||
		(dst_data_cube.height > DCUBE_MAX_HEIGHT) ||
		(dst_data_cube.channel > DCUBE_MAX_CHANNEL)) {
		dla_error("Invalid DstInput Cube[W: %u, H: %u, C: %u]\n",
				dst_data_cube.width, dst_data_cube.height,
				dst_data_cube.channel);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

	if (src_data_cube.type > mem_type) {
		dla_error("Invalid src_data.mem_type: %u\n", src_data_cube.type);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

	if (dst_data_cube.type > mem_type) {
		dla_error("Invalid dst_data.mem_type: %u\n", dst_data_cube.type);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Check that a precision selector is within the valid mapping range.
 * @precision:     requested precision value.
 * @map_precision: number of valid precision entries (exclusive bound).
 *
 * Returns 0 when @precision < @map_precision, ERR(INVALID_INPUT)
 * otherwise (and logs the offending value).
 */
int
validate_precision(uint8_t precision, uint8_t map_precision)
{
	int32_t err;

	err = (precision < map_precision) ? 0 : ERR(INVALID_INPUT);
	if (err)
		dla_error("Invalid precision: %u\n", precision);
	RETURN(err);
}

47
drivers/nvdla/common.h Normal file
View file

@ -0,0 +1,47 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Firmware helpers shared by the LUT-capable sub-engines (CDP/SDP) and
 * the data-cube validators used across processors. */
#ifndef __FIRMWARE_COMMON_H_
#define __FIRMWARE_COMMON_H_
#include <dla_interface.h>
/* Maximum cube dimensions accepted by validate_data_cube(). */
#define DCUBE_MAX_WIDTH 8192
#define DCUBE_MAX_HEIGHT 8192
#define DCUBE_MAX_CHANNEL 8192
/* Program an LE/LO lookup table into the block whose
 * S_LUT_ACCESS_CFG_0 register sits at reg_base. */
void update_lut(uint32_t reg_base,
		struct dla_lut_param *lut,
		uint8_t precision);
/* NOTE(review): the definitions in common.c use plain `int` while the
 * prototypes here say int32_t — identical on Linux targets, but the
 * types should be unified. */
int32_t validate_data_cube(struct dla_data_cube src_data_cube,
				struct dla_data_cube dst_data_cube,
				uint8_t mem_type);
int32_t validate_precision(uint8_t precision,
				uint8_t map_precision);
#endif /* __FIRMWARE_COMMON_H_ */

779
drivers/nvdla/conv.c Normal file
View file

@ -0,0 +1,779 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "common.h"
#include "dla_engine_internal.h"
#include "engine_debug.h"
/*
 * Translation tables from the interface-level convolution descriptor
 * enums to the hardware register field encodings. Each table is indexed
 * by the corresponding dla_conv_op_desc field.
 */

/* Operand precision (INT8/INT16/FP16) -> register field encoding. */
static const uint8_t map_precision[] = {
	FIELD_ENUM(CDMA_D_MISC_CFG_0, IN_PRECISION, INT8),
	FIELD_ENUM(CDMA_D_MISC_CFG_0, IN_PRECISION, INT16),
	FIELD_ENUM(CDMA_D_MISC_CFG_0, IN_PRECISION, FP16),
};
/* Convolution mode (direct vs Winograd). */
static const uint8_t map_conv[] = {
	FIELD_ENUM(CACC_D_MISC_CFG_0, CONV_MODE, DIRECT),
	FIELD_ENUM(CACC_D_MISC_CFG_0, CONV_MODE, WINOGRAD),
};
/* Weight storage format. */
static const uint8_t map_weight_fmt[] = {
	FIELD_ENUM(CSC_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT, UNCOMPRESSED),
	FIELD_ENUM(CSC_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT, COMPRESSED),
};
/* Input data format. [0] = PIXEL_FORMAT register encoding,
 * [1] = per-element byte width — presumably bytes per pixel, it is
 * passed to dla_read_input_address() in processor_conv_program();
 * confirm against the address-resolution helper. */
static const uint8_t map_img_fmt[][2] = {
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R8), 1},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R10), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R12), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R16), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R16_I), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R16_F), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A16B16G16R16), 8},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_X16B16G16R16), 8},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A16B16G16R16_F), 8},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A16Y16U16V16), 8},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_V16U16Y16A16), 8},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A16Y16U16V16_F), 8},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A8B8G8R8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A8R8G8B8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_B8G8R8A8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R8G8B8A8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_X8B8G8R8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_X8R8G8B8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_B8G8R8X8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R8G8B8X8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A2B10G10R10), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A2R10G10B10), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_B10G10R10A2), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R10G10B10A2), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A2Y10U10V10), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_V10U10Y10A2), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A8Y8U8V8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_V8U8Y8A8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y8___U8V8_N444), 1},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y8___V8U8_N444), 1},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y10___U10V10_N444), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y10___V10U10_N444), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y12___U12V12_N444), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y12___V12U12_N444), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y16___U16V16_N444), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y16___V16U16_N444), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		DATAIN_FORMAT, FEATURE), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		DATAIN_FORMAT, PIXEL), 1},
};
/* Pixel-to-memory mapping (only pitch-linear is supported here). */
static const uint8_t map_pixel[] = {
	FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, PIXEL_MAPPING, PITCH_LINEAR),
};
/* Source RAM interface selector (external MCIF vs on-chip CVIF). */
static const uint8_t map_ram[] = {
	FIELD_ENUM(CDMA_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE, MCIF),
	FIELD_ENUM(CDMA_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE, CVIF),
};
/* Global mean-subtraction enable. */
static const uint8_t map_mean[] = {
	FIELD_ENUM(CDMA_D_MEAN_FORMAT_0, MEAN_FORMAT, DISABLE),
	FIELD_ENUM(CDMA_D_MEAN_FORMAT_0, MEAN_FORMAT, ENABLE),
};
#if STAT_ENABLE
/*
 * Capture convolution performance counters into the processor's stat
 * descriptor after an operation completes: read stalls/latencies and
 * NaN/Inf input counts from CDMA, the saturation count from CACC, and
 * the wall-clock runtime since dla_conv_enable() recorded start_time.
 */
void
dla_conv_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group)
{
	uint64_t end_time = 0;
	struct dla_conv_stat_desc *conv_stat;

	conv_stat = &processor->stat_data_desc->conv_stat;
	end_time = dla_get_time_us();
	conv_stat->data_read_stall = cdma_reg_read(D_PERF_DAT_READ_STALL);
	conv_stat->weight_read_stall = cdma_reg_read(D_PERF_WT_READ_STALL);
	conv_stat->data_read_latency = cdma_reg_read(D_PERF_DAT_READ_LATENCY);
	conv_stat->weight_read_latency = cdma_reg_read(D_PERF_WT_READ_LATENCY);
	conv_stat->nan_data_num = cdma_reg_read(D_NAN_INPUT_DATA_NUM);
	conv_stat->nan_weight_num = cdma_reg_read(D_NAN_INPUT_WEIGHT_NUM);
	conv_stat->inf_data_num = cdma_reg_read(D_INF_INPUT_DATA_NUM);
	conv_stat->inf_weight_num = cdma_reg_read(D_INF_INPUT_WEIGHT_NUM);
	conv_stat->saturation_count = cacc_reg_read(D_OUT_SATURATION);
	conv_stat->runtime = (uint32_t)(end_time - group->start_time);
}

/* Dump the most recently captured convolution statistics via the
 * debug helper. */
void
dla_conv_dump_stat(struct dla_processor *processor)
{
	struct dla_conv_stat_desc *conv_stat;

	conv_stat = &processor->stat_data_desc->conv_stat;
	dla_debug_conv_stats(conv_stat);
}
#endif /* STAT_ENABLE */
/*
 * Translate an interface-level data format into the CDMA DATAIN_FORMAT
 * field: FORMAT_FEATURE selects the feature-data path, anything in
 * [FORMAT_T_R8, FORMAT_FEATURE) is a pixel format. Any other value is
 * a programming error (asserts).
 */
static uint32_t
get_in_format(uint8_t format)
{
	uint32_t in_format = 0;

	if (format == FORMAT_FEATURE)
		in_format = FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
					DATAIN_FORMAT, FEATURE);
	else if (format >= FORMAT_T_R8 && format < FORMAT_FEATURE)
		in_format = FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
					DATAIN_FORMAT, PIXEL);
	else
		assert(0);

	return in_format;
}
/*
 * Select the active register group ("producer" pointer) that subsequent
 * programming writes will target, on every convolution sub-module.
 * @group_id:      register group index (the hardware ping-pongs between
 *                 two groups selected by this pointer).
 * @rdma_group_id: unused here — the conv pipeline reports no RDMA need
 *                 (see dla_conv_rdma_check()).
 */
void
dla_conv_set_producer(int32_t group_id, int32_t rdma_group_id)
{
	uint32_t reg;

	/* set producer pointer for all sub-modules */
	reg = group_id << SHIFT(CACC_S_POINTER_0, PRODUCER);
	cacc_reg_write(S_POINTER, reg);
	cmac_a_reg_write(S_POINTER, reg);
	cmac_b_reg_write(S_POINTER, reg);
	csc_reg_write(S_POINTER, reg);
	cdma_reg_write(S_POINTER, reg);
}
/*
 * Kick off a previously programmed convolution operation.
 *
 * Busy-waits until the convolution-buffer flush reports done, arms the
 * CDMA performance counters when stats are enabled (recording the start
 * timestamp consumed by dla_conv_stat_data()), then sets OP_EN on every
 * sub-module. Returns 0.
 *
 * NOTE(review): sub-modules are enabled back-to-front (CACC first, CDMA
 * last) — presumably so no stage starts producing before its consumer
 * is ready; confirm against the hardware documentation before reordering.
 */
int
dla_conv_enable(struct dla_processor_group *group)
{
	uint32_t reg;
	struct dla_engine *engine = dla_get_engine();

	dla_trace("Enter: %s", __func__);

	/* spin until the CBUF flush has completed */
	do {
		reg = cdma_reg_read(S_CBUF_FLUSH_STATUS);
	} while (!(reg & MASK(CDMA_S_CBUF_FLUSH_STATUS_0, FLUSH_DONE)));

	if (engine->stat_enable == (uint32_t)1) {
		cdma_reg_write(D_PERF_ENABLE, 1);
		group->start_time = dla_get_time_us();
	}

	/* enable all sub-modules */
	reg = FIELD_ENUM(CACC_D_OP_ENABLE_0, OP_EN, ENABLE);
	cacc_reg_write(D_OP_ENABLE, reg);
	cmac_a_reg_write(D_OP_ENABLE, reg);
	cmac_b_reg_write(D_OP_ENABLE, reg);
	csc_reg_write(D_OP_ENABLE, reg);
	cdma_reg_write(D_OP_ENABLE, reg);

	dla_trace("Exit: %s", __func__);
	RETURN(0);
}
/*
 * Report whether this processor needs a separate RDMA programming pass.
 * The convolution pipeline fetches its own data through CDMA, so no
 * external RDMA engine is ever required.
 */
void
dla_conv_rdma_check(struct dla_processor_group *group)
{
	group->is_rdma_needed = 0;
}
/*
 * Program every convolution-core sub-module (CACC, CMAC A/B, CSC, CDMA)
 * for one operation described by the group's conv descriptors.
 *
 * Steps: resolve and alignment-check all DMA addresses (weights,
 * optional compressed-weight WMB/WGS side tables, input, output),
 * validate the output converter, verify the target register group of
 * each sub-module is idle, then write the configuration registers
 * back-to-front through the pipeline.
 *
 * Returns 0 on success or ERR(INVALID_INPUT) on any failed check
 * (via the ASSERT_GOTO/CHECK_ALIGN macros, which jump to exit).
 */
static int32_t
processor_conv_program(struct dla_processor_group *group)
{
	int32_t ret = 0;
	uint32_t reg, high, low, shift, mask;
	uint32_t stride_x, stride_y, pad_x, pad_y;
	uint64_t weight_address = 0;
	uint64_t wmb_address = 0;
	uint64_t wgs_address = 0;
	uint64_t input_address = 0;
	uint64_t output_address = 0;
	uint32_t atom_size = 0;
	bool weight_compress_support = false;
	struct dla_engine *engine = dla_get_engine();
	struct dla_conv_op_desc *conv_op;
	struct dla_conv_surface_desc *conv_surface;

	dla_trace("Enter: %s", __func__);

	weight_compress_support = engine->config_data->weight_compress_support;
	atom_size = engine->config_data->atom_size;
	conv_op = &group->operation_desc->conv_op;
	conv_surface = &group->surface_desc->conv_surface;

	/* Compressed weights need the WMB (mask) and WGS (group size)
	 * side tables; resolve and alignment-check both. */
	if (conv_op->weight_format == WEIGHT_FORMAT_COMPRESSED) {
		ASSERT_GOTO((weight_compress_support), ret, ERR(INVALID_INPUT), exit);
		ASSERT_GOTO((conv_surface->wmb_data.address != -1),
				ret, ERR(INVALID_INPUT), exit);
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->wmb_data.address,
					conv_surface->wmb_data.offset,
					(void *)&wmb_address,
					DESTINATION_DMA);
		CHECK_ALIGN(wmb_address, atom_size);
		CHECK_ALIGN(conv_surface->wmb_data.size, 128);

		ASSERT_GOTO((conv_surface->wgs_data.address != -1),
				ret, ERR(INVALID_INPUT), exit);
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->wgs_data.address,
					conv_surface->wgs_data.offset,
					(void *)&wgs_address,
					DESTINATION_DMA);
		CHECK_ALIGN(wgs_address, atom_size);
		CHECK_ALIGN(conv_surface->wgs_data.size, 4);
	}

	/* Weight cube (address -1 means "not present"). */
	if (conv_surface->weight_data.address != -1) {
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->weight_data.address,
					conv_surface->weight_data.offset,
					(void *)&weight_address,
					DESTINATION_DMA);
		CHECK_ALIGN(weight_address, atom_size);
		CHECK_ALIGN(conv_surface->weight_data.size, 128);
	}

	/* Output cube: address plus all strides must be atom-aligned. */
	if (conv_surface->dst_data.address != -1) {
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->dst_data.address,
					conv_surface->dst_data.offset,
					(void *)&output_address,
					DESTINATION_DMA);
		CHECK_ALIGN(output_address, atom_size);
		CHECK_ALIGN(conv_surface->dst_data.size, atom_size);
		CHECK_ALIGN(conv_surface->dst_data.line_stride, atom_size);
		CHECK_ALIGN(conv_surface->dst_data.surf_stride, atom_size);
	}

	/* Input address, resolved with the per-pixel byte width from
	 * map_img_fmt. */
	ret = dla_read_input_address(&conv_surface->src_data, &input_address,
					group->op_desc->index,
					group->roi_index,
					map_img_fmt[conv_op->data_format][1]);
	if (ret)
		goto exit;
	CHECK_ALIGN(input_address, atom_size);

	/* Output converter must be pass-through (scale 1, offset 0). */
	ASSERT_GOTO((conv_op->out_cvt.scale == 1),
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO((conv_op->out_cvt.offset == 0),
			ret, ERR(INVALID_INPUT), exit);

	/* check if the register group is idle in every sub-module */
	reg = cacc_reg_read(S_STATUS);
	mask = group->id ? MASK(CACC_S_STATUS_0, STATUS_1) :
				MASK(CACC_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CACC_S_STATUS_0, STATUS_1) :
				SHIFT(CACC_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CACC_S_STATUS_0, STATUS_0, IDLE)),
			ret, ERR(INVALID_INPUT), exit);

	reg = cmac_a_reg_read(S_STATUS);
	mask = group->id ? MASK(CMAC_A_S_STATUS_0, STATUS_1) :
				MASK(CMAC_A_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CMAC_A_S_STATUS_0, STATUS_1) :
				SHIFT(CMAC_A_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CMAC_A_S_STATUS_0, STATUS_0, IDLE)),
			ret, ERR(INVALID_INPUT), exit);

	reg = cmac_b_reg_read(S_STATUS);
	mask = group->id ? MASK(CMAC_B_S_STATUS_0, STATUS_1) :
				MASK(CMAC_B_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CMAC_B_S_STATUS_0, STATUS_1) :
				SHIFT(CMAC_B_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CMAC_B_S_STATUS_0, STATUS_0, IDLE)),
			ret, ERR(INVALID_INPUT), exit);

	reg = csc_reg_read(S_STATUS);
	mask = group->id ? MASK(CSC_S_STATUS_0, STATUS_1) :
				MASK(CSC_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CSC_S_STATUS_0, STATUS_1) :
				SHIFT(CSC_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CSC_S_STATUS_0, STATUS_0, IDLE)),
			ret, ERR(INVALID_INPUT), exit);

	reg = cdma_reg_read(S_STATUS);
	mask = group->id ? MASK(CDMA_S_STATUS_0, STATUS_1) :
				MASK(CDMA_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CDMA_S_STATUS_0, STATUS_1) :
				SHIFT(CDMA_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CDMA_S_STATUS_0, STATUS_0, IDLE)),
			ret, ERR(INVALID_INPUT), exit);

	/* reverse config each sub-module in CC (back of pipeline first) */
	/* CACC */
	reg = (map_conv[conv_op->conv_mode]
		<< SHIFT(CACC_D_MISC_CFG_0, CONV_MODE)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CACC_D_MISC_CFG_0, PROC_PRECISION));
	cacc_reg_write(D_MISC_CFG, reg);
	reg = ((conv_surface->dst_data.width - 1)
		<< SHIFT(CACC_D_DATAOUT_SIZE_0_0, DATAOUT_WIDTH)) |
		((conv_surface->dst_data.height - 1)
		<< SHIFT(CACC_D_DATAOUT_SIZE_0_0, DATAOUT_HEIGHT));
	cacc_reg_write(D_DATAOUT_SIZE_0, reg);
	reg = ((conv_surface->dst_data.channel - 1)
		<< SHIFT(CACC_D_DATAOUT_SIZE_1_0, DATAOUT_CHANNEL));
	cacc_reg_write(D_DATAOUT_SIZE_1, reg);
	low = LOW32BITS(output_address);
	cacc_reg_write(D_DATAOUT_ADDR, low);
	cacc_reg_write(D_BATCH_NUMBER, conv_op->batch - 1);
	cacc_reg_write(D_LINE_STRIDE, conv_surface->dst_data.line_stride);
	cacc_reg_write(D_SURF_STRIDE, conv_surface->dst_data.surf_stride);
	/* 1x1 output can use the packed layout; otherwise unpacked. */
	if (conv_surface->dst_data.width == 1 &&
		conv_surface->dst_data.height == 1) {
		ASSERT_GOTO((((uint32_t)conv_surface->dst_data.line_stride ==
			(uint32_t)(conv_surface->dst_data.width * atom_size))),
			ret, ERR(INVALID_INPUT), exit);
		reg = (CACC_D_DATAOUT_MAP_0_LINE_PACKED_TRUE <<
			SHIFT(CACC_D_DATAOUT_MAP_0, LINE_PACKED));
		reg |= (CACC_D_DATAOUT_MAP_0_SURF_PACKED_TRUE <<
			SHIFT(CACC_D_DATAOUT_MAP_0, SURF_PACKED));
	} else {
		reg = (FIELD_ENUM(CACC_D_DATAOUT_MAP_0, LINE_PACKED, FALSE) <<
			SHIFT(CACC_D_DATAOUT_MAP_0, LINE_PACKED));
		reg |= (FIELD_ENUM(CACC_D_DATAOUT_MAP_0, SURF_PACKED, FALSE) <<
			SHIFT(CACC_D_DATAOUT_MAP_0, SURF_PACKED));
	}
	cacc_reg_write(D_DATAOUT_MAP, reg);
	cacc_reg_write(D_CLIP_CFG, conv_op->out_cvt.truncate);

	/* CMAC (both halves get identical configuration) */
	reg = (map_conv[conv_op->conv_mode]
		<< SHIFT(CMAC_A_D_MISC_CFG_0, CONV_MODE)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CMAC_A_D_MISC_CFG_0, PROC_PRECISION));
	cmac_a_reg_write(D_MISC_CFG, reg);
	cmac_b_reg_write(D_MISC_CFG, reg);

	/* CSC */
	reg = (map_conv[conv_op->conv_mode]
		<< SHIFT(CSC_D_MISC_CFG_0, CONV_MODE)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CSC_D_MISC_CFG_0, IN_PRECISION)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CSC_D_MISC_CFG_0, PROC_PRECISION)) |
		(conv_op->data_reuse
		<< SHIFT(CSC_D_MISC_CFG_0, DATA_REUSE)) |
		(conv_op->weight_reuse
		<< SHIFT(CSC_D_MISC_CFG_0, WEIGHT_REUSE)) |
		(conv_op->skip_data_rls
		<< SHIFT(CSC_D_MISC_CFG_0, SKIP_DATA_RLS)) |
		(conv_op->skip_weight_rls
		<< SHIFT(CSC_D_MISC_CFG_0, SKIP_WEIGHT_RLS));
	csc_reg_write(D_MISC_CFG, reg);
	reg = (get_in_format(conv_op->data_format) <<
		SHIFT(CSC_D_DATAIN_FORMAT_0, DATAIN_FORMAT));
	csc_reg_write(D_DATAIN_FORMAT, reg);
	reg = ((conv_op->input_width_csc - 1)
		<< SHIFT(CSC_D_DATAIN_SIZE_EXT_0_0, DATAIN_WIDTH_EXT)) |
		((conv_op->input_height_csc - 1)
		<< SHIFT(CSC_D_DATAIN_SIZE_EXT_0_0, DATAIN_HEIGHT_EXT));
	csc_reg_write(D_DATAIN_SIZE_EXT_0, reg);
	reg = ((conv_op->input_channel_csc - 1)
		<< SHIFT(CSC_D_DATAIN_SIZE_EXT_1_0, DATAIN_CHANNEL_EXT));
	csc_reg_write(D_DATAIN_SIZE_EXT_1, reg);
	reg = ((conv_op->batch - 1)
		<< SHIFT(CSC_D_BATCH_NUMBER_0, BATCHES));
	csc_reg_write(D_BATCH_NUMBER, reg);
	reg = ((conv_op->post_extension)
		<< SHIFT(CSC_D_POST_Y_EXTENSION_0, Y_EXTENSION));
	csc_reg_write(D_POST_Y_EXTENSION, reg);
	reg = ((conv_op->entry_per_slice - 1)
		<< SHIFT(CSC_D_ENTRY_PER_SLICE_0, ENTRIES));
	csc_reg_write(D_ENTRY_PER_SLICE, reg);
	reg = (map_weight_fmt[conv_op->weight_format]
		<< SHIFT(CSC_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT));
	csc_reg_write(D_WEIGHT_FORMAT, reg);
	reg = ((conv_op->kernel_width_csc - 1)
		<< SHIFT(CSC_D_WEIGHT_SIZE_EXT_0_0, WEIGHT_WIDTH_EXT)) |
		((conv_op->kernel_height_csc - 1)
		<< SHIFT(CSC_D_WEIGHT_SIZE_EXT_0_0, WEIGHT_HEIGHT_EXT));
	csc_reg_write(D_WEIGHT_SIZE_EXT_0, reg);
	reg = ((conv_op->kernel_channel_csc - 1)
		<< SHIFT(CSC_D_WEIGHT_SIZE_EXT_1_0, WEIGHT_CHANNEL_EXT)) |
		((conv_surface->dst_data.channel - 1)
		<< SHIFT(CSC_D_WEIGHT_SIZE_EXT_1_0, WEIGHT_KERNEL));
	csc_reg_write(D_WEIGHT_SIZE_EXT_1, reg);
	csc_reg_write(D_WEIGHT_BYTES, conv_surface->weight_data.size);
	csc_reg_write(D_WMB_BYTES, conv_surface->wmb_data.size);
	reg = ((conv_op->input_width_cmac - 1)
		<< SHIFT(CSC_D_DATAOUT_SIZE_0_0, DATAOUT_WIDTH)) |
		((conv_op->input_height_cmac - 1)
		<< SHIFT(CSC_D_DATAOUT_SIZE_0_0, DATAOUT_HEIGHT));
	csc_reg_write(D_DATAOUT_SIZE_0, reg);
	reg = ((conv_surface->dst_data.channel - 1)
		<< SHIFT(CSC_D_DATAOUT_SIZE_1_0, DATAOUT_CHANNEL));
	csc_reg_write(D_DATAOUT_SIZE_1, reg);
	reg = ((conv_surface->dst_data.width *
		conv_surface->dst_data.height - 1)
		<< SHIFT(CSC_D_ATOMICS_0, ATOMICS));
	csc_reg_write(D_ATOMICS, reg);
	reg = ((conv_op->release - 1)
		<< SHIFT(CSC_D_RELEASE_0, RLS_SLICES));
	csc_reg_write(D_RELEASE, reg);
	/* Winograd mode programs zero stride/padding in CSC — presumably
	 * handled earlier in the pipeline; confirm against HW docs. */
	if (conv_op->conv_mode == CONV_MODE_DIRECT) {
		stride_x = conv_op->conv_stride_x - 1;
		stride_y = conv_op->conv_stride_y - 1;
		pad_x = conv_op->pad_x_left;
		pad_y = conv_op->pad_y_top;
	} else {
		stride_x = 0;
		stride_y = 0;
		pad_x = 0;
		pad_y = 0;
	}
	reg = (stride_x
		<< SHIFT(CSC_D_CONV_STRIDE_EXT_0, CONV_X_STRIDE_EXT)) |
		(stride_y
		<< SHIFT(CSC_D_CONV_STRIDE_EXT_0, CONV_Y_STRIDE_EXT));
	csc_reg_write(D_CONV_STRIDE_EXT, reg);
	reg = ((conv_op->dilation_x - 1)
		<< SHIFT(CSC_D_DILATION_EXT_0, X_DILATION_EXT)) |
		((conv_op->dilation_y - 1)
		<< SHIFT(CSC_D_DILATION_EXT_0, Y_DILATION_EXT));
	csc_reg_write(D_DILATION_EXT, reg);
	reg = (pad_x
		<< SHIFT(CSC_D_ZERO_PADDING_0, PAD_LEFT)) |
		(pad_y
		<< SHIFT(CSC_D_ZERO_PADDING_0, PAD_TOP));
	csc_reg_write(D_ZERO_PADDING, reg);
	reg = (conv_op->pad_val
		<< SHIFT(CSC_D_ZERO_PADDING_VALUE_0, PAD_VALUE)) &
		MASK(CSC_D_ZERO_PADDING_VALUE_0, PAD_VALUE);
	csc_reg_write(D_ZERO_PADDING_VALUE, reg);
	reg = ((conv_op->data_bank - 1)
		<< SHIFT(CSC_D_BANK_0, DATA_BANK)) |
		((conv_op->weight_bank - 1)
		<< SHIFT(CSC_D_BANK_0, WEIGHT_BANK));
	csc_reg_write(D_BANK, reg);
	csc_reg_write(D_PRA_CFG, conv_op->pra_truncate);

	/* CBUF */
	/* there's no CBUF register */

	/* CDMA */
	reg = (map_conv[conv_op->conv_mode]
		<< SHIFT(CDMA_D_MISC_CFG_0, CONV_MODE)) |
		(map_precision[conv_op->in_precision]
		<< SHIFT(CDMA_D_MISC_CFG_0, IN_PRECISION)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CDMA_D_MISC_CFG_0, PROC_PRECISION)) |
		(conv_op->data_reuse
		<< SHIFT(CDMA_D_MISC_CFG_0, DATA_REUSE)) |
		(conv_op->weight_reuse
		<< SHIFT(CDMA_D_MISC_CFG_0, WEIGHT_REUSE)) |
		(conv_op->skip_data_rls
		<< SHIFT(CDMA_D_MISC_CFG_0, SKIP_DATA_RLS)) |
		(conv_op->skip_weight_rls
		<< SHIFT(CDMA_D_MISC_CFG_0, SKIP_WEIGHT_RLS));
	cdma_reg_write(D_MISC_CFG, reg);
	reg = (get_in_format(conv_op->data_format) <<
		SHIFT(CDMA_D_DATAIN_FORMAT_0, DATAIN_FORMAT)) |
		(map_img_fmt[conv_op->data_format][0]
		<< SHIFT(CDMA_D_DATAIN_FORMAT_0, PIXEL_FORMAT)) |
		(map_pixel[conv_op->pixel_mapping]
		<< SHIFT(CDMA_D_DATAIN_FORMAT_0, PIXEL_MAPPING)) |
		(conv_op->pixel_override
		<< SHIFT(CDMA_D_DATAIN_FORMAT_0, PIXEL_SIGN_OVERRIDE));
	cdma_reg_write(D_DATAIN_FORMAT, reg);
	reg = ((conv_surface->src_data.width - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_0_0, DATAIN_WIDTH)) |
		((conv_surface->src_data.height - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_0_0, DATAIN_HEIGHT));
	cdma_reg_write(D_DATAIN_SIZE_0, reg);
	reg = ((conv_surface->src_data.channel - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_1_0, DATAIN_CHANNEL));
	cdma_reg_write(D_DATAIN_SIZE_1, reg);
	reg = ((conv_op->input_width_csc - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_EXT_0_0, DATAIN_WIDTH_EXT)) |
		((conv_op->input_height_csc - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_EXT_0_0, DATAIN_HEIGHT_EXT));
	cdma_reg_write(D_DATAIN_SIZE_EXT_0, reg);
	reg = (map_ram[conv_surface->src_data.type]
		<< SHIFT(CDMA_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE));
	cdma_reg_write(D_DAIN_RAM_TYPE, reg);
	/* Plane 0 address, then plane 1 at offset_u (UV data for the
	 * semi-planar YUV formats). */
	high = HIGH32BITS(input_address);
	low = LOW32BITS(input_address);
	cdma_reg_write(D_DAIN_ADDR_HIGH_0, high);
	cdma_reg_write(D_DAIN_ADDR_LOW_0, low);
	high = HIGH32BITS((input_address + conv_surface->offset_u));
	low = LOW32BITS(input_address + conv_surface->offset_u);
	cdma_reg_write(D_DAIN_ADDR_HIGH_1, high);
	cdma_reg_write(D_DAIN_ADDR_LOW_1, low);
	cdma_reg_write(D_LINE_STRIDE, conv_surface->src_data.line_stride);
	cdma_reg_write(D_SURF_STRIDE, conv_surface->src_data.surf_stride);
	cdma_reg_write(D_LINE_UV_STRIDE, conv_surface->in_line_uv_stride);
	/* Packed flags are derived by comparing the actual strides
	 * against the fully-packed stride values. */
	reg = ((conv_surface->src_data.line_stride ==
		((uint32_t)conv_surface->src_data.width * atom_size))
		<< SHIFT(CDMA_D_DAIN_MAP_0, LINE_PACKED));
	reg |= ((conv_surface->src_data.surf_stride ==
		((uint32_t)(conv_surface->src_data.width *
		conv_surface->src_data.height) * atom_size))
		<< SHIFT(CDMA_D_DAIN_MAP_0, SURF_PACKED));
	cdma_reg_write(D_DAIN_MAP, reg);
	reg = ((conv_op->batch - 1)
		<< SHIFT(CDMA_D_BATCH_NUMBER_0, BATCHES));
	cdma_reg_write(D_BATCH_NUMBER, reg);
	cdma_reg_write(D_BATCH_STRIDE, conv_op->batch_stride);
	reg = ((conv_op->entry_per_slice - 1)
		<< SHIFT(CDMA_D_ENTRY_PER_SLICE_0, ENTRIES));
	cdma_reg_write(D_ENTRY_PER_SLICE, reg);
	reg = ((conv_op->fetch_grain - 1)
		<< SHIFT(CDMA_D_FETCH_GRAIN_0, GRAINS));
	cdma_reg_write(D_FETCH_GRAIN, reg);
	reg = (map_weight_fmt[conv_op->weight_format]
		<< SHIFT(CDMA_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT));
	cdma_reg_write(D_WEIGHT_FORMAT, reg);
	reg = ((conv_op->bytes_per_kernel - 1)
		<< SHIFT(CDMA_D_WEIGHT_SIZE_0_0, BYTE_PER_KERNEL));
	cdma_reg_write(D_WEIGHT_SIZE_0, reg);
	reg = ((conv_surface->dst_data.channel - 1)
		<< SHIFT(CDMA_D_WEIGHT_SIZE_1_0, WEIGHT_KERNEL));
	cdma_reg_write(D_WEIGHT_SIZE_1, reg);
	reg = (map_ram[conv_surface->weight_data.type]
		<< SHIFT(CDMA_D_WEIGHT_RAM_TYPE_0, WEIGHT_RAM_TYPE));
	cdma_reg_write(D_WEIGHT_RAM_TYPE, reg);
	high = HIGH32BITS(weight_address);
	low = LOW32BITS(weight_address);
	cdma_reg_write(D_WEIGHT_ADDR_HIGH, high);
	cdma_reg_write(D_WEIGHT_ADDR_LOW, low);
	cdma_reg_write(D_WEIGHT_BYTES, conv_surface->weight_data.size);
	/* Compressed-weight side tables (resolved above). */
	if (conv_op->weight_format == WEIGHT_FORMAT_COMPRESSED) {
		high = HIGH32BITS(wgs_address);
		low = LOW32BITS(wgs_address);
		cdma_reg_write(D_WGS_ADDR_HIGH, high);
		cdma_reg_write(D_WGS_ADDR_LOW, low);
		high = HIGH32BITS(wmb_address);
		low = LOW32BITS(wmb_address);
		cdma_reg_write(D_WMB_ADDR_HIGH, high);
		cdma_reg_write(D_WMB_ADDR_LOW, low);
		cdma_reg_write(D_WMB_BYTES, conv_surface->wmb_data.size);
	}
	reg = (map_mean[conv_op->mean_format]
		<< SHIFT(CDMA_D_MEAN_FORMAT_0, MEAN_FORMAT));
	cdma_reg_write(D_MEAN_FORMAT, reg);
	if (conv_op->mean_format == MEAN_FORMAT_ENABLE) {
		reg = ((conv_op->mean_ry
			<< SHIFT(CDMA_D_MEAN_GLOBAL_0_0, MEAN_RY)) &
			MASK(CDMA_D_MEAN_GLOBAL_0_0, MEAN_RY)) |
			((conv_op->mean_gu
			<< SHIFT(CDMA_D_MEAN_GLOBAL_0_0, MEAN_GU)) &
			MASK(CDMA_D_MEAN_GLOBAL_0_0, MEAN_GU));
		cdma_reg_write(D_MEAN_GLOBAL_0, reg);
		reg = ((conv_op->mean_bv
			<< SHIFT(CDMA_D_MEAN_GLOBAL_1_0, MEAN_BV))&
			MASK(CDMA_D_MEAN_GLOBAL_1_0, MEAN_BV)) |
			((conv_op->mean_ax
			<< SHIFT(CDMA_D_MEAN_GLOBAL_1_0, MEAN_AX))&
			MASK(CDMA_D_MEAN_GLOBAL_1_0, MEAN_AX));
		cdma_reg_write(D_MEAN_GLOBAL_1, reg);
	}
	/* Optional input converter. */
	if (conv_op->in_cvt.enable) {
		reg = ((FIELD_ENUM(CDMA_D_CVT_CFG_0, CVT_EN, ENABLE))
			<< SHIFT(CDMA_D_CVT_CFG_0, CVT_EN)) |
			(conv_op->in_cvt.truncate
			<< SHIFT(CDMA_D_CVT_CFG_0, CVT_TRUNCATE));
		cdma_reg_write(D_CVT_CFG, reg);
		cdma_reg_write(D_CVT_OFFSET, conv_op->in_cvt.offset);
		cdma_reg_write(D_CVT_SCALE, conv_op->in_cvt.scale);
	} else {
		reg = ((FIELD_ENUM(CDMA_D_CVT_CFG_0, CVT_EN, DISABLE))
			<< SHIFT(CDMA_D_CVT_CFG_0, CVT_EN));
		cdma_reg_write(D_CVT_CFG, reg);
	}
	reg = ((conv_op->conv_stride_x - 1)
		<< SHIFT(CDMA_D_CONV_STRIDE_0, CONV_X_STRIDE)) |
		((conv_op->conv_stride_y - 1)
		<< SHIFT(CDMA_D_CONV_STRIDE_0, CONV_Y_STRIDE));
	cdma_reg_write(D_CONV_STRIDE, reg);
	reg = (conv_op->pad_x_left <<
		SHIFT(CDMA_D_ZERO_PADDING_0, PAD_LEFT)) |
		(conv_op->pad_x_right
		<< SHIFT(CDMA_D_ZERO_PADDING_0, PAD_RIGHT)) |
		(conv_op->pad_y_top
		<< SHIFT(CDMA_D_ZERO_PADDING_0, PAD_TOP)) |
		(conv_op->pad_y_bottom
		<< SHIFT(CDMA_D_ZERO_PADDING_0, PAD_BOTTOM));
	cdma_reg_write(D_ZERO_PADDING, reg);
	reg = conv_op->pad_val <<
		SHIFT(CDMA_D_ZERO_PADDING_VALUE_0, PAD_VALUE) &
		MASK(CDMA_D_ZERO_PADDING_VALUE_0, PAD_VALUE);
	cdma_reg_write(D_ZERO_PADDING_VALUE, reg);
	reg = ((conv_op->weight_bank - 1)
		<< SHIFT(CDMA_D_BANK_0, WEIGHT_BANK)) |
		((conv_op->data_bank - 1)
		<< SHIFT(CDMA_D_BANK_0, DATA_BANK));
	cdma_reg_write(D_BANK, reg);

exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Readiness hook for the convolution processor.
 * Programming can always proceed, so this unconditionally reports
 * ready; both arguments are unused.
 */
int
dla_conv_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group)
{
	(void)processor;
	(void)group;

	return 1;
}
void
dla_conv_dump_config(struct dla_processor_group *group)
{
	/* Dump the convolution surface and operation descriptors
	 * for this group's current ROI to the debug log. */
	struct dla_conv_surface_desc *surf_desc =
					&group->surface_desc->conv_surface;
	struct dla_conv_op_desc *op_desc =
					&group->operation_desc->conv_op;

	dla_debug_conv_surface_desc(surf_desc, group->roi_index);
	dla_debug_conv_op_desc(op_desc, group->roi_index);
}
int
dla_conv_program(struct dla_processor_group *group)
{
	int32_t ret;

	dla_trace("Enter: %s", __func__);

	/*
	 * processor_conv_program() performs all of the convolution
	 * sub-unit programming; its status is returned to the caller
	 * unchanged.  The original "if (ret) goto exit;" jumped to the
	 * statement immediately following it, so that dead branch (and
	 * the label) are dropped.
	 */
	ret = processor_conv_program(group);

	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}

View file

@ -0,0 +1,361 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FIRMWARE_DLA_ENGINE_INTERNAL_H_
#define __FIRMWARE_DLA_ENGINE_INTERNAL_H_

#include <opendla.h>
#include <dla_engine.h>
#include <dla_interface.h>
#include <dla_debug.h>

#include "nvdla_interface.h"

/*
 * Extract a bit field from @num.  @range is written as "high:low"; the
 * "(1 ? range)" / "(0 ? range)" ternary trick evaluates to the high
 * and low bound respectively.  @num is parenthesized so that
 * expression arguments (e.g. "a | b") are masked as a whole instead of
 * being split by operator precedence.
 */
#define BITS(num, range) ((((0xFFFFFFFF >> (31 - (1 ? range))) & \
			(0xFFFFFFFF << (0 ? range))) & (num)) >> \
			(0 ? range))

/*
 * Split a 64-bit value into its high/low 32-bit halves.  The argument
 * is parenthesized so expressions such as "base + offset" are shifted
 * as a whole.
 */
#define HIGH32BITS(val64bit) ((uint32_t)((val64bit) >> 32))
#define LOW32BITS(val64bit) ((uint32_t)(val64bit))

#ifdef MIN
#undef MIN
#endif /* MIN */

#ifdef MAX
#undef MAX
#endif /* MAX */

/* NOTE: both arguments are evaluated twice -- no side effects here. */
#define MIN(a, b) ((a) > (b) ? (b) : (a))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/*********************************************************/
/******************** Utilities **************************/
/*********************************************************/
#ifdef DEBUG
#define CHECK_ALIGN(val, align) assert((val&(align-1)) == 0)
#else
#define CHECK_ALIGN(val, align)
#endif /* DEBUG */

/* Token-pasting helpers for the auto-generated register definitions:
 * <REG>_<FIELD>_FIELD is the mask, <REG>_<FIELD>_SHIFT the bit shift,
 * <REG>_<FIELD>_<ENUM> a named field value. */
#define MASK(reg, field) (reg##_##field##_FIELD)
#define FIELD_ENUM(r, f, e) (r##_##f##_##e)
#define SHIFT(reg, field) (reg##_##field##_SHIFT)

/* Expand a short register name to its per-sub-module "<UNIT>_<name>_0"
 * address macro. */
#define GLB_REG(name) GLB_##name##_0
#define MCIF_REG(name) MCIF_##name##_0
#define CVIF_REG(name) CVIF_##name##_0
#define BDMA_REG(name) BDMA_##name##_0
#define CDMA_REG(name) CDMA_##name##_0
#define CSC_REG(name) CSC_##name##_0
#define CMAC_A_REG(name) CMAC_A_##name##_0
#define CMAC_B_REG(name) CMAC_B_##name##_0
#define CACC_REG(name) CACC_##name##_0
#define SDP_RDMA_REG(name) SDP_RDMA_##name##_0
#define SDP_REG(name) SDP_##name##_0
#define PDP_RDMA_REG(name) PDP_RDMA_##name##_0
#define PDP_REG(name) PDP_##name##_0
#define CDP_RDMA_REG(name) CDP_RDMA_##name##_0
#define CDP_REG(name) CDP_##name##_0
#define RBK_REG(name) RBK_##name##_0

/* alias for register read for each sub-module */
#define glb_reg_read(reg) reg_read(GLB_REG(reg))
#define bdma_reg_read(reg) reg_read(BDMA_REG(reg))
#define cdma_reg_read(reg) reg_read(CDMA_REG(reg))
#define csc_reg_read(reg) reg_read(CSC_REG(reg))
#define cmac_a_reg_read(reg) reg_read(CMAC_A_REG(reg))
#define cmac_b_reg_read(reg) reg_read(CMAC_B_REG(reg))
#define cacc_reg_read(reg) reg_read(CACC_REG(reg))
#define sdp_rdma_reg_read(reg) reg_read(SDP_RDMA_REG(reg))
#define sdp_reg_read(reg) reg_read(SDP_REG(reg))
#define pdp_rdma_reg_read(reg) reg_read(PDP_RDMA_REG(reg))
#define pdp_reg_read(reg) reg_read(PDP_REG(reg))
#define cdp_rdma_reg_read(reg) reg_read(CDP_RDMA_REG(reg))
#define cdp_reg_read(reg) reg_read(CDP_REG(reg))
#define rubik_reg_read(reg) reg_read(RBK_REG(reg))

/* alias for register write for each sub-module */
#define glb_reg_write(reg, val) reg_write(GLB_REG(reg), val)
#define bdma_reg_write(reg, val) reg_write(BDMA_REG(reg), val)
#define cdma_reg_write(reg, val) reg_write(CDMA_REG(reg), val)
#define csc_reg_write(reg, val) reg_write(CSC_REG(reg), val)
#define cmac_a_reg_write(reg, val) reg_write(CMAC_A_REG(reg), val)
#define cmac_b_reg_write(reg, val) reg_write(CMAC_B_REG(reg), val)
#define cacc_reg_write(reg, val) reg_write(CACC_REG(reg), val)
#define sdp_rdma_reg_write(reg, val) reg_write(SDP_RDMA_REG(reg), val)
#define sdp_reg_write(reg, val) reg_write(SDP_REG(reg), val)
#define pdp_rdma_reg_write(reg, val) reg_write(PDP_RDMA_REG(reg), val)
#define pdp_reg_write(reg, val) reg_write(PDP_REG(reg), val)
#define cdp_rdma_reg_write(reg, val) reg_write(CDP_RDMA_REG(reg), val)
#define cdp_reg_write(reg, val) reg_write(CDP_REG(reg), val)
#define rubik_reg_write(reg, val) reg_write(RBK_REG(reg), val)

/* Raw register accessors, implemented in engine_data.c on top of the
 * portability layer. */
void reg_write(uint32_t addr, uint32_t reg);
uint32_t reg_read(uint32_t addr);

/**
 * Operation descriptor cache functions
 */
void
dla_put_op_desc(struct dla_common_op_desc *op_desc);
struct dla_common_op_desc
*dla_get_op_desc(struct dla_task *task,
			int16_t index,
			uint8_t op_type,
			uint8_t roi_index);
void
dla_dump_op_desc(struct dla_common_op_desc *desc);
void
dla_get_refcount(struct dla_common_op_desc *op_desc);
void
dla_init_op_cache(struct dla_engine *engine);

/**
 * Operation completion handler
 */
int
dla_op_completion(struct dla_processor *processor,
			struct dla_processor_group *group);

int32_t
dla_read_lut(struct dla_engine *engine, int16_t index, void *dst);

/*
 * Interrupt mask control.  Declared int32_t to match the definitions
 * in engine.c (they previously disagreed as plain "int").
 */
int32_t
dla_enable_intr(uint32_t mask);
int32_t
dla_disable_intr(uint32_t mask);

int
utils_get_free_group(struct dla_processor *processor,
			uint8_t *group_id,
			uint8_t *rdma_id);

int32_t
dla_get_dma_cube_address(void *driver_context,
			void *task_data,
			int16_t index,
			uint32_t offset,
			void *dst_ptr,
			uint32_t destination);

int
dla_read_input_address(struct dla_data_cube *data,
			uint64_t *address,
			int16_t op_index,
			uint8_t roi_index,
			uint8_t bpp);

/**
 * BDMA operations
 */
void
dla_bdma_set_producer(int32_t group_id, int32_t rdma_group_id);
int
dla_bdma_enable(struct dla_processor_group *group);
int
dla_bdma_program(struct dla_processor_group *group);
int
dla_bdma_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_bdma_dump_config(struct dla_processor_group *group);
void
dla_bdma_rdma_check(struct dla_processor_group *group);
#if STAT_ENABLE
void
dla_bdma_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_bdma_dump_stat(struct dla_processor *processor);
#else
static inline void
dla_bdma_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group) {}
static inline void
dla_bdma_dump_stat(struct dla_processor *processor) {}
#endif /* STAT_ENABLE */

/**
 * Convolution operations
 */
void
dla_conv_set_producer(int32_t group_id, int32_t rdma_group_id);
int
dla_conv_enable(struct dla_processor_group *group);
int
dla_conv_program(struct dla_processor_group *group);
int
dla_conv_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_conv_dump_config(struct dla_processor_group *group);
void
dla_conv_rdma_check(struct dla_processor_group *group);
#if STAT_ENABLE
void
dla_conv_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_conv_dump_stat(struct dla_processor *processor);
#else
static inline void
dla_conv_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group) {}
static inline void
dla_conv_dump_stat(struct dla_processor *processor) {}
#endif /* STAT_ENABLE */

/**
 * SDP operations
 */
void
dla_sdp_set_producer(int32_t group_id, int32_t rdma_group_id);
int
dla_sdp_enable(struct dla_processor_group *group);
int
dla_sdp_program(struct dla_processor_group *group);
int
dla_sdp_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_sdp_dump_config(struct dla_processor_group *group);
void
dla_sdp_rdma_check(struct dla_processor_group *group);
#if STAT_ENABLE
void
dla_sdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_sdp_dump_stat(struct dla_processor *processor);
#else
static inline void
dla_sdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group) {}
static inline void
dla_sdp_dump_stat(struct dla_processor *processor) {}
#endif /* STAT_ENABLE */

/**
 * PDP operations
 */
void
dla_pdp_set_producer(int32_t group_id, int32_t rdma_group_id);
int
dla_pdp_enable(struct dla_processor_group *group);
int
dla_pdp_program(struct dla_processor_group *group);
int
dla_pdp_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_pdp_dump_config(struct dla_processor_group *group);
void
dla_pdp_rdma_check(struct dla_processor_group *group);
#if STAT_ENABLE
void
dla_pdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_pdp_dump_stat(struct dla_processor *processor);
#else
static inline void
dla_pdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group) {}
static inline void
dla_pdp_dump_stat(struct dla_processor *processor) {}
#endif /* STAT_ENABLE */

/**
 * CDP operations
 */
void
dla_cdp_set_producer(int32_t group_id, int32_t rdma_group_id);
int
dla_cdp_enable(struct dla_processor_group *group);
int
dla_cdp_program(struct dla_processor_group *group);
int
dla_cdp_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_cdp_dump_config(struct dla_processor_group *group);
void
dla_cdp_rdma_check(struct dla_processor_group *group);
#if STAT_ENABLE
void
dla_cdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_cdp_dump_stat(struct dla_processor *processor);
#else
static inline void
dla_cdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group) {}
static inline void
dla_cdp_dump_stat(struct dla_processor *processor) {}
#endif /* STAT_ENABLE */

/**
 * RUBIK operations
 */
void
dla_rubik_set_producer(int32_t group_id, int32_t rdma_group_id);
int
dla_rubik_enable(struct dla_processor_group *group);
int
dla_rubik_program(struct dla_processor_group *group);
int
dla_rubik_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_rubik_dump_config(struct dla_processor_group *group);
void
dla_rubik_rdma_check(struct dla_processor_group *group);
#if STAT_ENABLE
void
dla_rubik_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_rubik_dump_stat(struct dla_processor *processor);
#else
static inline void
dla_rubik_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group) {}
static inline void
dla_rubik_dump_stat(struct dla_processor *processor) {}
#endif /* STAT_ENABLE */

#endif /* __FIRMWARE_DLA_ENGINE_INTERNAL_H_ */

262
drivers/nvdla/engine.c Normal file
View file

@ -0,0 +1,262 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "dla_engine_internal.h"
#include "common.h"
/*
 * Per-processor register address tables, indexed by the processor's
 * op_type (see the reads in utils_get_free_group() below).  The entry
 * order must match the DLA_OP_* enumeration -- presumably BDMA, CONV,
 * SDP, PDP, CDP, RUBIK (TODO confirm against dla_interface.h).
 *
 * RDMA consumer-pointer registers; 0xFFFFFFFF marks processors that
 * have no separate RDMA pointer register.
 */
static const uint32_t map_rdma_ptr_addr[] = {
	0xFFFFFFFF,
	0xFFFFFFFF,
	SDP_REG(RDMA_S_POINTER),
	PDP_REG(RDMA_S_POINTER),
	CDP_REG(RDMA_S_POINTER),
	0xFFFFFFFF,
};

/* Per-processor status registers. */
static const uint32_t map_sts_addr[] = {
	BDMA_REG(STATUS),
	CACC_REG(S_STATUS),
	SDP_REG(S_STATUS),
	PDP_REG(S_STATUS),
	CDP_REG(S_STATUS),
	RBK_REG(S_STATUS),
};

/*
 * Per-processor consumer-pointer registers.  BDMA has no S_POINTER;
 * its STATUS register is listed here and decoded specially (GRP0_BUSY)
 * in utils_get_free_group().
 */
static const uint32_t map_ptr_addr[] = {
	BDMA_REG(STATUS),
	CACC_REG(S_POINTER),
	SDP_REG(S_POINTER),
	PDP_REG(S_POINTER),
	CDP_REG(S_POINTER),
	RBK_REG(S_POINTER),
};
int32_t dla_enable_intr(uint32_t mask)
{
	/* Enabling an interrupt means clearing its bit in S_INTR_MASK. */
	uint32_t intr_mask = glb_reg_read(S_INTR_MASK);

	intr_mask &= ~mask;
	glb_reg_write(S_INTR_MASK, intr_mask);

	RETURN(0);
}
int32_t dla_disable_intr(uint32_t mask)
{
	/* Disabling an interrupt means setting its bit in S_INTR_MASK. */
	uint32_t intr_mask = glb_reg_read(S_INTR_MASK);

	intr_mask |= mask;
	glb_reg_write(S_INTR_MASK, intr_mask);

	RETURN(0);
}
/*
 * Per-group BDMA idle/busy status, initialized to IDLE.
 * NOTE(review): deliberately non-static -- presumably shared with the
 * BDMA processor code in another translation unit; confirm before
 * narrowing the linkage.
 */
uint8_t bdma_grp_sts[2] = {
	FIELD_ENUM(BDMA_STATUS_0, IDLE, YES),
	FIELD_ENUM(BDMA_STATUS_0, IDLE, YES)
};

/* Scratch ROI descriptor filled by dla_read_input_address(). */
struct dla_roi_desc roi_desc;
/**
 * Resolve a DMA data-cube address.
 *
 * Looks up address-list entry @index through dla_get_dma_address() and
 * adds @offset to the resolved 64-bit address stored at @dst_ptr.
 * Returns 0 on success or the lookup error unchanged.
 */
int32_t
dla_get_dma_cube_address(void *driver_context, void *task_data,
				int16_t index, uint32_t offset, void *dst_ptr,
				uint32_t destination)
{
	int32_t err;

	err = dla_get_dma_address(driver_context, task_data, index,
					dst_ptr, destination);
	if (err == 0)
		*(uint64_t *)dst_ptr += offset;

	return err;
}
/**
 * Read input buffer address
 *
 * For input layer, in case of static ROI this address is read
 * from address list and index is specified in data cube. In case
 * dynamic ROI, it has to be read depending on ROI information
 * and using surface address
 *
 * For all other layers, this address is read from address list
 * using index specified in data cube
 *
 * Returns 0 on success, ERR(INVALID_INPUT) when no address can be
 * resolved, or the error from the address-list / data read helpers.
 */
int
dla_read_input_address(struct dla_data_cube *data,
			uint64_t *address,
			int16_t op_index,
			uint8_t roi_index,
			uint8_t bpp)
{
	uint64_t roi_desc_addr;
	int32_t ret = ERR(INVALID_INPUT);
	struct dla_engine *en = dla_get_engine();

	/**
	 * If memory type is HW then no address required
	 */
	if (data->type == DLA_MEM_HW) {
		ret = 0;
		goto exit;
	}

	/**
	 * If address list index is not -1 means this address has to
	 * be read from address list
	 */
	if (data->address != -1) {
		/**
		 * But if other parameters indicate that this is input layer
		 * for dynamic ROI then it is an error
		 */
		if (en->network->dynamic_roi &&
			en->network->input_layer == op_index)
			goto exit;
		ret = dla_get_dma_cube_address(en->driver_context,
						en->task->task_data,
						data->address,
						data->offset,
						(void *)address,
						DESTINATION_DMA);
		goto exit;
	}

	/**
	 * Check if it is dynamic ROI and this is input layer
	 */
	if (en->network->dynamic_roi && en->network->input_layer == op_index) {
		/* Dynamic ROI requires a surface base address. */
		if (!en->task->surface_addr)
			goto exit;
		/* Calculate address of ROI descriptor in array */
		roi_desc_addr = en->task->roi_array_addr;
		/*
		 * Read ROI descriptor.  The destination is the file-scope
		 * scratch variable roi_desc, so this path is not reentrant
		 * -- NOTE(review): confirm single-threaded use by the
		 * scheduler.  The offset skips the dla_roi_array_desc
		 * header plus the preceding ROI entries.
		 */
		ret = dla_data_read(en->driver_context,
				en->task->task_data,
				roi_desc_addr,
				(void *)&roi_desc,
				sizeof(roi_desc),
				sizeof(struct dla_roi_array_desc) +
				roi_index * sizeof(struct dla_roi_desc));
		if (ret)
			goto exit;
		/* Calculate ROI address: surface base plus the ROI's
		 * top/left pixel offset (bpp = bytes per pixel). */
		*address = en->task->surface_addr;
		*address += (roi_desc.top * data->line_stride) +
						(bpp * roi_desc.left);
	}
exit:
	RETURN(ret);
}
/*
 * Pick a free group (and RDMA group, where the processor has one) for
 * the next operation on @processor.
 *
 * Hardware consumer pointers are read from the per-processor pointer
 * registers (BDMA is decoded specially from its STATUS register); the
 * software group_status/rdma_status bitmasks then decide whether the
 * hardware pointer or the single idle slot is chosen.  Returns
 * ERR(PROCESSOR_BUSY) when both groups are already programmed.
 */
int
utils_get_free_group(struct dla_processor *processor,
			uint8_t *group_id,
			uint8_t *rdma_id)
{
	int32_t ret = 0;
	uint32_t pointer;
	uint32_t hw_consumer_ptr;
	uint32_t hw_rdma_ptr;

	hw_rdma_ptr = 0;
	if (processor->op_type == DLA_OP_BDMA) {
		/* BDMA has no consumer pointer register; derive it from
		 * the GRP0_BUSY bit of the STATUS register instead. */
		pointer = reg_read(map_ptr_addr[processor->op_type]);
		hw_consumer_ptr = ((pointer & MASK(BDMA_STATUS_0, GRP0_BUSY)) >>
				SHIFT(BDMA_STATUS_0, GRP0_BUSY)) ==
				FIELD_ENUM(BDMA_STATUS_0, GRP0_BUSY, YES) ?
				1 : 0;
	} else {
		/* The CONSUMER field layout is shared across processors;
		 * the CDP register definition is used for all of them. */
		pointer = reg_read(map_ptr_addr[processor->op_type]);
		hw_consumer_ptr = (pointer & MASK(CDP_S_POINTER_0, CONSUMER)) >>
						SHIFT(CDP_S_POINTER_0, CONSUMER);
		/**
		 * Read current consumer pointer for RDMA only if processor
		 * has RDMA module
		 */
		if (map_rdma_ptr_addr[processor->op_type] != 0xFFFFFFFF) {
			pointer =
				reg_read(map_rdma_ptr_addr[processor->op_type]);
			hw_rdma_ptr = (pointer &
					MASK(CDP_S_POINTER_0, CONSUMER)) >>
					SHIFT(CDP_S_POINTER_0, CONSUMER);
		}
	}

	/**
	 * If both processors are programmed then exit
	 */
	if (processor->group_status == 0x3) {
		ret = ERR(PROCESSOR_BUSY);
		goto exit;
	}

	if (!processor->group_status)
		/**
		 * If both groups are idle then use consumer pointer
		 */
		*group_id = hw_consumer_ptr;
	else
		/**
		 * Here it is assumed that only one group is idle or busy
		 * and hence right shift will work to get correct
		 * group id
		 */
		*group_id = !(processor->group_status >> 1);

	/**
	 * If both groups are idle then read group id from pointer
	 */
	if (!processor->rdma_status)
		*rdma_id = hw_rdma_ptr;
	else
		*rdma_id = !(processor->rdma_status >> 1);
exit:
	RETURN(ret);
}

303
drivers/nvdla/engine_data.c Normal file
View file

@ -0,0 +1,303 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <nvdla_interface.h>
#include <dla_interface.h>
#include "dla_engine_internal.h"
/*
 * Statically allocated descriptor storage: one operation container and
 * one surface container per processor type and per hardware group.
 */
static union dla_operation_container operation_desc[DLA_OP_NUM][DLA_NUM_GROUPS];
static union dla_surface_container surface_desc[DLA_OP_NUM][DLA_NUM_GROUPS];

/* Task state shared across the engine; bound in dla_register_driver(). */
static struct dla_task global_task;

/*
 * The single global engine instance.  Each processor entry wires up
 * the per-processor callbacks (program/enable/set_producer/...) and
 * points its two groups at the static descriptor storage above.
 * lut_index starts at -1, meaning no LUT is bound to the group yet.
 */
static struct dla_engine engine = {
	/* BDMA processor and its two groups. */
	.processors[DLA_OP_BDMA] = {
		.name = "BDMA",
		.op_type = DLA_OP_BDMA,
		.program = dla_bdma_program,
		.enable = dla_bdma_enable,
		.set_producer = dla_bdma_set_producer,
		.is_ready = dla_bdma_is_ready,
		.dump_config = dla_bdma_dump_config,
		.rdma_check = dla_bdma_rdma_check,
		.get_stat_data = dla_bdma_stat_data,
		.dump_stat = dla_bdma_dump_stat,
		.consumer_ptr = 0,
		.roi_index = 0,
		.group_status = 0,
		.rdma_status = 0,
		.last_group = 1,
		.groups[0] = {
			.id = 0,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_BDMA][0],
			.surface_desc = &surface_desc[DLA_OP_BDMA][0],
		},
		.groups[1] = {
			.id = 1,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_BDMA][1],
			.surface_desc = &surface_desc[DLA_OP_BDMA][1],
		},
	},
	/* Convolution processor and its two groups. */
	.processors[DLA_OP_CONV] = {
		.name = "Convolution",
		.op_type = DLA_OP_CONV,
		.program = dla_conv_program,
		.enable = dla_conv_enable,
		.set_producer = dla_conv_set_producer,
		.is_ready = dla_conv_is_ready,
		.dump_config = dla_conv_dump_config,
		.rdma_check = dla_conv_rdma_check,
		.get_stat_data = dla_conv_stat_data,
		.dump_stat = dla_conv_dump_stat,
		.consumer_ptr = 0,
		.roi_index = 0,
		.group_status = 0,
		.rdma_status = 0,
		.last_group = 1,
		.groups[0] = {
			.id = 0,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_CONV][0],
			.surface_desc = &surface_desc[DLA_OP_CONV][0],
		},
		.groups[1] = {
			.id = 1,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_CONV][1],
			.surface_desc = &surface_desc[DLA_OP_CONV][1],
		},
	},
	/* SDP processor and its two groups. */
	.processors[DLA_OP_SDP] = {
		.name = "SDP",
		.op_type = DLA_OP_SDP,
		.program = dla_sdp_program,
		.enable = dla_sdp_enable,
		.set_producer = dla_sdp_set_producer,
		.is_ready = dla_sdp_is_ready,
		.dump_config = dla_sdp_dump_config,
		.rdma_check = dla_sdp_rdma_check,
		.get_stat_data = dla_sdp_stat_data,
		.dump_stat = dla_sdp_dump_stat,
		.consumer_ptr = 0,
		.roi_index = 0,
		.group_status = 0,
		.rdma_status = 0,
		.last_group = 1,
		.groups[0] = {
			.id = 0,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_SDP][0],
			.surface_desc = &surface_desc[DLA_OP_SDP][0],
		},
		.groups[1] = {
			.id = 1,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_SDP][1],
			.surface_desc = &surface_desc[DLA_OP_SDP][1],
		},
	},
	/* PDP processor and its two groups. */
	.processors[DLA_OP_PDP] = {
		.name = "PDP",
		.op_type = DLA_OP_PDP,
		.program = dla_pdp_program,
		.enable = dla_pdp_enable,
		.set_producer = dla_pdp_set_producer,
		.is_ready = dla_pdp_is_ready,
		.dump_config = dla_pdp_dump_config,
		.rdma_check = dla_pdp_rdma_check,
		.get_stat_data = dla_pdp_stat_data,
		.dump_stat = dla_pdp_dump_stat,
		.consumer_ptr = 0,
		.roi_index = 0,
		.group_status = 0,
		.rdma_status = 0,
		.last_group = 1,
		.groups[0] = {
			.id = 0,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_PDP][0],
			.surface_desc = &surface_desc[DLA_OP_PDP][0],
		},
		.groups[1] = {
			.id = 1,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_PDP][1],
			.surface_desc = &surface_desc[DLA_OP_PDP][1],
		},
	},
	/* CDP processor and its two groups. */
	.processors[DLA_OP_CDP] = {
		.name = "CDP",
		.op_type = DLA_OP_CDP,
		.program = dla_cdp_program,
		.enable = dla_cdp_enable,
		.set_producer = dla_cdp_set_producer,
		.is_ready = dla_cdp_is_ready,
		.dump_config = dla_cdp_dump_config,
		.rdma_check = dla_cdp_rdma_check,
		.get_stat_data = dla_cdp_stat_data,
		.dump_stat = dla_cdp_dump_stat,
		.consumer_ptr = 0,
		.roi_index = 0,
		.group_status = 0,
		.rdma_status = 0,
		.last_group = 1,
		.groups[0] = {
			.id = 0,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_CDP][0],
			.surface_desc = &surface_desc[DLA_OP_CDP][0],
		},
		.groups[1] = {
			.id = 1,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_CDP][1],
			.surface_desc = &surface_desc[DLA_OP_CDP][1],
		},
	},
	/* RUBIK processor and its two groups. */
	.processors[DLA_OP_RUBIK] = {
		.name = "RUBIK",
		.op_type = DLA_OP_RUBIK,
		.program = dla_rubik_program,
		.enable = dla_rubik_enable,
		.set_producer = dla_rubik_set_producer,
		.is_ready = dla_rubik_is_ready,
		.dump_config = dla_rubik_dump_config,
		.rdma_check = dla_rubik_rdma_check,
		.get_stat_data = dla_rubik_stat_data,
		.dump_stat = dla_rubik_dump_stat,
		.consumer_ptr = 0,
		.roi_index = 0,
		.group_status = 0,
		.rdma_status = 0,
		.last_group = 1,
		.groups[0] = {
			.id = 0,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_RUBIK][0],
			.surface_desc = &surface_desc[DLA_OP_RUBIK][0],
		},
		.groups[1] = {
			.id = 1,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_RUBIK][1],
			.surface_desc = &surface_desc[DLA_OP_RUBIK][1],
		},
	},
};
/* Return the single, statically allocated engine instance. */
struct dla_engine *dla_get_engine(void)
{
	return &engine;
}
int32_t dla_register_driver(void **engine_context, void *driver_context)
{
	/*
	 * Bind the portability-layer context to the static engine,
	 * attach the global task state (with no task data yet), reset
	 * the operation-descriptor cache, and hand the engine back to
	 * the caller through @engine_context.
	 */
	engine.driver_context = driver_context;
	engine.task = &global_task;
	engine.task->task_data = NULL;
	dla_init_op_cache(&engine);

	*engine_context = &engine;

	RETURN(0);
}
/* Read a DLA register through the portability layer. */
uint32_t reg_read(uint32_t addr)
{
	return dla_reg_read(engine.driver_context, addr);
}

/* Write @reg to a DLA register through the portability layer. */
void reg_write(uint32_t addr, uint32_t reg)
{
	dla_reg_write(engine.driver_context, addr, reg);
}

View file

@ -0,0 +1,551 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <dla_debug.h>
#include <dla_interface.h>
#include <dla_sched.h>
#include "engine_debug.h"
#if DEBUG_NETWORK_DATA
/* Pretty-print a dla_network_desc to the debug log. */
void
dla_debug_network_desc(struct dla_network_desc *nd)
{
	dla_debug("*********************************************************\n");
	dla_debug("NVDLA FW dla_network_desc\n");
	dla_debug("---------------------------------------------------------\n");
	dla_debug("op desc index = %d\n", nd->operation_desc_index);
	dla_debug("surface desc index = %d\n", nd->surface_desc_index);
	dla_debug("dep graph index = %d\n", nd->dependency_graph_index);
	dla_debug("lut data index = %d\n", nd->lut_data_index);
	dla_debug("stat_list_index = %d\n", nd->stat_list_index);
	dla_debug("roi array index = %d\n", nd->roi_array_index);
	dla_debug("surface index = %d\n", nd->surface_index);
	dla_debug("num rois = %u\n", nd->num_rois);
	dla_debug("num ops = %u\n", nd->num_operations);
	dla_debug("num luts = %u\n", nd->num_luts);
	dla_debug("num addr = %u\n", nd->num_addresses);
	dla_debug("input layer = %u\n", nd->input_layer);
	dla_debug("dynamic roi = %u\n", nd->dynamic_roi);
}
/* Pretty-print one BDMA transfer descriptor; @id is its array index. */
static void
dla_debug_bdma_transfer(struct dla_bdma_transfer_desc *tr, int32_t id)
{
	dla_debug("transfer[%d] = [ dla_bdma_transfer_desc =>\n", id);
	dla_debug("    source_address = %x\n", tr->source_address);
	dla_debug("    destination_address = %x\n", tr->destination_address);
	dla_debug("    line_size = %x\n", tr->line_size);
	dla_debug("    line_repeat = %x\n", tr->line_repeat);
	dla_debug("    source_line = %x\n", tr->source_line);
	dla_debug("    destination_line = %x\n", tr->destination_line);
	dla_debug("    surface_repeat = %x\n", tr->surface_repeat);
	dla_debug("    source_surface = %x\n", tr->source_surface);
	dla_debug("    destination_surface = %x\n", tr->destination_surface);
}

/* Pretty-print a BDMA surface descriptor and all of its transfers. */
void
dla_debug_bdma_surface_desc(struct dla_bdma_surface_desc *desc, int32_t roi)
{
	int32_t i;

	dla_debug("*********************************************************\n");
	dla_debug("NVDLA FW ROI[%d]: dla_bdma_surface_desc\n", roi);
	dla_debug("---------------------------------------------------------\n");
	dla_debug("source_type = %u\n", desc->source_type);
	dla_debug("destination_type = %u\n", desc->destination_type);
	dla_debug("num_transfers = %u\n", desc->num_transfers);
	for (i = 0; i < desc->num_transfers; i++)
		dla_debug_bdma_transfer(&desc->transfers[i], i);
}

/* Pretty-print a BDMA operation descriptor. */
void
dla_debug_bdma_op_desc(struct dla_bdma_op_desc *desc, int32_t roi)
{
	dla_debug("*********************************************************\n");
	dla_debug("NVDLA FW ROI[%d]: dla_bdma_op_desc\n", roi);
	dla_debug("---------------------------------------------------------\n");
	dla_debug("num_transfers = %u\n", desc->num_transfers);
}
/* Dump the resolved addresses held in a dla_task to the debug log. */
void
dla_debug_address_info(struct dla_task *tk)
{
	dla_debug("*********************************************************\n");
	dla_debug("NVDLA FW address list\n");
	dla_debug("---------------------------------------------------------\n");
	dla_debug("task base address = %llu\n", tk->base);
	dla_debug("op desc address = %llu\n", tk->operation_desc_addr);
	dla_debug("surface desc address = %llu\n", tk->surface_desc_addr);
	dla_debug("dependency graph address = %llu\n", tk->dependency_graph_addr);
	dla_debug("LUT data address = %llu\n", tk->lut_data_addr);
	dla_debug("stat address = %llu\n", tk->stat_data_addr);
	dla_debug("ROI array address = %llu\n", tk->roi_array_addr);
	dla_debug("surface address = %llu\n", tk->surface_addr);
}
/*
 * Dump a common operation descriptor, including its consumer list and
 * fused parent.  NOTE(review): the trailing "]" lines have no newline
 * -- presumably dla_debug() tolerates that; confirm if output looks
 * merged.
 */
void
dla_debug_op_desc(struct dla_common_op_desc *desc, int32_t roi)
{
	int32_t i;

	dla_debug("*********************************************************\n");
	dla_debug("NVDLA FW ROI[%d]: dla_common_op_desc\n", roi);
	dla_debug("---------------------------------------------------------\n");
	/* Cast only changes the pointer type printed by %p. */
	dla_debug("[%p] Operation index %d ROI %d dep_count %d type %d\n",
			(unsigned int *)desc, desc->index, desc->roi_index,
			desc->dependency_count, desc->op_type);
	dla_debug("consumers = [ dla_consumer =>\n");
	for (i = 0; i < DLA_OP_NUM; i++)
		dla_debug(" [ %d %d ]", desc->consumers[i].index,
				desc->consumers[i].event);
	dla_debug("]");
	dla_debug("fused_parent = [ dla_consumer =>\n");
	dla_debug(" [ %d %d ]", desc->fused_parent.index,
			desc->fused_parent.event);
	dla_debug("]");
}
static void
dla_debug_data_cube(struct dla_data_cube *cube)
{
dla_debug(" type = %u\n", cube->type);
dla_debug(" address = %d\n", cube->address);
dla_debug(" width = %x\n", cube->width);
dla_debug(" height = %x\n", cube->height);
dla_debug(" channel = %x\n", cube->channel);
dla_debug(" size = %u\n", cube->size);
dla_debug(" line_stride = %u\n", cube->line_stride);
dla_debug(" surf_stride = %u\n", cube->surf_stride);
dla_debug(" plane_stride = %u\n", cube->plane_stride);
dla_debug("]");
}
static void
dla_debug_converter(struct dla_cvt_param *cvt)
{
dla_debug("[ scale = %d, truncate = %u, enable = %u, offset = %d ]\n",
cvt->scale, cvt->truncate, cvt->enable, cvt->offset);
}
static void
dla_debug_float_data(struct dla_float_data *float_data)
{
dla_debug("[ scale = %d, shifter = %d ]\n",
float_data->scale, float_data->shifter);
}
static void
dla_debug_dla_slope(union dla_slope *slope)
{
dla_debug(" data_i =\n");
dla_debug_float_data(&slope->data_i);
dla_debug(" data_f = %u\n", slope->data_f);
}
/* Dump a LUT offset union (exponent offset / fractional bits view). */
static void
dla_debug_lut_offset(union dla_lut_offset *offset)
{
dla_debug("    exp_offset = %d\n", offset->exp_offset);
dla_debug("    frac_bits = %d\n", offset->frac_bits);
}
/*
 * Dump a complete LUT parameter set: both tables, offsets, ranges,
 * under/overflow slopes and the priority/method selectors.
 * NOTE(review): both loops visit (1 << *_LOG2) + 1 entries — presumably
 * the tables include both endpoints; confirm array sizes in dla_interface.h.
 */
void
dla_debug_lut_params(struct dla_lut_param *lut_param)
{
int32_t i, j;
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW dla_lut_param\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("linear_exp_table = [\n");
for (i = 0; i < (1<<LUT_LINEAR_EXP_TABLE_ENTRY_LOG2)+1; i++)
dla_debug(" %u", lut_param->linear_exp_table[i]);
dla_debug("]");
dla_debug("linear_only_table = [\n");
for (j = 0; j < (1<<LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2)+1; j++)
dla_debug(" %u\n", lut_param->linear_only_table[j]);
dla_debug("]\n");
dla_debug("linear_exp_offset =\n");
dla_debug_lut_offset(&lut_param->linear_exp_offset);
dla_debug("linear_only_offset =\n");
dla_debug_lut_offset(&lut_param->linear_only_offset);
dla_debug("linear_exp_start = %llu\n",
lut_param->linear_exp_start);
dla_debug("linear_exp_end = %llu\n",
lut_param->linear_exp_end);
dla_debug("linear_only_start = %llu\n",
lut_param->linear_only_start);
dla_debug("linear_only_end = %llu\n",
lut_param->linear_only_end);
dla_debug("linear_exp_underflow_slope =\n");
dla_debug_dla_slope(&lut_param->linear_exp_underflow_slope);
dla_debug("linear_exp_overflow_slope =\n");
dla_debug_dla_slope(&lut_param->linear_exp_overflow_slope);
dla_debug("linear_only_underflow_slope =\n");
dla_debug_dla_slope(&lut_param->linear_only_underflow_slope);
dla_debug("linear_only_overflow_slope =\n");
dla_debug_dla_slope(&lut_param->linear_only_overflow_slope);
dla_debug("hybrid_priority = %u\n",
lut_param->hybrid_priority);
dla_debug("underflow_priority = %u\n",
lut_param->underflow_priority);
dla_debug("overflow_priority = %u\n",
lut_param->overflow_priority);
dla_debug("method = %u\n",
lut_param->method);
}
/* Dump BDMA profiling counters collected after an operation. */
void
dla_debug_bdma_stats(struct dla_bdma_stat_desc *stat)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW STATS: dla_bdma_stat_desc\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("read_stall = %u\n", stat->read_stall);
dla_debug("write_stall = %u\n", stat->write_stall);
dla_debug("runtime = %u\n", stat->runtime);
}
/* Dump all data cubes of a convolution surface descriptor for one ROI. */
void
dla_debug_conv_surface_desc(struct dla_conv_surface_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_conv_surface_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("weight_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->weight_data);
dla_debug("wmb_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->wmb_data);
dla_debug("wgs_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->wgs_data);
dla_debug("src_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->src_data);
dla_debug("dst_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->dst_data);
dla_debug("offset_u = %lld\n", desc->offset_u);
dla_debug("in_line_uv_stride = %u\n", desc->in_line_uv_stride);
}
/* Dump every field of a convolution operation descriptor for one ROI. */
void
dla_debug_conv_op_desc(struct dla_conv_op_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_conv_op_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("conv_mode = %u\n", desc->conv_mode);
dla_debug("data_reuse = %u\n", desc->data_reuse);
dla_debug("weight_reuse = %u\n", desc->weight_reuse);
dla_debug("skip_data_rls = %u\n", desc->skip_data_rls);
dla_debug("skip_weight_rls = %u\n", desc->skip_weight_rls);
dla_debug("entry_per_slice = %u\n", desc->entry_per_slice);
dla_debug("data_format = %u\n", desc->data_format);
dla_debug("pixel_mapping = %u\n", desc->pixel_mapping);
dla_debug("fetch_grain = %u\n", desc->fetch_grain);
dla_debug("batch = %u\n", desc->batch);
dla_debug("weight_format = %u\n", desc->weight_format);
dla_debug("data_bank = %u\n", desc->data_bank);
dla_debug("weight_bank = %u\n", desc->weight_bank);
dla_debug("batch_stride = %u\n", desc->batch_stride);
dla_debug("post_extension = %u\n", desc->post_extension);
dla_debug("pixel_override = %u\n", desc->pixel_override);
dla_debug("release = %u\n", desc->release);
dla_debug("input_width_csc = %u\n", desc->input_width_csc);
dla_debug("input_height_csc = %u\n", desc->input_height_csc);
dla_debug("input_channel_csc = %u\n", desc->input_channel_csc);
dla_debug("kernel_width_csc = %u\n", desc->kernel_width_csc);
dla_debug("kernel_height_csc = %u\n", desc->kernel_height_csc);
dla_debug("kernel_channel_csc = %u\n", desc->kernel_channel_csc);
dla_debug("input_width_cmac = %u\n", desc->input_width_cmac);
dla_debug("input_height_cmac = %u\n", desc->input_height_cmac);
dla_debug("bytes_per_kernel = %u\n", desc->bytes_per_kernel);
dla_debug("mean_ry = %d\n", desc->mean_ry);
dla_debug("mean_gu = %d\n", desc->mean_gu);
dla_debug("mean_bv = %d\n", desc->mean_bv);
dla_debug("mean_ax = %d\n", desc->mean_ax);
dla_debug("mean_format = %u\n", desc->mean_format);
dla_debug("conv_stride_x = %u\n", desc->conv_stride_x);
dla_debug("conv_stride_y = %u\n", desc->conv_stride_y);
dla_debug("pad_x_left = %u\n", desc->pad_x_left);
dla_debug("pad_x_right = %u\n", desc->pad_x_right);
dla_debug("pad_y_top = %u\n", desc->pad_y_top);
dla_debug("pad_y_bottom = %u\n", desc->pad_y_bottom);
dla_debug("dilation_x = %u\n", desc->dilation_x);
dla_debug("dilation_y = %u\n", desc->dilation_y);
dla_debug("pra_truncate = %u\n", desc->pra_truncate);
dla_debug("in_precision = %u\n", desc->in_precision);
dla_debug("out_precision = %u\n", desc->out_precision);
dla_debug("pad_val = %d\n", desc->pad_val);
dla_debug("in_cvt =\n");
dla_debug_converter(&desc->in_cvt);
dla_debug("out_cvt =\n");
dla_debug_converter(&desc->out_cvt);
}
/* Dump convolution-pipeline profiling counters. */
void
dla_debug_conv_stats(struct dla_conv_stat_desc *stat)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW STATS: dla_conv_stat_desc\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("data_read_stall = %u\n", stat->data_read_stall);
dla_debug("weight_read_stall = %u\n", stat->weight_read_stall);
dla_debug("data_read_latency = %u\n", stat->data_read_latency);
dla_debug("weight_read_latency = %u\n", stat->weight_read_latency);
dla_debug("saturation_count = %u\n", stat->saturation_count);
dla_debug("nan_data_num = %u\n", stat->nan_data_num);
dla_debug("nan_weight_num = %u\n", stat->nan_weight_num);
dla_debug("inf_data_num = %u\n", stat->inf_data_num);
dla_debug("inf_weight_num = %u\n", stat->inf_weight_num);
dla_debug("runtime = %u\n", stat->runtime);
}
/* Dump source/destination cubes of a PDP surface descriptor for one ROI. */
void
dla_debug_pdp_surface_desc(struct dla_pdp_surface_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_pdp_surface_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("src_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->src_data);
dla_debug("dst_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->dst_data);
}
/* Dump a pooling (PDP) operation descriptor for one ROI. */
void
dla_debug_pdp_op_desc(struct dla_pdp_op_desc *desc, int32_t roi)
{
int32_t i;
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_pdp_op_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("precision = %u\n", desc->precision);
dla_debug("padding_value = [\n");
for (i = 0; i < PDP_PAD_VAL_NUM; i++)
dla_debug(" %d\n", desc->padding_value[i]);
dla_debug("]\n");
dla_debug("split_num = %u\n", desc->split_num);
dla_debug("partial_in_width_first = %u\n",
desc->partial_in_width_first);
dla_debug("partial_in_width_mid = %u\n", desc->partial_in_width_mid);
dla_debug("partial_in_width_last = %u\n", desc->partial_in_width_last);
dla_debug("partial_width_first = %u\n", desc->partial_width_first);
dla_debug("partial_width_mid = %u\n", desc->partial_width_mid);
dla_debug("partial_width_last = %u\n", desc->partial_width_last);
dla_debug("pool_mode = %u\n", desc->pool_mode);
dla_debug("pool_width = %u\n", desc->pool_width);
dla_debug("pool_height = %u\n", desc->pool_height);
dla_debug("stride_x = %u\n", desc->stride_x);
dla_debug("stride_y = %u\n", desc->stride_y);
dla_debug("pad_left = %u\n", desc->pad_left);
dla_debug("pad_right = %u\n", desc->pad_right);
dla_debug("pad_top = %u\n", desc->pad_top);
dla_debug("pad_bottom = %u\n", desc->pad_bottom);
}
/* Dump PDP profiling counters. */
void
dla_debug_pdp_stats(struct dla_pdp_stat_desc *stat)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW STATS: dla_pdp_stat_desc\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("inf_input_num = %u\n", stat->inf_input_num);
dla_debug("nan_input_num = %u\n", stat->nan_input_num);
dla_debug("nan_output_num = %u\n", stat->nan_output_num);
dla_debug("write_stall = %u\n", stat->write_stall);
dla_debug("runtime = %u\n", stat->runtime);
}
/* Dump source/destination cubes of a CDP surface descriptor for one ROI. */
void
dla_debug_cdp_surface_desc(struct dla_cdp_surface_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_cdp_surface_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("src_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->src_data);
dla_debug("dst_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->dst_data);
}
/* Dump a cross-channel normalization (CDP) operation descriptor. */
void
dla_debug_cdp_op_desc(struct dla_cdp_op_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_cdp_op_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("in_precision = %u\n", desc->in_precision);
dla_debug("out_precision = %u\n", desc->out_precision);
dla_debug("lut_index = %d\n", desc->lut_index);
dla_debug("in_cvt =\n");
dla_debug_converter(&desc->in_cvt);
dla_debug("out_cvt =\n");
dla_debug_converter(&desc->out_cvt);
dla_debug("local_size = %u\n", desc->local_size);
dla_debug("bypass_sqsum = %u\n", desc->bypass_sqsum);
dla_debug("bypass_out_mul = %u\n", desc->bypass_out_mul);
}
/* Dump CDP profiling counters, including LUT hit/overflow statistics. */
void
dla_debug_cdp_stats(struct dla_cdp_stat_desc *stat)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW STATS: dla_cdp_stat_desc\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("nan_input_num = %u\n", stat->nan_input_num);
dla_debug("inf_input_num = %u\n", stat->inf_input_num);
dla_debug("nan_output_num = %u\n", stat->nan_output_num);
dla_debug("write_stall = %u\n", stat->write_stall);
dla_debug("lut_uflow = %u\n", stat->lut_uflow);
dla_debug("lut_oflow = %u\n", stat->lut_oflow);
dla_debug("lut_hybrid = %u\n", stat->lut_hybrid);
dla_debug("lut_le_hit = %u\n", stat->lut_le_hit);
dla_debug("lut_lo_hit = %u\n", stat->lut_lo_hit);
dla_debug("saturation_count = %u\n", stat->saturation_count);
dla_debug("runtime = %u\n", stat->runtime);
}
/* Dump source/destination cubes of a RUBIK surface descriptor. */
void
dla_debug_rubik_surface_desc(struct dla_rubik_surface_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_rubik_surface_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("src_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->src_data);
dla_debug("dst_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->dst_data);
}
/* Dump a RUBIK (data-reshape) operation descriptor. */
void
dla_debug_rubik_op_desc(struct dla_rubik_op_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_rubik_op_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("mode = %u\n", desc->mode);
dla_debug("precision = %u\n", desc->precision);
dla_debug("stride_x = %u\n", desc->stride_x);
dla_debug("stride_y = %u\n", desc->stride_y);
}
/* Dump RUBIK profiling counters. */
void
dla_debug_rubik_stats(struct dla_rubik_stat_desc *stat)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW STATS: dla_rubik_stat_desc\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("read_stall = %u\n", stat->read_stall);
dla_debug("write_stall = %u\n", stat->write_stall);
dla_debug("runtime = %u\n", stat->runtime);
}
/* Dump one SDP sub-operation (X1/X2/Y stage) including its converters. */
static void
dla_debug_sdp_op(struct dla_sdp_op *sdp_op)
{
dla_debug("    enable = %u\n", sdp_op->enable);
dla_debug("    alu_type = %u\n", sdp_op->alu_type);
dla_debug("    type = %u\n", sdp_op->type);
dla_debug("    mode = %u\n", sdp_op->mode);
dla_debug("    act = %u\n", sdp_op->act);
dla_debug("    shift_value = %u\n", sdp_op->shift_value);
dla_debug("    truncate = %u\n", sdp_op->truncate);
dla_debug("    precision = %u\n", sdp_op->precision);
dla_debug("    alu_operand = %d\n", sdp_op->alu_operand);
dla_debug("    mul_operand = %d\n", sdp_op->mul_operand);
dla_debug("cvt.alu_cvt =\n");
dla_debug_converter(&sdp_op->cvt.alu_cvt);
dla_debug("cvt.mul_cvt =\n");
dla_debug_converter(&sdp_op->cvt.mul_cvt);
dla_debug("]\n");
}
/* Dump all data cubes of an SDP surface descriptor for one ROI. */
void
dla_debug_sdp_surface_desc(struct dla_sdp_surface_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_sdp_surface_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("src_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->src_data);
dla_debug("x1_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->x1_data);
dla_debug("x2_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->x2_data);
dla_debug("y_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->y_data);
dla_debug("dst_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->dst_data);
}
/* Dump an SDP operation descriptor and its three sub-operation stages. */
void
dla_debug_sdp_op_desc(struct dla_sdp_op_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_sdp_op_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("src_precision = %u\n", desc->src_precision);
dla_debug("dst_precision = %u\n", desc->dst_precision);
dla_debug("lut_index = %d\n", desc->lut_index);
dla_debug("out_cvt =\n");
dla_debug_converter(&desc->out_cvt);
dla_debug("conv_mode = %u\n", desc->conv_mode);
dla_debug("batch_num = %u\n", desc->batch_num);
dla_debug("batch_stride = %u\n", desc->batch_stride);
dla_debug("x1_op = [ dla_sdp_op =>\n");
dla_debug_sdp_op(&desc->x1_op);
dla_debug("x2_op = [ dla_sdp_op =>\n");
dla_debug_sdp_op(&desc->x2_op);
dla_debug("y_op = [ dla_sdp_op =>\n");
dla_debug_sdp_op(&desc->y_op);
}
/* Dump SDP profiling counters, including LUT hit/overflow statistics. */
void
dla_debug_sdp_stats(struct dla_sdp_stat_desc *stat)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW STATS: dla_sdp_stat_desc\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("nan_input_num = %u\n", stat->nan_input_num);
dla_debug("inf_input_num = %u\n", stat->inf_input_num);
dla_debug("nan_output_num = %u\n", stat->nan_output_num);
dla_debug("wdma_write_stall = %u\n", stat->wdma_write_stall);
dla_debug("lut_underflow = %u\n", stat->lut_underflow);
dla_debug("lut_overflow = %u\n", stat->lut_overflow);
dla_debug("lut_hybrid = %u\n", stat->lut_hybrid);
dla_debug("lut_le_hit = %u\n", stat->lut_le_hit);
dla_debug("lut_lo_hit = %u\n", stat->lut_lo_hit);
dla_debug("saturation_count = %u\n", stat->saturation_count);
dla_debug("runtime = %u\n", stat->runtime);
}
#endif /* DEBUG_NETWORK_DATA */

View file

@ -0,0 +1,129 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * engine_debug.h - debug-dump entry points for the DLA firmware engine.
 *
 * When DEBUG_NETWORK_DATA (see dla_debug.h) is non-zero, the real dump
 * routines from engine_debug.c are declared; otherwise every entry point
 * compiles away to an empty static inline stub so call sites need no
 * #ifdef guards.
 */
#ifndef __FIRMWARE_ENGINE_DEBUG_H_
#define __FIRMWARE_ENGINE_DEBUG_H_
#include <dla_debug.h>
#include <dla_interface.h>
#if DEBUG_NETWORK_DATA
/* Real implementations: dump descriptors/statistics via dla_debug(). */
void
dla_debug_op_desc(struct dla_common_op_desc *desc, int32_t roi);
void
dla_debug_network_desc(struct dla_network_desc *network_desc);
void
dla_debug_address_info(struct dla_task *task);
void
dla_debug_bdma_surface_desc(struct dla_bdma_surface_desc *desc, int32_t roi);
void
dla_debug_bdma_op_desc(struct dla_bdma_op_desc *desc, int32_t roi);
void
dla_debug_bdma_stats(struct dla_bdma_stat_desc *stat);
void
dla_debug_conv_surface_desc(struct dla_conv_surface_desc *desc, int32_t roi);
void
dla_debug_conv_op_desc(struct dla_conv_op_desc *desc, int32_t roi);
void
dla_debug_conv_stats(struct dla_conv_stat_desc *stat);
void
dla_debug_sdp_op_desc(struct dla_sdp_op_desc *desc, int32_t roi);
void
dla_debug_sdp_surface_desc(struct dla_sdp_surface_desc *desc, int32_t roi);
void
dla_debug_sdp_stats(struct dla_sdp_stat_desc *stat);
void
dla_debug_pdp_surface_desc(struct dla_pdp_surface_desc *desc, int32_t roi);
void
dla_debug_pdp_op_desc(struct dla_pdp_op_desc *desc, int32_t roi);
void
dla_debug_pdp_stats(struct dla_pdp_stat_desc *stat);
void
dla_debug_cdp_surface_desc(struct dla_cdp_surface_desc *desc, int32_t roi);
void
dla_debug_cdp_op_desc(struct dla_cdp_op_desc *desc, int32_t roi);
void
dla_debug_cdp_stats(struct dla_cdp_stat_desc *stat);
void
dla_debug_rubik_op_desc(struct dla_rubik_op_desc *desc, int32_t roi);
void
dla_debug_rubik_surface_desc(struct dla_rubik_surface_desc *desc, int32_t roi);
void
dla_debug_rubik_stats(struct dla_rubik_stat_desc *stat);
void
dla_debug_lut_params(struct dla_lut_param *lut_param);
#else
/* No-op stubs when network-data debugging is compiled out. */
static inline void
dla_debug_op_desc(struct dla_common_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_network_desc(struct dla_network_desc *network_desc) {}
static inline void
dla_debug_address_info(struct dla_task *task) {}
static inline void
dla_debug_bdma_surface_desc(struct dla_bdma_surface_desc *desc, int32_t roi) {}
static inline void
dla_debug_bdma_op_desc(struct dla_bdma_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_bdma_stats(struct dla_bdma_stat_desc *stat) {}
static inline void
dla_debug_conv_surface_desc(struct dla_conv_surface_desc *desc, int32_t roi) {}
static inline void
dla_debug_conv_op_desc(struct dla_conv_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_conv_stats(struct dla_conv_stat_desc *stat) {}
static inline void
dla_debug_sdp_op_desc(struct dla_sdp_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_sdp_surface_desc(struct dla_sdp_surface_desc *desc, int32_t roi) {}
static inline void
dla_debug_sdp_stats(struct dla_sdp_stat_desc *stat) {}
static inline void
dla_debug_pdp_surface_desc(struct dla_pdp_surface_desc *desc, int32_t roi) {}
static inline void
dla_debug_pdp_op_desc(struct dla_pdp_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_pdp_stats(struct dla_pdp_stat_desc *stat) {}
static inline void
dla_debug_cdp_surface_desc(struct dla_cdp_surface_desc *desc, int32_t roi) {}
static inline void
dla_debug_cdp_op_desc(struct dla_cdp_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_cdp_stats(struct dla_cdp_stat_desc *stat) {}
static inline void
dla_debug_rubik_op_desc(struct dla_rubik_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_rubik_surface_desc(struct dla_rubik_surface_desc *desc, int32_t roi) {}
static inline void
dla_debug_rubik_stats(struct dla_rubik_stat_desc *stat) {}
static inline void
dla_debug_lut_params(struct dla_lut_param *lut_param) {}
#endif /* DEBUG_NETWORK_DATA */
#endif /* __FIRMWARE_ENGINE_DEBUG_H_ */

136
drivers/nvdla/engine_isr.c Normal file
View file

@ -0,0 +1,136 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_engine.h>
#include <dla_interface.h>
#include "dla_engine_internal.h"
int32_t dla_isr_handler(void *engine_data)
{
uint32_t mask;
uint32_t reg;
struct dla_processor *processor = NULL;
struct dla_processor_group *group;
struct dla_engine *engine = (struct dla_engine *)engine_data;
mask = glb_reg_read(S_INTR_MASK);
reg = glb_reg_read(S_INTR_STATUS);
dla_trace("Enter: dla_isr_handler, reg:%x, mask:%x\n", reg, mask);
if (reg & MASK(GLB_S_INTR_STATUS_0, CACC_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_CONV];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CACC_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_CONV];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, SDP_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_SDP];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, SDP_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_SDP];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CDP_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_CDP];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CDP_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_CDP];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, RUBIK_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_RUBIK];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, RUBIK_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_RUBIK];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, PDP_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_PDP];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, PDP_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_PDP];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, BDMA_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_BDMA];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, BDMA_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_BDMA];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_DAT_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_CONV];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_CDMA_DT_DONE);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_DAT_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_CONV];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_CDMA_DT_DONE);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_WT_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_CONV];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_CDMA_WT_DONE);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_WT_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_CONV];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_CDMA_WT_DONE);
}
glb_reg_write(S_INTR_STATUS, reg);
mask = glb_reg_read(S_INTR_MASK);
reg = glb_reg_read(S_INTR_STATUS);
dla_trace("Exit: dla_isr_handler, reg:%x, mask:%x\n", reg, mask);
RETURN(0);
}

View file

@ -0,0 +1,94 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * dla_debug.h - logging, tracing and assertion helpers for the DLA firmware.
 *
 * Fixes:
 *  - FILENAME previously re-stringified __FILE__, which is already a string
 *    literal, embedding escaped quotes in every assertion message.
 *  - The non-DEBUG_ASSERT ASSERT_GOTO referenced the undeclared token
 *    `condition` and never assigned _ret or jumped, leaving _ret
 *    uninitialized; it now keeps the same control flow as the debug build,
 *    minus the error message.
 */
#ifndef __FIRMWARE_DLA_DEBUG_H_
#define __FIRMWARE_DLA_DEBUG_H_
/* STRINGIFY quotes its token verbatim; DEFER_STRINGIFY expands macro
 * arguments (e.g. __LINE__) first, then quotes the expansion. */
#define STRINGIFY(s) #s
#define DEFER_STRINGIFY(s) STRINGIFY(s)
/* Current line number as a string literal. */
#define FILELINE DEFER_STRINGIFY(__LINE__)
/* __FILE__ is already a string literal; use it directly. */
#define FILENAME __FILE__
/* Bit positions for packing a log event word (see LOG_EVENT below). */
#define LOG_EVENT_BDMA_SHIFT 0U
#define LOG_EVENT_CONV_SHIFT 4U
#define LOG_EVENT_SDP_SHIFT 8U
#define LOG_EVENT_PDP_SHIFT 12U
#define LOG_EVENT_CDP_SHIFT 16U
#define LOG_EVENT_RBK_SHIFT 20U
#define LOG_EVENT_GROUP_SHIFT 24U
#define LOG_EVENT_ROI_SHIFT 28U
/* Event identifiers recorded at scheduler phase boundaries. */
#define LOG_TASK_START 1
#define LOG_TASK_END 2
#define LOG_READ_OP_CONFIG_START 3
#define LOG_READ_OP_CONFIG_END 4
#define LOG_READ_SURF_CONFIG_START 5
#define LOG_READ_SURF_CONFIG_END 6
#define LOG_PROGRAM_START 7
#define LOG_PROGRAM_END 8
#define LOG_OPERATION_START 9
#define LOG_OPERATION_END 10
/* Event logging is compiled out in this build. */
#define LOG_EVENT(roi, group, processor, event)
/**
 * Used to enable/disable reading stat registers
 */
#define STAT_ENABLE 1
/**
 * Used to print debug network data
 */
#define DEBUG_NETWORK_DATA 0
/* Tracing and the firmware assert() are compiled out in this build. */
#define pr_dump_stack(format, ...)
#define dla_trace(format, ...)
#define assert(condition)
#define RETURN(err) { return (err); }
#define DEBUG_ASSERT
#ifdef DEBUG_ASSERT
/*
 * Evaluate _condition; on failure log the failing site, set _ret to
 * _err_value and jump to _goto.  On success _ret is set to 0.
 */
#define ASSERT_GOTO(_condition, _ret, _err_value, _goto) \
do { \
if (!(_condition)) { \
dla_error("Assertion Fail(" FILENAME ":" FILELINE "):" \
STRINGIFY(_condition)); \
_ret = _err_value; \
goto _goto; \
} else { \
_ret = 0; \
} \
} while (0)
#else
/* Same control flow as the debug build, without the error message. */
#define ASSERT_GOTO(_condition, _ret, _err_value, _goto) \
do { \
if (!(_condition)) { \
_ret = _err_value; \
goto _goto; \
} else { \
_ret = 0; \
} \
} while (0)
#endif /* DEBUG_ASSERT */
#endif

View file

@ -0,0 +1,94 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * dla_engine.h - core scheduler state for the DLA firmware engine:
 * per-group execution state, per-processor dispatch tables, and the
 * top-level engine container.
 */
#ifndef __DLA_ENGINE_H_
#define __DLA_ENGINE_H_
#include <dla_interface.h>
#include <dla_sched.h>
/*
 * State of one execution group within a processor.  Each processor owns
 * DLA_NUM_GROUPS of these; presumably one group is programmed while the
 * other executes (double buffering) -- confirm against the scheduler.
 */
struct dla_processor_group {
uint8_t id;
uint8_t rdma_id;
uint8_t active;
uint8_t events; /* bitmask of DLA_EVENT_* set by dla_isr_handler() */
uint8_t roi_index;
uint8_t is_rdma_needed;
uint8_t pending;
int32_t lut_index;
uint8_t programming;
uint64_t start_time;
struct dla_common_op_desc *op_desc;
struct dla_common_op_desc *consumers[DLA_OP_NUM];
struct dla_common_op_desc *fused_parent;
union dla_operation_container *operation_desc;
union dla_surface_container *surface_desc;
};
/*
 * One hardware sub-unit (BDMA/CONV/SDP/PDP/CDP/RUBIK) together with the
 * function table the scheduler uses to program, enable and profile it.
 */
struct dla_processor {
const char *name;
uint8_t op_type; /* DLA_OP_* identifier for this unit */
uint8_t consumer_ptr;
uint8_t roi_index;
uint8_t group_status;
uint8_t rdma_status;
uint8_t last_group;
struct dla_common_op_desc *tail_op;
struct dla_processor_group groups[DLA_NUM_GROUPS];
union dla_stat_container *stat_data_desc;
int32_t (*is_ready)(struct dla_processor *processor,
struct dla_processor_group *group);
int32_t (*enable)(struct dla_processor_group *group);
int32_t (*program)(struct dla_processor_group *group);
void (*set_producer)(int32_t group_id, int32_t rdma_id);
void (*dump_config)(struct dla_processor_group *group);
void (*rdma_check)(struct dla_processor_group *group);
void (*get_stat_data)(struct dla_processor *processor,
struct dla_processor_group *group);
void (*dump_stat)(struct dla_processor *processor);
};
/*
 * Top-level engine state: the task being executed, its network
 * description, and one dla_processor per operation type.
 */
struct dla_engine {
struct dla_task *task;
struct dla_config *config_data;
struct dla_network_desc *network;
struct dla_processor processors[DLA_OP_NUM];
uint16_t num_proc_hwl;
int32_t status;
uint32_t stat_enable; /* non-zero: read HW stat registers after ops */
void *driver_context; /* opaque pointer back to the OS driver */
};
/* Accessor for the single global engine instance. */
struct dla_engine *dla_get_engine(void);
#endif

View file

@ -0,0 +1,50 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FIRMWARE_DLA_ERR_H_
#define __FIRMWARE_DLA_ERR_H_
/*
 * Convert an error name into the negative return value used throughout the
 * firmware, e.g. ERR(NO_MEM) expands to -DLA_ERR_NO_MEM == -8.
 */
#define ERR(code) -DLA_ERR_##code
/* DLA firmware error codes (returned negated via ERR()) */
#define DLA_ERR_NONE 0
#define DLA_ERR_INVALID_METHOD 1
#define DLA_ERR_INVALID_TASK 2
#define DLA_ERR_INVALID_INPUT 3
#define DLA_ERR_INVALID_FALC_DMA 4
#define DLA_ERR_INVALID_QUEUE 5
#define DLA_ERR_INVALID_PREACTION 6
#define DLA_ERR_INVALID_POSTACTION 7
#define DLA_ERR_NO_MEM 8
#define DLA_ERR_INVALID_DESC_VER 9
#define DLA_ERR_INVALID_ENGINE_ID 10
#define DLA_ERR_INVALID_REGION 11
#define DLA_ERR_PROCESSOR_BUSY 12
#define DLA_ERR_RETRY 13
#define DLA_ERR_TASK_STATUS_MISMATCH 14
#endif

View file

@ -0,0 +1,886 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FIRMWARE_DLA_INTERFACE_H_
#define __FIRMWARE_DLA_INTERFACE_H_
#include <nvdla_interface.h>
/**
* @ingroup Processors
* @name DLA Processors
 * Processor modules in the DLA engine. Each processor has its
 * own operation, a.k.a. HW layer. A network is formed as a
 * graph of these operations.
* @{
*/
#define DLA_OP_BDMA 0
#define DLA_OP_CONV 1
#define DLA_OP_SDP 2
#define DLA_OP_PDP 3
#define DLA_OP_CDP 4
#define DLA_OP_RUBIK 5
/** @} */
/**
* @ingroup Processors
* @name Maximum number of processors
 * @brief DLA has 6 processors
* @{
*/
#define DLA_OP_NUM 6
/** @} */
/**
* @ingroup Processors
* @name Number of groups
* @brief Each processor has 2 groups of registers
* @{
*/
#define DLA_NUM_GROUPS 2
/** @} */
/**
* Network descriptor
*
* Contains all information to execute a network
*
* @op_head: Index of first operation of each type in operations list
* @num_rois: Number of ROIs
* @num_operations: Number of operations in one list
* @num_luts: Number of LUTs
*/
/**
 * Network descriptor: everything needed to execute one network.
 *
 * NOTE(review): the *_index fields appear to be indices into the task
 * address list (same convention as dla_data_cube.address) — confirm
 * against the UMD loader.
 */
struct dla_network_desc {
int16_t operation_desc_index; /* operation descriptor list */
int16_t surface_desc_index; /* surface descriptor list */
int16_t dependency_graph_index; /* dla_common_op_desc list */
int16_t lut_data_index; /* dla_lut_param list */
int16_t roi_array_index; /* ROI array; valid when dynamic_roi is set */
int16_t surface_index;
int16_t stat_list_index; /* statistics buffer */
int16_t reserved1;
int16_t op_head[DLA_OP_NUM]; /* first operation of each processor type */
uint16_t num_rois; /* number of ROIs */
uint16_t num_operations; /* number of operations in one list */
uint16_t num_luts; /* number of LUTs */
uint16_t num_addresses; /* number of entries in the task address list */
int16_t input_layer; /* index of the network input layer */
uint8_t dynamic_roi; /* non-zero if ROIs are supplied at runtime */
uint8_t reserved0;
} __packed __aligned(4);
/**
* @name Memory types
 * @brief DLA engine can read/write to/from 3 memory types
* @{
*/
#define DLA_MEM_MC 0 /* External DRAM */
#define DLA_MEM_CV 1 /* CV-SRAM */
#define DLA_MEM_HW 2 /* DLA sub-module */
/** @} */
/**
* @ingroup Events
* @name Operation events
* @brief Different events triggered by an operations
* @{
*/
#define DLA_EVENT_OP_COMPLETED 1
#define DLA_EVENT_OP_PROGRAMMED 2
#define DLA_EVENT_OP_ENABLED 3
#define DLA_EVENT_CDMA_WT_DONE 4
#define DLA_EVENT_CDMA_DT_DONE 5
/** @} */
/* One consumer edge in the operation dependency graph. */
struct dla_consumer {
int16_t index; /* the index of dla_common_op_desc in dep_graph_addr */
uint8_t event; /* DLA_EVENT_* the consumer is waiting for */
uint8_t res;
} __packed __aligned(4);
/* Scheduling metadata common to every operation type. */
struct dla_common_op_desc {
int16_t index; /* set by ucode */
int8_t roi_index; /* ROI this operation instance belongs to */
uint8_t op_type; /* DLA_OP_* processor type of this operation */
uint8_t dependency_count; /* number of outstanding dependencies */
uint8_t reserved0[3];
struct dla_consumer consumers[DLA_OP_NUM]; /* one consumer per processor type */
struct dla_consumer fused_parent; /* producer this op is fused with, if any */
} __packed __aligned(4);
/* Header preceding the list of dla_roi_desc entries. */
struct dla_roi_array_desc {
uint32_t array_length; /* number of dla_roi_desc entries that follow */
uint32_t array_reserved;
} __packed __aligned(4);
/* One region of interest; coordinate units not defined here — TODO confirm. */
struct dla_roi_desc {
uint32_t left;
uint32_t top;
uint32_t right;
uint32_t bottom;
} __packed __aligned(4);
/**
* @ingroup BDMA
* @name Maximum BDMA transfers
* @brief BDMA supports multiple transfers in operation. This indicates
* maximum number of transfers possible in one operation.
* @{
*/
#define NUM_MAX_BDMA_OPS 20
/** @} */
/*
 * One BDMA transfer: geometry of a strided line/surface copy.
 * NOTE(review): the line/surface fields mirror the BDMA register layout;
 * exact stride-vs-count semantics should be confirmed against the HW manual.
 */
struct dla_bdma_transfer_desc {
int16_t source_address; /* presumably an index into the task address list */
int16_t destination_address; /* presumably an index into the task address list */
uint32_t line_size;
uint32_t line_repeat;
uint32_t source_line;
uint32_t destination_line;
uint32_t surface_repeat;
uint32_t source_surface;
uint32_t destination_surface;
} __packed __aligned(4);
/* Surface description for a BDMA operation (up to NUM_MAX_BDMA_OPS transfers). */
struct dla_bdma_surface_desc {
uint8_t source_type; /* DLA_MEM_* memory type of the source */
uint8_t destination_type; /* DLA_MEM_* memory type of the destination */
uint16_t num_transfers; /* number of valid entries in transfers[] */
struct dla_bdma_transfer_desc transfers[NUM_MAX_BDMA_OPS];
} __packed __aligned(4);
/* Operation parameters for BDMA. */
struct dla_bdma_op_desc {
uint16_t num_transfers;
uint16_t reserved0;
} __packed __aligned(4);
/* Statistics counters reported by BDMA. */
struct dla_bdma_stat_desc {
uint32_t read_stall;
uint32_t write_stall;
uint32_t runtime;
} __packed __aligned(4);
/**
* @ingroup Convolution
* @name Convolution mode
* @brief Convolution modes support by DLA
* @{
*/
#define CONV_MODE_DIRECT 0
#define CONV_MODE_WINOGRAD 1
/** @} */
/**
* @ingroup Processors
* @name Precision BPE mapping
* @brief Precision formats and Bit Per Elements mapping
* @{
*/
#define BPE_PRECISION_INT8 1
#define BPE_PRECISION_INT16 2
#define BPE_PRECISION_FP16 2
/** @} */
/**
* @ingroup Processors
* @name Precision types
* @brief Precision formats supported by DLA engine
* @{
*/
#define PRECISION_INT8 0
#define PRECISION_INT16 1
#define PRECISION_FP16 2
/** @} */
/**
* @ingroup Processors
* @name Data formats
* @brief Data formats supported by DLA engine
* @{
*/
#define FORMAT_T_R8 0
#define FORMAT_T_R10 1
#define FORMAT_T_R12 2
#define FORMAT_T_R16 3
#define FORMAT_T_R16_I 4
#define FORMAT_T_R16_F 5
#define FORMAT_T_A16B16G16R16 6
#define FORMAT_T_X16B16G16R16 7
#define FORMAT_T_A16B16G16R16_F 8
#define FORMAT_T_A16Y16U16V16 9
#define FORMAT_T_V16U16Y16A16 10
#define FORMAT_T_A16Y16U16V16_F 11
#define FORMAT_T_A8B8G8R8 12
#define FORMAT_T_A8R8G8B8 13
#define FORMAT_T_B8G8R8A8 14
#define FORMAT_T_R8G8B8A8 15
#define FORMAT_T_X8B8G8R8 16
#define FORMAT_T_X8R8G8B8 17
#define FORMAT_T_B8G8R8X8 18
#define FORMAT_T_R8G8B8X8 19
#define FORMAT_T_A2B10G10R10 20
#define FORMAT_T_A2R10G10B10 21
#define FORMAT_T_B10G10R10A2 22
#define FORMAT_T_R10G10B10A2 23
#define FORMAT_T_A2Y10U10V10 24
#define FORMAT_T_V10U10Y10A2 25
#define FORMAT_T_A8Y8U8V8 26
#define FORMAT_T_V8U8Y8A8 27
#define FORMAT_T_Y8___U8V8_N444 28
#define FORMAT_T_Y8___V8U8_N444 29
#define FORMAT_T_Y10___U10V10_N444 30
#define FORMAT_T_Y10___V10U10_N444 31
#define FORMAT_T_Y12___U12V12_N444 32
#define FORMAT_T_Y12___V12U12_N444 33
#define FORMAT_T_Y16___U16V16_N444 34
#define FORMAT_T_Y16___V16U16_N444 35
#define FORMAT_FEATURE 36
/** @} */
/**
* @ingroup Convolution
* @name Pixel mapping
* @brief Pixel mapping formats supported for image input in Convolution
* @{
*/
#define MAP_PITCH_LINEAR 0
/** @} */
/**
* @ingroup Convolution
* @name Weight formats
* @brief Weight data formats supported in Convolution
* @{
*/
#define WEIGHT_FORMAT_UNCOMPRESSED 0
#define WEIGHT_FORMAT_COMPRESSED 1
/** @} */
/**
* @ingroup Convolution
* @name Mean data format
* @brief Mean data formats supported in Convolution
* @{
*/
#define MEAN_FORMAT_DISABLE 0
#define MEAN_FORMAT_ENABLE 1
/** @} */
/* Precision-converter parameters (scale/truncate/offset stage). */
struct dla_cvt_param {
int16_t scale; /* multiplier */
uint8_t truncate; /* right-shift amount — TODO confirm */
uint8_t enable; /* non-zero to enable this converter */
int32_t offset;
} __packed __aligned(4);
/* Describes one data cube (tensor) in memory. */
struct dla_data_cube {
uint16_t type; /* dla_mem_type */
int16_t address; /* offset to the actual IOVA in task.address_list */
uint32_t offset; /* offset within address */
uint32_t size;
/* cube dimensions */
uint16_t width;
uint16_t height;
uint16_t channel;
uint16_t reserved0;
/* stride information */
uint32_t line_stride;
uint32_t surf_stride;
/* For Rubik only */
uint32_t plane_stride;
} __packed __aligned(4);
#define PIXEL_OVERRIDE_UINT 0
#define PIXEL_OVERRIDE_INT 1
/* Surface (data cube) set consumed/produced by a convolution operation. */
struct dla_conv_surface_desc {
/* Data cube */
struct dla_data_cube weight_data; /* kernel weights */
struct dla_data_cube wmb_data; /* weight mask bits, for compressed weights */
struct dla_data_cube wgs_data; /* weight group sizes, for compressed weights — TODO confirm */
struct dla_data_cube src_data; /* input feature/image data */
struct dla_data_cube dst_data; /* output data */
/**
 * u_addr = input_data.source_addr + offset_u
 * this field should be set when YUV is not interleave format
 *
 */
int64_t offset_u;
/* line stride for 2nd plane, must be 32bytes aligned */
uint32_t in_line_uv_stride;
} __packed __aligned(4);
/* Operation parameters for the convolution (CONV) processor. */
struct dla_conv_op_desc {
/* Performance parameters */
/* dla_conv_mode */
uint8_t conv_mode;
uint8_t data_reuse; /* reuse input data already in CBUF */
uint8_t weight_reuse; /* reuse weights already in CBUF */
uint8_t skip_data_rls; /* keep input data in CBUF after this op */
uint8_t skip_weight_rls; /* keep weights in CBUF after this op */
uint8_t reserved0;
uint16_t entry_per_slice;
/* dla_data_format */
uint8_t data_format;
/* dla_pixel_mapping */
uint8_t pixel_mapping;
/* number of free slices before fetch */
uint16_t fetch_grain;
uint8_t reserved_b[8];
/* batch_num */
uint8_t batch;
/* dla_weight_format */
uint8_t weight_format;
uint8_t data_bank; /* number of CBUF banks for data — TODO confirm */
uint8_t weight_bank; /* number of CBUF banks for weights — TODO confirm */
/* the offset in bytes of each data cube in a batch */
uint32_t batch_stride;
uint8_t post_extension;
uint8_t pixel_override; /* PIXEL_OVERRIDE_UINT or PIXEL_OVERRIDE_INT */
/* number of slices need to be released */
uint16_t release;
/* The input cube dimension for CSC */
uint16_t input_width_csc;
uint16_t input_height_csc;
uint16_t input_channel_csc;
uint16_t kernel_width_csc;
uint16_t kernel_height_csc;
uint16_t kernel_channel_csc;
/* The input cube dimension for CMAC */
uint16_t input_width_cmac;
uint16_t input_height_cmac;
/* actual size in bytes */
uint32_t bytes_per_kernel;
/* Algorithm parameters */
int16_t mean_ry; /* mean value for red in RGB or Y in YUV */
int16_t mean_gu; /* mean value for green in RGB or U in YUV */
int16_t mean_bv; /* mean value for blue in RGB or V in YUV */
int16_t mean_ax;
uint8_t mean_format; /* dla_mean_format */
uint8_t conv_stride_x;
uint8_t conv_stride_y;
uint8_t pad_x_left;
uint8_t pad_x_right;
uint8_t pad_y_top;
uint8_t pad_y_bottom;
uint8_t dilation_x;
uint8_t dilation_y;
uint8_t reserved2[2];
/* Precision parameters */
uint8_t pra_truncate;
uint8_t in_precision;
/* The output precision from CONV, it's the MAC processing precision */
uint8_t out_precision;
int16_t pad_val; /* value used for padded input elements */
/* input converter parameters */
struct dla_cvt_param in_cvt;
/* output converter parameters, support truncate only */
struct dla_cvt_param out_cvt;
} __packed __aligned(4);
/* Statistics counters reported by the convolution processor. */
struct dla_conv_stat_desc {
uint32_t data_read_stall;
uint32_t weight_read_stall;
uint32_t data_read_latency;
uint32_t weight_read_latency;
uint32_t saturation_count;
uint32_t nan_data_num;
uint32_t nan_weight_num;
uint32_t inf_data_num;
uint32_t inf_weight_num;
uint32_t runtime;
} __packed __aligned(4);
/**
* @ingroup SDP
* @name Activation functions
* @brief Activation functions supported in SDP
* @{
*/
#define ACTIVATION_NONE 0
#define ACTIVATION_RELU 1
#define ACTIVATION_LUT 2
#define ACTIVATION_PRELU 3
/** @} */
/**
* @ingroup LUT
* @name LUT size
 * @brief LUT sizes for linear and exponential LUT
* @{
*/
#define LUT_LINEAR_EXP_TABLE_ENTRY_LOG2 6
#define LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2 8
/** @} */
/**
* @ingroup LUT
* @name LUT types
 * @brief DLA supports two types of LUT, linear and exponential
* @{
*/
#define LUT_LINEAR_EXP_TABLE 0
#define LUT_LINEAR_ONLY_TABLE 1
/** @} */
/**
* @ingroup LUT
* @name LUT methods
 * @brief A LUT can be indexed using exponential or linear methods
* @{
*/
#define LUT_METHOD_EXPONENTIAL 0
#define LUT_METHOD_LINEAR 1
/** @} */
/**
* @ingroup LUT
* @name LUT
 * @brief Priority selection between the two LUT tables
* @{
*/
#define LUT_PRI_LINEAR_EXP 0
#define LUT_PRI_LINEAR_ONLY 1
/** @} */
/* Input-conditioning offset applied before a LUT lookup. */
union dla_lut_offset {
/**
 * Number to be subtracted in the log domain before looking up the
 * exponential table. It has the same definition as the hardware
 * field, so input scaling must also be taken into account when
 * setting this field.
 */
int8_t exp_offset;
/**
 * Number of bits to right-shift before looking up the
 * linear table.
 */
int8_t frac_bits;
uint16_t reserved0; /* presumably pads the union to 16 bits */
};
/**
 * This struct is used to represent floating point values by INT
 * suppose we have a float point number fp_x, it will be represented
 * as:
 *
 * fp_x = scale_int_x>>(shifter_x)
 *
 * This is very useful for INT pipeline;
 */
struct dla_float_data {
int16_t scale;
int8_t shifter;
uint8_t reserved0;
} __packed __aligned(4);
/**
 * For INT pipeline, we use the struct above to represent a floating number;
 * For FP16 pipeline, we should store the FP16 encoded value into a uint16_t
 * container
 */
union dla_slope {
struct dla_float_data data_i;
uint16_t data_f;
};
/* Full LUT configuration: table contents, ranges, slopes and priorities. */
struct dla_lut_param {
/**
 * value of expression ((1<<LUT_LINEAR_EXP_TABLE_ENTRY_LOG2)+1) is 65,
 * ((1<<LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2)+1) is 257, and int16_t is of
 * 2Byte. And below two statement's combined memory size is 644 Byte.
 *
 * NOTE: below two declaration combined size should always be multiple
 * of 4.
 */
int16_t linear_exp_table[(1<<LUT_LINEAR_EXP_TABLE_ENTRY_LOG2)+1];
int16_t linear_only_table[(1<<LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2)+1];
union dla_lut_offset linear_exp_offset;
union dla_lut_offset linear_only_offset;
/**
 * The start and end point of raw table,
 * valid when raw_method=LINEAR only
 */
uint64_t linear_exp_start;
uint64_t linear_exp_end;
uint64_t linear_only_start;
uint64_t linear_only_end;
/* slopes used to extrapolate outside the table's covered range */
union dla_slope linear_exp_underflow_slope;
union dla_slope linear_exp_overflow_slope;
union dla_slope linear_only_underflow_slope;
union dla_slope linear_only_overflow_slope;
/**
 * dla_lut_priority, when both lut are hit (or one overflows and
 * the other underflows), which one should be selected as output
 */
uint8_t hybrid_priority;
uint8_t underflow_priority;
uint8_t overflow_priority;
uint8_t method; /* dla_lut_method */
} __packed __aligned(4);
/* Surface (data cube) set for an SDP operation. */
struct dla_sdp_surface_desc {
/* Data cube */
/* source input cube, available when SDP working on offline mode */
struct dla_data_cube src_data;
/* X1 input cube */
struct dla_data_cube x1_data;
/* X2 input cube */
struct dla_data_cube x2_data;
/* Y input cube */
struct dla_data_cube y_data;
/* Output cube */
struct dla_data_cube dst_data;
} __packed __aligned(4);
/* dla_sdp_op_type: which sub-units (ALU/MUL) an SDP stage uses */
#define SDP_OP_NONE 0
#define SDP_OP_MUL 1
#define SDP_OP_ADD 2
#define SDP_OP_BOTH 3
/* dla_sdp_alu_op_type: ALU operation */
#define SDP_ALU_OP_MAX 0
#define SDP_ALU_OP_MIN 1
#define SDP_ALU_OP_SUM 2
#define SDP_ALU_OP_EQL 3
/* dla_sdp_op_mode: operand granularity */
#define SDP_OP_PER_LAYER 0
#define SDP_OP_PER_KERNEL 1
#define SDP_OP_PER_POINT 2
/* Converter pair for one SDP stage (ALU path and MUL path). */
struct dla_sdp_cvt {
struct dla_cvt_param alu_cvt;
struct dla_cvt_param mul_cvt;
} __packed __aligned(4);
/* Configuration of one SDP stage (X1, X2 or Y). */
struct dla_sdp_op {
uint8_t enable;
uint8_t alu_type; /* dla_sdp_alu_op_type */
uint8_t type; /* dla_sdp_op_type */
uint8_t mode; /* dla_sdp_op_mode */
uint8_t act; /* dla_act_type */
uint8_t shift_value; /* left shift */
uint8_t truncate;
uint8_t precision;
int32_t alu_operand; /* operand used in SDP_OP_PER_LAYER mode — TODO confirm */
int32_t mul_operand;
struct dla_sdp_cvt cvt;
} __packed __aligned(4);
/* Operation parameters for the SDP processor. */
struct dla_sdp_op_desc {
/* Precision parameters */
/* dla_precision */
uint8_t src_precision;
uint8_t dst_precision;
int16_t lut_index; /* LUT used for ACTIVATION_LUT, index into LUT list */
struct dla_cvt_param out_cvt;
/* Performance parameters */
/* dla_conv_mode */
uint8_t conv_mode;
uint8_t batch_num;
uint16_t reserved0;
uint32_t batch_stride; /* will be used when batch_num > 1 */
/* Algorithm parameters */
struct dla_sdp_op x1_op;
struct dla_sdp_op x2_op;
struct dla_sdp_op y_op;
} __packed __aligned(4);
/* Statistics counters reported by the SDP processor. */
struct dla_sdp_stat_desc {
uint32_t nan_input_num;
uint32_t inf_input_num;
uint32_t nan_output_num;
uint32_t wdma_write_stall;
uint32_t lut_underflow;
uint32_t lut_overflow;
uint32_t lut_hybrid;
uint32_t lut_le_hit;
uint32_t lut_lo_hit;
uint32_t saturation_count;
uint32_t runtime;
} __packed __aligned(4);
/* dla_pool_mode */
#define POOL_MODE_AVG 0
#define POOL_MODE_MAX 1
#define POOL_MODE_MIN 2
/* dla_pool_width/dla_pool_height: window size is the define value + 1 */
#define POOL_SIZE_1 0
#define POOL_SIZE_2 1
#define POOL_SIZE_3 2
#define POOL_SIZE_4 3
#define POOL_SIZE_5 4
#define POOL_SIZE_6 5
#define POOL_SIZE_7 6
#define POOL_SIZE_8 7
#define PDP_PAD_VAL_NUM 7
/* Surface (data cube) set for a PDP (pooling) operation. */
struct dla_pdp_surface_desc {
/* Data cube */
struct dla_data_cube src_data;
struct dla_data_cube dst_data;
} __packed __aligned(4);
/* Operation parameters for the PDP (planar data / pooling) processor. */
struct dla_pdp_op_desc {
/* Performance parameters */
/* widths for split-mode processing of wide inputs — TODO confirm */
uint16_t partial_in_width_first;
uint16_t partial_in_width_mid;
uint16_t partial_in_width_last;
uint16_t partial_width_first;
uint16_t partial_width_mid;
uint16_t partial_width_last;
uint8_t split_num;
/* Algorithm parameters */
uint8_t pool_mode; /* dla_pool_mode */
uint8_t pool_width; /* dla_pool_width */
uint8_t pool_height; /* dla_pool_height */
uint8_t stride_x;
uint8_t stride_y;
/**
 * The left/right padding size,
 * pad_right might be less than pad_left
 */
uint8_t pad_left;
uint8_t pad_right;
/* The top/bottom padding size */
uint8_t pad_top;
uint8_t pad_bottom;
/* Precision parameters */
uint8_t precision; /* dla_precision */
uint8_t reserved0;
/**
 * if input has non-zero "offset", this value should be set
 * There'll be 7 different padding values, the relationship between
 * those versions are:
 * padding_value[0] = -offset*scaling;
 * padding_value[1] = 2*padding_value[0]
 * padding_value[2] = 3*padding_value[0]
 * ...
 * The purpose is to avoid ucode implement FP16
 * multiplier(for FP16 mode)
 */
int32_t padding_value[PDP_PAD_VAL_NUM];
} __packed __aligned(4);
/* Statistics counters reported by the PDP processor. */
struct dla_pdp_stat_desc {
uint32_t inf_input_num;
uint32_t nan_input_num;
uint32_t nan_output_num;
uint32_t write_stall;
uint32_t runtime;
} __packed __aligned(4);
/* Surface (data cube) set for a CDP (cross-channel data) operation. */
struct dla_cdp_surface_desc {
/* Data cube */
struct dla_data_cube src_data;
struct dla_data_cube dst_data;
} __packed __aligned(4);
/* Operation parameters for the CDP processor (cross-channel LRN). */
struct dla_cdp_op_desc {
/* Precision parameters */
/* dla_precision */
uint8_t in_precision;
uint8_t out_precision;
int16_t lut_index; /* LUT used by this operation, index into LUT list */
struct dla_cvt_param in_cvt;
struct dla_cvt_param out_cvt;
/* Performance parameters */
/* Algorithm parameters */
uint8_t local_size; /* normalization window size — TODO confirm */
uint8_t bypass_sqsum; /* bypass the square-sum stage */
uint8_t bypass_out_mul; /* bypass the output multiplier stage */
uint8_t reserved0;
} __packed __aligned(4);
/* Statistics counters reported by the CDP processor. */
struct dla_cdp_stat_desc {
uint32_t nan_input_num;
uint32_t inf_input_num;
uint32_t nan_output_num;
uint32_t write_stall;
uint32_t lut_uflow;
uint32_t lut_oflow;
uint32_t lut_hybrid;
uint32_t lut_le_hit;
uint32_t lut_lo_hit;
uint32_t saturation_count;
uint32_t runtime;
} __packed __aligned(4);
/* Surface (data cube) set for a Rubik (data-reshape) operation. */
struct dla_rubik_surface_desc {
/* Data cube */
struct dla_data_cube src_data;
struct dla_data_cube dst_data;
} __packed __aligned(4);
/* rubik mode */
#define RUBIK_MODE_CONTRACT 0
#define RUBIK_MODE_SPLIT 1
#define RUBIK_MODE_MERGE 2
/* Operation parameters for the Rubik processor. */
struct dla_rubik_op_desc {
/* Precision parameters */
uint8_t mode; /* RUBIK_MODE_* */
uint8_t precision; /* dla_precision */
uint8_t stride_x;
uint8_t stride_y;
} __packed __aligned(4);
/* Statistics counters reported by the Rubik processor. */
struct dla_rubik_stat_desc {
uint32_t read_stall;
uint32_t write_stall;
uint32_t runtime;
} __packed __aligned(4);
/* Surface descriptor of any processor type; discriminated by op_type. */
union dla_surface_container {
struct dla_bdma_surface_desc bdma_surface;
struct dla_conv_surface_desc conv_surface;
struct dla_sdp_surface_desc sdp_surface;
struct dla_pdp_surface_desc pdp_surface;
struct dla_cdp_surface_desc cdp_surface;
struct dla_rubik_surface_desc rubik_surface;
};
/* Operation descriptor of any processor type; discriminated by op_type. */
union dla_operation_container {
struct dla_bdma_op_desc bdma_op;
struct dla_conv_op_desc conv_op;
struct dla_sdp_op_desc sdp_op;
struct dla_pdp_op_desc pdp_op;
struct dla_cdp_op_desc cdp_op;
struct dla_rubik_op_desc rubik_op;
};
/* Statistics block of any processor type; discriminated by op_type. */
union dla_stat_container {
struct dla_bdma_stat_desc bdma_stat;
struct dla_conv_stat_desc conv_stat;
struct dla_sdp_stat_desc sdp_stat;
struct dla_pdp_stat_desc pdp_stat;
struct dla_cdp_stat_desc cdp_stat;
struct dla_rubik_stat_desc rubik_stat;
};
/**
* status notifier structure
*
 * @timestamp: 64-bit timestamp representing the time at which
 * the notifier was written
* @status_engine: status work captured from HW engine
* @subframe: NA
* @status_task: status word as configured from an action list
*/
/* Status notifier written back on task completion. */
struct dla_task_status {
uint64_t timestamp; /* time at which the notifier was written */
uint32_t status_engine; /* status word captured from the HW engine */
uint16_t subframe; /* NA */
uint16_t status_task; /* status word as configured from an action list */
} __packed __aligned(4);
#endif

View file

@ -0,0 +1,74 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __DLA_SCHED_H_
#define __DLA_SCHED_H_
/**
 * One DLA task, built by the portability layer and consumed by the
 * firmware scheduler. The *_addr fields are engine-visible addresses of
 * the per-task descriptor lists.
 */
struct dla_task {
/* platform specific data to communicate with portability layer */
void *task_data;
/* task state */
uint32_t state;
/* Task base address */
uint64_t base;
/* start address of a list of dla_operation_container */
uint64_t operation_desc_addr;
/* start address of a list of dla_surface_container */
uint64_t surface_desc_addr;
/* start address of a list of dla_common_op_desc */
uint64_t dependency_graph_addr;
/* start address of a list of dla_lut_param */
uint64_t lut_data_addr;
/*
 * start address of a list of dla_roi_desc,
 * the first one is dla_roi_array_desc
 * valid when network.dynamic_roi is true
 */
uint64_t roi_array_addr;
/* start address of a list of dla_surface_container */
uint64_t surface_addr;
/* start address of a list of dla_stat_container */
uint64_t stat_data_addr;
} __packed __aligned(256);
/**
* @brief Configuration parameters supported by the engine
*
* atom_size Memory smallest access size
* bdma_enable Defines whether bdma is supported
* rubik_enable Defines whether rubik is supported
* weight_compress_support Defines whether weight data compression is supported
*/
/* Engine capability configuration (see parameter list in the header above). */
struct dla_config {
uint32_t atom_size; /* memory smallest access size, in bytes */
bool bdma_enable; /* whether the BDMA processor is supported */
bool rubik_enable; /* whether the Rubik processor is supported */
bool weight_compress_support; /* whether weight data compression is supported */
};
#endif

View file

@ -0,0 +1,327 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __NVDLA_INTERFACE_H_
#define __NVDLA_INTERFACE_H_
#include <linux/types.h>
/**
* @brief Register driver to firmware
*
* Implementation in firmware, called by portability layer
*
* This function must be called once during boot to initialize DLA
* engine scheduler and register driver with firmware before submitting
* any task. Pass pointer to driver context in @param driver_context
* which is passed as param when firmware calls any function
* of portability layer. It also updates pointer to engine context
* which must be passed in any function call to firmware after this point.
*
* @param engine_context Pointer to engine specific data
* @param driver_context Pointer to driver specific data
*
* @return 0 on success and negative on error
*/
int32_t dla_register_driver(void **engine_context, void *driver_context);
/**
* @brief Interrupt handler
*
* Implementation in firmware, called by portability layer
*
* This function is called when DLA interrupt is received. Portability layer
 * should register its own handler using the mechanism supported by that platform
* and call this function from the handler. Call to this function must be
* protected by lock to prevent handling interrupt when firmware is programming
* layers in process context.
*
* @param engine_context Engine specific data received in dla_register_driver
*
* @return 0 on success and negative on error
*/
int32_t dla_isr_handler(void *engine_context);
/**
* @brief Process events recorded in interrupt handler
*
* Implementation in firmware, called by portability layer
*
* Interrupt handler just records events and does not process those events.
* Portability layer must call this function in thread/process context after
* interrupt handler is done.
*
* @param engine_context Engine specific data received in dla_register_driver
* @param task_complete Pointer to parameter to indicate task complete,
 *                      firmware writes 1 to it if all layers are processed.
*
* @return 0 on success and negative on error
*
*/
int32_t dla_process_events(void *engine_context, uint32_t *task_complete);
/**
* @brief Clear task from firmware
*
* Implementation in firmware, called by portability layer
*
* This function resets engine scheduler state including op descriptor cache,
* error values, sub-engine status, events etc and clears previous task state
* from firmware. This function can be called by portability layer after
* task completion. It is not mandatory to call it but calling it will
* ensure clean state before next task execution.
*
* @param engine_context Engine specific data received in dla_register_driver
*
 * @return None (this function returns void)
*
*/
void dla_clear_task(void *engine_context);
/**
* @brief Execute task
*
* Implementation in firmware, called by portability layer
*
* This function initializes sub-engines and starts task execution. Further
* programming and layer scheduling is triggered by events received from
* hardware.
*
* @param engine_context Engine specific data received in dla_register_driver
* @param task_data Task specific data to be passed when reading task info
* @param config_data Configuration data to be passed
*
* @return 0 on success and negative on error
*
*/
int32_t dla_execute_task(void *engine_context, void *task_data, void *config_data);
/**
* @brief Register read
*
* Implementation in portability layer, called by firmware
*
* Read DLA HW register. Portability layer is responsible to use correct
* base address and for any IO mapping if required.
*
 * @param driver_context Driver specific data received in dla_register_driver
* @param addr Register offset
*
* @return Register value
*
*/
uint32_t dla_reg_read(void *driver_context, uint32_t addr);
/**
* @brief Register write
*
* Implementation in portability layer, called by firmware
*
 * Write DLA HW register. Portability layer is responsible to use correct
* base address and for any IO mapping if required.
*
* @param driver_context Driver specific data received in dla_register_driver
* @param addr Register offset
* @param reg Value to write
*
*/
void dla_reg_write(void *driver_context, uint32_t addr, uint32_t reg);
/**
* @brief Read data from DMA mapped memory in local buffer
*
* Implementation in portability layer, called by firmware
*
* This function reads data from buffers passed by UMD in local memory.
* Addresses for buffers passed by are shared in address list and network
* descriptor contains index in address list for those buffers. Firmware
* reads this data from buffer shared by UMD into local buffer to consume
* the information.
*
* @param driver_context Driver specific data received in dla_register_driver
* @param task_data Task specific data received in dla_execute_task
* @param src Index in address list
* @param dst Pointer to local memory
* @param size Size of data to copy
* @param offset Offset from start of UMD buffer
*
* @return 0 on success and negative on error
*
*/
int32_t dla_data_read(void *driver_context, void *task_data,
uint64_t src, void *dst,
uint32_t size, uint64_t offset);
/**
* @brief Write data to DMA mapped memory from local buffer
*
* Implementation in portability layer, called by firmware
*
* This function writes data from local buffer to buffer passed by UMD.
* Addresses for buffers passed by are shared in address list and network
* descriptor contains index in address list for those buffers. Firmware
* writes this data to buffer shared by UMD from local buffer to update
* the information.
*
* @param driver_context Driver specific data received in dla_register_driver
* @param task_data Task specific data received in dla_execute_task
* @param src Pointer to local memory
* @param dst Index in address list
* @param size Size of data to copy
* @param offset Offset from start of UMD buffer
*
* @return 0 on success and negative on error
*
*/
int32_t dla_data_write(void *driver_context, void *task_data,
void *src, uint64_t dst,
uint32_t size, uint64_t offset);
/* Destination for DMA buffer */
#define DESTINATION_PROCESSOR 0
#define DESTINATION_DMA 1
/**
* @brief Read DMA address
*
* Implementation in portability layer, called by firmware
*
* Some buffers shared by UMD are accessed by processor responsible for
* programming DLA HW. It would be companion micro-controller in case of
* headed config while main CPU in case of headless config. Also, some
* buffers are accessed by DLA DMA engines inside sub-engines. This function
* should return proper address accessible by destination user depending
* on config.
*
* @param driver_context Driver specific data received in dla_register_driver
* @param task_data Task specific data received in dla_execute_task
* @param index Index in address list
* @param dst_ptr Pointer to update address
* @param destination Destination user for DMA address
*
* @return 0 on success and negative on error
*
*/
int32_t dla_get_dma_address(void *driver_context, void *task_data,
int16_t index, void *dst_ptr,
uint32_t destination);
/**
* @brief Read time value in micro-seconds
*
* Implementation in portability layer, called by firmware
*
* Read system time in micro-seconds
*
* @return Time value in micro-seconds
*
*/
int64_t dla_get_time_us(void);
/**
* @brief Print debug message
*
* Implementation in portability layer, called by firmware
*
* Print debug message to console
*
* @param str Format string and variable arguments
*
*/
void dla_debug(const char *str, ...);
/**
* @brief Print information message
*
* Implementation in portability layer, called by firmware
*
* Print information message to console
*
* @param str Format string and variable arguments
*
*/
void dla_info(const char *str, ...);
/**
* @brief Print warning message
*
* Implementation in portability layer, called by firmware
*
* Print warning message to console
*
* @param str Format string and variable arguments
*
*/
void dla_warn(const char *str, ...);
/**
* @brief Print error message
*
* Implementation in portability layer, called by firmware
*
* Print error message to console
*
* @param str Format string and variable arguments
*
*/
void dla_error(const char *str, ...);
/**
* @brief Fill memory region
*
* Implementation in portability layer, called by firmware
*
* Fills the first len bytes of the memory area pointed to by src
* with the constant byte ch.
*
* @param src Memory area address
* @param ch Byte to fill
* @param len Length of memory area to fill
*
* @return Memory area address
*
*/
void *dla_memset(void *src, int ch, uint64_t len);
/**
* @brief Copy memory
*
* Implementation in portability layer, called by firmware
*
* Copies len bytes from memory area src to memory area dest.
*
* @param dest Destination memory area address
* @param src Source memory area address
* @param len Length of memory area to copy
*
* @return Destination memory area address
*
*/
void *dla_memcpy(void *dest, const void *src, uint64_t len);
#endif

View file

@ -0,0 +1,138 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __LINUX_NVDLA_IOCTL_H
#define __LINUX_NVDLA_IOCTL_H
#include <linux/ioctl.h>
#include <linux/types.h>
#if !defined(__KERNEL__)
#define __user
#endif
/**
* struct nvdla_mem_handle structure for memory handles
*
* @handle handle to DMA buffer allocated in userspace
* @reserved Reserved for padding
* @offset offset in bytes from start address of buffer
*
*/
struct nvdla_mem_handle {
	__u32 handle;	/* GEM/PRIME handle of the DMA buffer allocated in userspace */
	__u32 reserved;	/* padding for 64-bit alignment */
	__u64 offset;	/* byte offset from the start of the buffer */
};
/**
* struct nvdla_ioctl_submit_task structure for single task information
*
* @num_addresses total number of entries in address_list
* @reserved Reserved for padding
* @address_list pointer to array of struct nvdla_mem_handle
*
*/
struct nvdla_ioctl_submit_task {
#define NVDLA_MAX_BUFFERS_PER_TASK (6144)
	__u32 num_addresses;	/* entries in address_list */
#define NVDLA_NO_TIMEOUT    (0xffffffff)
	__u32 timeout;		/* task timeout; NVDLA_NO_TIMEOUT disables it (semantics handled elsewhere) */
	__u64 address_list;	/* user pointer to array of struct nvdla_mem_handle */
};
/**
* struct nvdla_submit_args structure for task submit
*
* @tasks pointer to array of struct nvdla_ioctl_submit_task
* @num_tasks number of entries in tasks
* @flags flags for task submit, no flags defined yet
* @version version of task structure
*
*/
struct nvdla_submit_args {
	__u64 tasks;		/* user pointer to array of struct nvdla_ioctl_submit_task */
	__u16 num_tasks;	/* entries in tasks */
#define NVDLA_MAX_TASKS_PER_SUBMIT	24
#define NVDLA_SUBMIT_FLAGS_ATOMIC	(1 << 0)
	__u16 flags;		/* submit flags, see NVDLA_SUBMIT_FLAGS_* */
	__u32 version;		/* task structure version */
};
/**
* struct nvdla_gem_create_args for allocating DMA buffer through GEM
*
* @handle handle updated by kernel after allocation
* @flags implementation specific flags
* @size size of buffer to allocate
*/
struct nvdla_gem_create_args {
	__u32 handle;	/* out: handle created by the kernel */
	__u32 flags;	/* implementation-specific flags */
	__u64 size;	/* in: buffer size to allocate, bytes */
};
/**
* struct nvdla_gem_map_offset_args for mapping DMA buffer
*
* @handle handle of the buffer
* @reserved reserved for padding
* @offset offset updated by kernel after mapping
*/
struct nvdla_gem_map_offset_args {
	__u32 handle;	/* in: buffer handle */
	__u32 reserved;	/* padding for 64-bit alignment */
	__u64 offset;	/* out: fake mmap offset to pass to mmap(2) */
};
/**
* struct nvdla_gem_destroy_args for destroying DMA buffer
*
* @handle handle of the buffer
*/
struct nvdla_gem_destroy_args {
	__u32 handle;	/* in: handle of the buffer to destroy */
};
#define DRM_NVDLA_SUBMIT 0x00
#define DRM_NVDLA_GEM_CREATE 0x01
#define DRM_NVDLA_GEM_MMAP 0x02
#define DRM_NVDLA_GEM_DESTROY 0x03
#define DRM_IOCTL_NVDLA_SUBMIT DRM_IOWR(DRM_COMMAND_BASE + DRM_NVDLA_SUBMIT, struct nvdla_submit_args)
#define DRM_IOCTL_NVDLA_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_NVDLA_GEM_CREATE, struct nvdla_gem_create_args)
#define DRM_IOCTL_NVDLA_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_NVDLA_GEM_MMAP, struct nvdla_gem_map_offset_args)
#define DRM_IOCTL_NVDLA_GEM_DESTROY DRM_IOWR(DRM_COMMAND_BASE + DRM_NVDLA_GEM_DESTROY, struct nvdla_gem_destroy_args)
#endif

View file

@ -0,0 +1,153 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __LINUX_NVDLA_LINUX_H_
#define __LINUX_NVDLA_LINUX_H_
#include <linux/completion.h>
#include <linux/device.h>
#include <linux/kref.h>
#include <linux/platform_device.h>
#include <linux/spinlock.h>
/**
* @brief Task information submitted from user space
*
* ref Reference count for task
* num_addresses Number of addresses in address list
* nvdla_dev Pointer to NVDLA device
* address_list Address list
* file DRM file instance
*/
/* One task submitted from user space; owns a kernel copy of its address list. */
struct nvdla_task {
	struct kref ref;			/* task reference count */
	uint32_t num_addresses;			/* entries in address_list */
	struct nvdla_device *nvdla_dev;		/* device this task runs on */
	struct nvdla_mem_handle *address_list;	/* kernel copy of the UMD address list */
	struct drm_file *file;			/* DRM file the task was submitted on */
};
/**
* @brief Configuration parameters supported by the engine
*
* atom_size Memory smallest access size
* bdma_enable Defines whether bdma is supported
* rubik_enable Defines whether rubik is supported
* weight_compress_support Defines whether weight data compression is supported
*/
/* Hardware feature configuration, selected by DT compatible string. */
struct nvdla_config
{
	uint32_t atom_size;		/* smallest memory access size, bytes */
	bool bdma_enable;		/* BDMA engine present */
	bool rubik_enable;		/* Rubik engine present */
	bool weight_compress_support;	/* compressed weight data supported */
};
/**
* @brief NVDLA device
*
* irq Interrupt number associated with this device
* ref Reference count for device
* base IO mapped base address for device
* nvdla_lock Spinlock used for synchronization
* drm DRM device instance
* task Pointer to task in execution
* config_data Pointer to the configuration data
* pdev Pointer to NVDLA platform device
* event_notifier Completion object used to wait for events from HW
* engine_context Private data passed from engine in dla_engine_init
*/
/* Per-instance driver state shared between probe, ISR and submit paths. */
struct nvdla_device {
	int32_t irq;				/* engine interrupt line */
	struct kref ref;			/* device reference count */
	void __iomem *base;			/* mapped register window */
	spinlock_t nvdla_lock;			/* serialises ISR vs. event processing */
	struct drm_device *drm;			/* DRM device instance */
	struct nvdla_task *task;		/* task currently executing */
	struct nvdla_config *config_data;	/* hardware feature configuration */
	struct platform_device *pdev;		/* backing platform device */
	struct completion event_notifier;	/* signalled by the ISR on HW events */
	void *engine_context;			/* opaque state from dla_engine_init */
};
/**
* @brief Submit task
*
* This function submits task to NVDLA engine.
*
* @param nvdla_dev Pointer to NVDLA device
* @param task Pointer to task
* @return 0 on success and negative on error
*
*/
int32_t nvdla_task_submit(struct nvdla_device *nvdla_dev, struct nvdla_task *task);
/**
* @brief Get DMA address
*
* This function gets DMA address for given fd
*
* @param dev DRM device instance
* @param file DRM file instance
* @param fd File desriptor for DMA buffer
* @param addr Pointer to update DMA address
* @return 0 on success and negative on error
*
*/
int32_t nvdla_gem_dma_addr(struct drm_device *dev, struct drm_file *file,
uint32_t fd, dma_addr_t *addr);
/**
* @brief DRM probe
*
* Probe function for DRM device
*
* @param nvdla_dev NVDLA device pointer
* @return 0 on success and negative on error
*
*/
int32_t nvdla_drm_probe(struct nvdla_device *nvdla_dev);
/**
* @brief DRM remove
*
* Remove function for DRM device
*
* @param nvdla_dev NVDLA device pointer
*
*/
void nvdla_drm_remove(struct nvdla_device *nvdla_dev);
#endif

View file

@ -0,0 +1,40 @@
/*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __OPENDLA_H_
#define __OPENDLA_H_
#define DLA_2_CONFIG
#ifdef DLA_2_CONFIG
#include <opendla_small.h>
#else
#include <opendla_initial.h>
#endif
#endif

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,448 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdarg.h>
#include <linux/dma-buf.h>
#include <linux/dma-mapping.h>
#include <linux/fs.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/of_irq.h>
#include <linux/of_platform.h>
#include <linux/platform_device.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/time.h>
#include <linux/uaccess.h>
#include <nvdla_interface.h>
#include <nvdla_linux.h>
#include <nvdla_ioctl.h>
#include <opendla.h>
static struct nvdla_config nvdla_config_os_initial = {
.atom_size = 32,
.bdma_enable = true,
.rubik_enable = true,
.weight_compress_support = true,
};
static struct nvdla_config nvdla_config_small = {
.atom_size = 8,
.bdma_enable = false,
.rubik_enable = false,
.weight_compress_support = false,
};
static struct nvdla_config nvdla_config_large = {
.atom_size = 32,
.bdma_enable = false,
.rubik_enable = false,
.weight_compress_support = false,
};
/*
 * Portability-layer debug print: forwards the firmware's printf-style
 * format string and variadic arguments to the kernel log.
 * NOTE(review): pr_fmt() is applied to a non-literal argument; with the
 * default pr_fmt definition this is a no-op, but a file-local pr_fmt
 * redefinition would not expand as intended — confirm.
 */
void dla_debug(const char *str, ...)
{
	va_list args;

	va_start(args, str);
	vprintk(pr_fmt(str), args);
	va_end(args);
}
/* Portability-layer info print: forward firmware message to the kernel log. */
void dla_info(const char *str, ...)
{
	va_list args;

	va_start(args, str);
	vprintk(str, args);
	va_end(args);
}
/* Portability-layer warning print: forward firmware message to the kernel log. */
void dla_warn(const char *str, ...)
{
	va_list args;

	va_start(args, str);
	vprintk(str, args);
	va_end(args);
}
/* Portability-layer error print: forward firmware message to the kernel log. */
void dla_error(const char *str, ...)
{
	va_list args;

	va_start(args, str);
	vprintk(str, args);
	va_end(args);
}
/*
 * Fill the first @len bytes at @src with byte @ch.
 * Thin memset() wrapper so the firmware core stays platform-agnostic.
 * Returns @src.
 */
void *dla_memset(void *src, int ch, uint64_t len)
{
	return memset(src, ch, len);
}
/*
 * Copy @len bytes from @src to @dest (non-overlapping regions).
 * Thin memcpy() wrapper for the firmware portability layer.
 * Returns @dest.
 */
void *dla_memcpy(void *dest, const void *src, uint64_t len)
{
	memcpy(dest, src, len);
	return dest;
}
/* Return monotonic system time in microseconds for firmware timing. */
int64_t dla_get_time_us(void)
{
	return ktime_get_ns() / NSEC_PER_USEC;
}
/*
 * Write @reg to the engine register at byte offset @addr.
 * Silently ignored when no device context is bound.
 */
void dla_reg_write(void *driver_context, uint32_t addr, uint32_t reg)
{
	struct nvdla_device *dev = driver_context;

	if (dev)
		writel(reg, dev->base + addr);
}
/*
 * Read the engine register at byte offset @addr.
 * Returns 0 when no device context is bound.
 */
uint32_t dla_reg_read(void *driver_context, uint32_t addr)
{
	struct nvdla_device *dev = driver_context;

	return dev ? readl(dev->base + addr) : 0;
}
/*
 * Engine hardware interrupt handler.
 *
 * Runs the firmware ISR under nvdla_lock so it cannot race
 * dla_process_events() on the submission path, then wakes the
 * task-submission thread blocked on event_notifier.
 */
static irqreturn_t nvdla_engine_isr(int32_t irq, void *data)
{
	unsigned long flags;
	struct nvdla_device *nvdla_dev = (struct nvdla_device *)data;

	if (!nvdla_dev)
		return IRQ_NONE;

	spin_lock_irqsave(&nvdla_dev->nvdla_lock, flags);
	dla_isr_handler(nvdla_dev->engine_context);
	complete(&nvdla_dev->event_notifier);
	spin_unlock_irqrestore(&nvdla_dev->nvdla_lock, flags);

	return IRQ_HANDLED;
}
/*
 * Resolve address-list entry @index to a DMA (IOVA) address.
 *
 * Looks up the dma-buf behind address_list[index] and writes its DMA
 * address plus the entry's byte offset to @dst.
 *
 * Returns 0 on success, -EINVAL for an out-of-range index, or the
 * error from nvdla_gem_dma_addr().
 */
static int32_t dla_read_dma_address(void *driver_context, void *task_data,
						int16_t index, void *dst)
{
	int32_t ret;
	struct nvdla_mem_handle *handles;
	dma_addr_t *phys_addr = (dma_addr_t *)(dst);
	struct nvdla_device *nvdla_dev =
			(struct nvdla_device *)driver_context;
	struct nvdla_task *task = (struct nvdla_task *)task_data;

	/*
	 * Reject all negative indices, and index == num_addresses:
	 * the original '>' comparison allowed reading one entry past
	 * the end of the address list.
	 */
	if (index < 0 || (uint32_t)index >= task->num_addresses)
		return -EINVAL;

	handles = (struct nvdla_mem_handle *)task->address_list;
	ret = nvdla_gem_dma_addr(nvdla_dev->drm, task->file,
					handles[index].handle,
					phys_addr);
	if (ret)
		return ret;

	/* Add offset to IOVA address */
	*phys_addr = *phys_addr + handles[index].offset;

	return 0;
}
/*
 * Resolve address-list entry @index for CPU-side access.
 *
 * The processor path does not hand out a raw pointer: it returns the
 * address-list index itself, which dla_data_read()/dla_data_write()
 * later use to locate the dma-buf.
 *
 * Returns 0 on success, -EINVAL for an out-of-range index.
 */
static int32_t dla_read_cpu_address(void *driver_context, void *task_data,
						int16_t index, void *dst)
{
	uint64_t *temp = (uint64_t *)dst;
	struct nvdla_task *task = (struct nvdla_task *)task_data;

	/* was 'index > num_addresses': off-by-one allowed index == count */
	if (index < 0 || (uint32_t)index >= task->num_addresses)
		return -EINVAL;

	*temp = (uint64_t)index;

	return 0;
}
/*
 * Dispatch an address-list lookup to the CPU or DMA resolver depending
 * on which agent (processor or DLA DMA engine) will consume the address.
 *
 * Returns 0 on success, -EINVAL for an unknown destination or a
 * resolver error code.
 */
int32_t dla_get_dma_address(void *driver_context, void *task_data,
				int16_t index, void *dst_ptr,
				uint32_t destination)
{
	switch (destination) {
	case DESTINATION_PROCESSOR:
		return dla_read_cpu_address(driver_context, task_data,
						index, dst_ptr);
	case DESTINATION_DMA:
		return dla_read_dma_address(driver_context, task_data,
						index, dst_ptr);
	default:
		return -EINVAL;
	}
}
int32_t dla_data_write(void *driver_context, void *task_data,
void *src, uint64_t dst,
uint32_t size, uint64_t offset)
{
int32_t ret;
void *ptr = NULL;
struct dma_buf *buf;
struct dma_buf_map map;
struct nvdla_mem_handle *handles;
struct nvdla_task *task = (struct nvdla_task *)task_data;
uint64_t dma_addr = 0;
dla_get_dma_address(driver_context, task_data,dst, (void *)&dma_addr, DESTINATION_DMA);
handles = task->address_list;
buf = dma_buf_get(handles[dst].handle);
if (IS_ERR(buf)) {
pr_err("%s: Failed get dma_buf for handle=%d\n", __func__,
handles[dst].handle);
return -EFAULT;
}
ret = dma_buf_begin_cpu_access(buf, DMA_BIDIRECTIONAL);
if (ret)
goto put_dma_buf;
ret = dma_buf_vmap(buf, &map);
ptr = ret ? NULL : map.vaddr;
if (!ptr) {
pr_err("%s: Failed to vmap dma_buf for handle=%d\n", __func__,
handles[dst].handle);
ret = -ENOMEM;
goto end_cpu_access;
}
memcpy((void *)((uint8_t *)ptr + offset), src, size);
dma_buf_vunmap(buf, &map);
end_cpu_access:
dma_buf_end_cpu_access(buf, DMA_BIDIRECTIONAL);
put_dma_buf:
dma_buf_put(buf);
return ret;
}
int32_t dla_data_read(void *driver_context, void *task_data,
uint64_t src, void *dst,
uint32_t size, uint64_t offset)
{
int32_t ret;
void *ptr = NULL;
struct dma_buf *buf;
struct dma_buf_map map;
struct nvdla_mem_handle *handles;
struct nvdla_task *task = (struct nvdla_task *)task_data;
uint64_t dma_addr = 0;
dla_get_dma_address(driver_context, task_data, src, (void *)&dma_addr, DESTINATION_DMA);
handles = task->address_list;
buf = dma_buf_get(handles[src].handle);
if (IS_ERR(buf)) {
pr_err("%s: Failed get dma_buf for handle=%d\n", __func__,
handles[src].handle);
return -EFAULT;
}
ret = dma_buf_begin_cpu_access(buf, DMA_BIDIRECTIONAL);
if (ret)
goto put_dma_buf;
ret = dma_buf_vmap(buf, &map);
ptr = ret ? NULL : map.vaddr;
if (!ptr) {
pr_err("%s: Failed to vmap dma_buf for handle=%d\n", __func__,
handles[src].handle);
ret = -ENOMEM;
goto end_cpu_access;
}
memcpy(dst, (void *)(((uint8_t *)ptr) + offset), size);
dma_buf_vunmap(buf, &map);
end_cpu_access:
dma_buf_end_cpu_access(buf, DMA_BIDIRECTIONAL);
put_dma_buf:
dma_buf_put(buf);
return ret;
}
/*
 * Execute one task on the engine and block until it completes.
 *
 * dla_execute_task() programs the hardware; completion events are
 * signalled by the ISR through event_notifier. Each wakeup re-runs
 * dla_process_events() under nvdla_lock (the same lock held by the
 * ISR) until the firmware reports task completion or an error, then
 * clears the engine task state.
 *
 * Returns 0 on success or the firmware error code.
 */
int32_t nvdla_task_submit(struct nvdla_device *nvdla_dev, struct nvdla_task *task)
{
	int32_t err = 0;
	uint32_t task_complete = 0;

	nvdla_dev->task = task;

	err = dla_execute_task(nvdla_dev->engine_context, (void *)task, nvdla_dev->config_data);
	if (err) {
		pr_err("Task execution failed\n");
		return err;
	}

	pr_debug("Wait for task complete\n");

	while (1) {
		unsigned long flags;

		wait_for_completion(&nvdla_dev->event_notifier);

		spin_lock_irqsave(&nvdla_dev->nvdla_lock, flags);
		err = dla_process_events(nvdla_dev->engine_context, &task_complete);
		spin_unlock_irqrestore(&nvdla_dev->nvdla_lock, flags);

		if (err || task_complete)
			break;
	}

	pr_debug("Task complete\n");

	dla_clear_task(nvdla_dev->engine_context);

	return err;
}
/* driver probe and init */
/*
 * Device-tree match table: each supported compatible string maps to a
 * static hardware configuration (atom size, optional BDMA/Rubik
 * engines, weight-compression support).
 */
static const struct of_device_id nvdla_of_match[] = {
	{
		.compatible = "nvidia,nvdla_os_initial",
		.data = &nvdla_config_os_initial,
	},
	{
		.compatible = "nvidia,nv_small",
		.data = &nvdla_config_small,
	},
	{
		.compatible = "nvidia,nv_large",
		.data = &nvdla_config_large,
	},
	{ },
};
/*
 * Platform probe: map the register window, hook the engine interrupt,
 * register the firmware engine context and expose the DRM device.
 *
 * Returns 0 on success and negative errno on failure; all resources
 * are device-managed (devm_*) and released automatically on error.
 */
static int32_t nvdla_probe(struct platform_device *pdev)
{
	int32_t err = 0;
	struct resource *res;
	struct nvdla_device *nvdla_dev;
	struct device *dev = &pdev->dev;
	const struct of_device_id *match;

	if (!pdev->dev.of_node)
		return -EINVAL;

	match = of_match_device(nvdla_of_match, &pdev->dev);
	if (!match) {
		dev_err(dev, "Missing DT entry!\n");
		return -EINVAL;
	}

	/* informational message: was incorrectly logged with pr_err() */
	dev_info(dev, "Probe NVDLA config %s\n", match->compatible);

	nvdla_dev = devm_kzalloc(dev, sizeof(*nvdla_dev), GFP_KERNEL);
	if (!nvdla_dev)
		return -ENOMEM;

	platform_set_drvdata(pdev, nvdla_dev);
	nvdla_dev->pdev = pdev;
	nvdla_dev->config_data = (struct nvdla_config *)match->data;

	init_completion(&nvdla_dev->event_notifier);

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	nvdla_dev->base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(nvdla_dev->base))
		return PTR_ERR(nvdla_dev->base);

	/*
	 * Use platform_get_irq(): interrupts described in the device
	 * tree are not guaranteed to be exposed as IORESOURCE_IRQ
	 * resources.
	 */
	nvdla_dev->irq = platform_get_irq(pdev, 0);
	if (nvdla_dev->irq < 0)
		return nvdla_dev->irq;

	err = devm_request_irq(&pdev->dev, nvdla_dev->irq,
				nvdla_engine_isr, 0,
				dev_name(&pdev->dev), nvdla_dev);
	if (err)
		return err;

	dla_register_driver(&nvdla_dev->engine_context, (void *)nvdla_dev);
	dla_clear_task(nvdla_dev->engine_context);

	err = nvdla_drm_probe(nvdla_dev);
	if (err)
		dev_err(&pdev->dev, "failed to register drm device\n");

	return err;
}
/*
 * Platform remove: tear down the DRM device. The IRQ, MMIO mapping and
 * device structure are devm-managed and released by the driver core.
 */
static int32_t __exit nvdla_remove(struct platform_device *pdev)
{
	struct nvdla_device *nvdla_dev = dev_get_drvdata(&pdev->dev);

	nvdla_drm_remove(nvdla_dev);

	return 0;
}
/* Platform-driver glue: probe/remove bound through the DT match table. */
static struct platform_driver nvdla_driver = {
	.probe = nvdla_probe,
	.remove = __exit_p(nvdla_remove),
	.driver = {
		.owner = THIS_MODULE,
		.name = "NVDLA",
		.of_match_table = of_match_ptr(nvdla_of_match),
	},
};
module_platform_driver(nvdla_driver);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("NVIDIA");
MODULE_DESCRIPTION("Nvidia Deep Learning Accelerator driver");

475
drivers/nvdla/nvdla_gem.c Normal file
View file

@ -0,0 +1,475 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <drm/drm_device.h>
#include <drm/drm_drv.h>
#include <drm/drm_gem.h>
#include <drm/drm_gem_cma_helper.h>
#include <linux/dma-buf.h>
#include <linux/dma-mapping.h>
#include <linux/dma-map-ops.h>
#include <linux/of.h>
#include <linux/of_address.h>
#include <nvdla_linux.h>
#include <nvdla_ioctl.h>
#include <opendla.h>
/* Upcast from the embedded drm_gem_object to the NVDLA wrapper. */
#define to_nvdla_obj(x) container_of(x, struct nvdla_gem_object, object)

/* NVDLA GEM object: a drm_gem_object backed by DMA-coherent memory. */
struct nvdla_gem_object {
	struct drm_gem_object object;	/* embedded base GEM object */
	void *kvaddr;			/* kernel virtual address of the backing memory */
	dma_addr_t dma_addr;		/* device (DMA/IOVA) address of the backing memory */
	unsigned long dma_attrs;	/* attrs used for dma_alloc_attrs()/dma_free_attrs() */
};
static int32_t nvdla_fill_task_desc(struct nvdla_ioctl_submit_task *local_task,
struct nvdla_task *task)
{
struct nvdla_mem_handle *handles;
/* update task desc fields */
task->num_addresses = local_task->num_addresses;
handles = kzalloc(local_task->num_addresses *
sizeof(struct nvdla_mem_handle), GFP_KERNEL);
if (handles == NULL)
return -EFAULT;
/* get user addresses list */
if (copy_from_user(handles,
(void __user *)local_task->address_list,
(task->num_addresses *
sizeof(struct nvdla_mem_handle)))) {
pr_err("failed to copy address list from user ptr\n");
kfree(handles);
return -EFAULT;
}
task->address_list = handles;
return 0;
}
static int32_t nvdla_submit(struct drm_device *drm, void *arg,
struct drm_file *file)
{
int32_t err = 0;
struct nvdla_task *task;
struct nvdla_ioctl_submit_task local_task;
struct nvdla_ioctl_submit_task __user *user_task;
struct nvdla_device *nvdla_dev = dev_get_drvdata(drm->dev);
struct nvdla_submit_args *args =
(struct nvdla_submit_args *)arg;
user_task = (struct nvdla_ioctl_submit_task __user *)
(uintptr_t)args->tasks;
if (!user_task)
return -EINVAL;
/* IOCTL copy descriptors */
if (copy_from_user(&local_task, (void __user *)user_task,
(sizeof(*user_task))))
return -EFAULT;
task = kzalloc(sizeof(*task), GFP_KERNEL);
if (task == NULL)
return -EFAULT;
nvdla_dev->task = task;
kref_init(&task->ref);
task->nvdla_dev = nvdla_dev;
task->file = file;
/* update task desc fields */
err = nvdla_fill_task_desc(&local_task, task);
if (err)
goto free_task_desc;
err = nvdla_task_submit(nvdla_dev, task);
kfree(task->address_list);
free_task_desc:
kfree(task);
return err;
}
static int32_t nvdla_gem_alloc(struct nvdla_gem_object *nobj)
{
struct drm_gem_object *dobj = &nobj->object;
struct drm_device *drm = dobj->dev;
nobj->dma_attrs = DMA_ATTR_WRITE_COMBINE;
nobj->kvaddr = dma_alloc_attrs(drm->dev, dobj->size, &nobj->dma_addr,
GFP_KERNEL, nobj->dma_attrs);
if (!nobj->kvaddr)
return -ENOMEM;
return 0;
}
/* Release the DMA memory backing a GEM object. */
static void nvdla_gem_free(struct nvdla_gem_object *nobj)
{
	struct drm_device *drm = nobj->object.dev;

	dma_free_attrs(drm->dev, nobj->object.size, nobj->kvaddr,
			nobj->dma_addr, nobj->dma_attrs);
}
/* GEM .free callback: drop the mmap offset, backing memory and wrapper. */
static void nvdla_gem_free_object(struct drm_gem_object *dobj)
{
	struct nvdla_gem_object *nobj = to_nvdla_obj(dobj);

	drm_gem_free_mmap_offset(dobj);
	nvdla_gem_free(nobj);
	kfree(nobj);
}
/*
 * PRIME .get_sg_table callback: build a scatter-gather table describing
 * the object's DMA memory for export to other drivers.
 */
static struct sg_table
*nvdla_drm_gem_prime_get_sg_table(struct drm_gem_object *dobj)
{
	struct nvdla_gem_object *nobj = to_nvdla_obj(dobj);
	struct drm_device *drm = dobj->dev;
	struct sg_table *sgt;
	int32_t err;

	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
	if (!sgt)
		return ERR_PTR(-ENOMEM);

	err = dma_get_sgtable_attrs(drm->dev, sgt, nobj->kvaddr,
					nobj->dma_addr, dobj->size,
					nobj->dma_attrs);
	if (!err)
		return sgt;

	DRM_ERROR("failed to allocate sgt, %d\n", err);
	kfree(sgt);
	return ERR_PTR(err);
}
/*
 * PRIME .vmap callback: expose the kernel mapping created at allocation
 * time. Fails when the buffer was allocated without a kernel mapping.
 */
static int nvdla_drm_gem_prime_vmap(struct drm_gem_object *obj, struct dma_buf_map *map)
{
	struct nvdla_gem_object *nobj = to_nvdla_obj(obj);

	if (nobj->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING)
		return -ENOMEM;

	dma_buf_map_set_vaddr(map, nobj->kvaddr);

	return 0;
}
/*
 * PRIME .vunmap callback: the kernel mapping lives for the whole object
 * lifetime (created in nvdla_gem_alloc), so there is nothing to undo.
 */
static void nvdla_drm_gem_prime_vunmap(struct drm_gem_object *obj, struct dma_buf_map *map)
{
	/* Nothing to do */
}
/* GEM object callbacks; mmap faulting reuses the CMA helper vm_ops. */
static const struct drm_gem_object_funcs nvdla_gem_funcs = {
	.free = nvdla_gem_free_object,
	.export = drm_gem_prime_export,
	.vmap = nvdla_drm_gem_prime_vmap,
	.vunmap = nvdla_drm_gem_prime_vunmap,
	.get_sg_table = nvdla_drm_gem_prime_get_sg_table,
	.vm_ops = &drm_gem_cma_vm_ops,
};
/*
 * Allocate an NVDLA GEM object of at least @size bytes (rounded up to
 * whole pages) and back it with DMA memory.
 *
 * Returns the new object or an ERR_PTR on failure.
 */
static struct nvdla_gem_object *
nvdla_gem_create_object(struct drm_device *drm, uint32_t size)
{
	struct nvdla_gem_object *nobj;
	int32_t err;

	nobj = kzalloc(sizeof(*nobj), GFP_KERNEL);
	if (!nobj)
		return ERR_PTR(-ENOMEM);

	nobj->object.funcs = &nvdla_gem_funcs;
	drm_gem_private_object_init(drm, &nobj->object,
					round_up(size, PAGE_SIZE));

	err = nvdla_gem_alloc(nobj);
	if (err) {
		kfree(nobj);
		return ERR_PTR(err);
	}

	return nobj;
}
/*
 * Create a GEM object and a userspace handle for it.
 *
 * On success the handle owns the only reference: drm_gem_object_put()
 * drops the creation reference, so the object lives exactly as long as
 * the handle. The returned pointer is therefore only valid while the
 * handle exists.
 */
static struct nvdla_gem_object *
nvdla_gem_create_with_handle(struct drm_file *file_priv,
				struct drm_device *drm, uint32_t size,
				uint32_t *handle)
{
	int32_t ret;
	struct drm_gem_object *dobj;
	struct nvdla_gem_object *nobj;

	nobj = nvdla_gem_create_object(drm, size);
	if (IS_ERR(nobj))
		return ERR_CAST(nobj);

	dobj = &nobj->object;

	ret = drm_gem_handle_create(file_priv, dobj, handle);
	if (ret)
		goto free_drm_object;

	/* drop the creation ref; the handle keeps the object alive */
	drm_gem_object_put(dobj);

	return nobj;

free_drm_object:
	nvdla_gem_free_object(dobj);

	return ERR_PTR(ret);
}
/* DRM_IOCTL_NVDLA_GEM_CREATE handler: allocate a buffer, return its handle. */
static int32_t nvdla_gem_create(struct drm_device *drm, void *data,
				struct drm_file *file)
{
	struct nvdla_gem_create_args *args = data;
	struct nvdla_gem_object *nobj =
		nvdla_gem_create_with_handle(file, drm, args->size,
						&args->handle);

	return IS_ERR(nobj) ? PTR_ERR(nobj) : 0;
}
/*
 * Map a GEM object's DMA memory into a prepared VMA.
 *
 * Clears VM_PFNMAP and resets vm_pgoff because dma_mmap_attrs() maps
 * from the start of the allocation, while the incoming vm_pgoff still
 * encodes the fake GEM mmap offset. On failure the VMA's GEM setup is
 * unwound with drm_gem_vm_close().
 */
static int32_t nvdla_drm_gem_object_mmap(struct drm_gem_object *dobj,
					struct vm_area_struct *vma)
{
	int32_t ret;
	struct nvdla_gem_object *nobj = to_nvdla_obj(dobj);
	struct drm_device *drm = dobj->dev;

	vma->vm_flags &= ~VM_PFNMAP;
	vma->vm_pgoff = 0;

	ret = dma_mmap_attrs(drm->dev, vma, nobj->kvaddr, nobj->dma_addr,
				dobj->size, nobj->dma_attrs);
	if (ret)
		drm_gem_vm_close(vma);

	return ret;
}
/*
 * PRIME mmap entry point: perform generic GEM mmap bookkeeping, then
 * install the DMA-backed mapping.
 */
static int32_t nvdla_drm_gem_mmap_buf(struct drm_gem_object *obj,
					struct vm_area_struct *vma)
{
	int32_t err = drm_gem_mmap_obj(obj, obj->size, vma);

	if (err)
		return err;

	return nvdla_drm_gem_object_mmap(obj, vma);
}
/*
 * File-ops mmap entry point: let the DRM core resolve the fake offset
 * to a GEM object, then install the DMA-backed mapping.
 */
static int32_t nvdla_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
{
	int32_t err = drm_gem_mmap(filp, vma);

	if (err)
		return err;

	/* drm_gem_mmap() stashed the GEM object in vm_private_data. */
	return nvdla_drm_gem_object_mmap(vma->vm_private_data, vma);
}
/*
 * Resolve a PRIME file descriptor to the buffer's DMA address.
 * Returns 0 and stores the address in *addr, or a negative errno.
 */
int32_t nvdla_gem_dma_addr(struct drm_device *dev, struct drm_file *file,
			uint32_t fd, dma_addr_t *addr)
{
	struct drm_gem_object *dobj;
	uint32_t handle;
	int32_t err;

	err = drm_gem_prime_fd_to_handle(dev, file, fd, &handle);
	if (err)
		return err;

	dobj = drm_gem_object_lookup(file, handle);
	if (dobj == NULL)
		return -EINVAL;

	*addr = to_nvdla_obj(dobj)->dma_addr;

	/* Drop the reference taken by the lookup. */
	drm_gem_object_put(dobj);

	return 0;
}
/*
 * DRM_IOCTL_NVDLA_GEM_MMAP handler: create (or reuse) the fake mmap
 * offset for a GEM handle and return it in args->offset so userspace
 * can mmap() the buffer through the DRM fd.
 *
 * Returns 0 on success or a negative errno.
 */
static int32_t nvdla_gem_map_offset(struct drm_device *drm, void *data,
				struct drm_file *file)
{
	int32_t ret;
	struct drm_gem_object *dobj;
	struct nvdla_gem_map_offset_args *args = data;

	dobj = drm_gem_object_lookup(file, args->handle);
	if (!dobj)
		return -EINVAL;

	ret = drm_gem_create_mmap_offset(dobj);
	if (ret)
		goto out;

	args->offset = drm_vma_node_offset_addr(&dobj->vma_node);

out:
	drm_gem_object_put(dobj);
	/*
	 * Bug fix: the original returned 0 unconditionally, silently
	 * swallowing failures from drm_gem_create_mmap_offset() and
	 * leaving args->offset uninitialized for the caller.
	 */
	return ret;
}
/*
 * DRM_IOCTL_NVDLA_GEM_DESTROY handler: drop the userspace handle,
 * which releases its reference on the underlying GEM object.
 */
static int32_t nvdla_gem_destroy(struct drm_device *drm, void *data,
				struct drm_file *file)
{
	struct nvdla_gem_destroy_args *args = data;

	return drm_gem_handle_delete(file, args->handle);
}
/*
 * File operations for the DRM device node.  Everything delegates to the
 * DRM core helpers except mmap, which routes through the DMA-backed GEM
 * mapping path above.
 */
static const struct file_operations nvdla_drm_fops = {
	.owner = THIS_MODULE,
	.open = drm_open,
	.release = drm_release,
	.unlocked_ioctl = drm_ioctl,
	.mmap = nvdla_drm_gem_mmap,
	.poll = drm_poll,
	.read = drm_read,
#ifdef CONFIG_COMPAT
	.compat_ioctl = drm_compat_ioctl,
#endif
	.llseek = noop_llseek,
};
/*
 * Driver-private ioctls, all allowed on render nodes: task submission
 * plus GEM buffer create / mmap-offset query / destroy.
 */
static const struct drm_ioctl_desc nvdla_drm_ioctls[] = {
	DRM_IOCTL_DEF_DRV(NVDLA_SUBMIT, nvdla_submit, DRM_RENDER_ALLOW),
	DRM_IOCTL_DEF_DRV(NVDLA_GEM_CREATE, nvdla_gem_create, DRM_RENDER_ALLOW),
	DRM_IOCTL_DEF_DRV(NVDLA_GEM_MMAP, nvdla_gem_map_offset, DRM_RENDER_ALLOW),
	DRM_IOCTL_DEF_DRV(NVDLA_GEM_DESTROY, nvdla_gem_destroy, DRM_RENDER_ALLOW),
};
/*
 * DRM driver description: a GEM-capable render node.  PRIME
 * import/export uses the core helpers; gem_prime_mmap is overridden so
 * imported buffers also map through the DMA-attrs path.
 */
static struct drm_driver nvdla_drm_driver = {
	.driver_features = DRIVER_GEM | DRIVER_RENDER,
	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
	.gem_prime_import = drm_gem_prime_import,
	.gem_prime_mmap = nvdla_drm_gem_mmap_buf,
	.ioctls = nvdla_drm_ioctls,
	.num_ioctls = ARRAY_SIZE(nvdla_drm_ioctls),
	.fops = &nvdla_drm_fops,
	.name = "nvdla",
	.desc = "NVDLA driver",
	.date = "20171017",
	.major = 0,
	.minor = 0,
	.patchlevel = 0,
};
/*
 * Create and register the DRM device for an NVDLA instance, then
 * declare the coherent DMA pool the accelerator allocates from: either
 * the "memory-region" DT node, or a hard-coded 1 GiB fallback window at
 * 0xC0000000 (board-specific; see TODO below).
 *
 * Returns 0 on success or a negative errno.
 *
 * Fixes over the original:
 *  - of_parse_phandle() takes a reference on the node; it is now
 *    released with of_node_put() instead of leaking.
 *  - a failure after drm_dev_register() now unregisters the device
 *    before dropping the reference; previously it stayed registered.
 */
int32_t nvdla_drm_probe(struct nvdla_device *nvdla_dev)
{
	int32_t err;
	struct drm_device *drm;
	struct drm_driver *driver = &nvdla_drm_driver;
	struct resource res_cma;
	struct device_node *node;

	drm = drm_dev_alloc(driver, &nvdla_dev->pdev->dev);
	if (IS_ERR(drm))
		return PTR_ERR(drm);

	nvdla_dev->drm = drm;

	err = drm_dev_register(drm, 0);
	if (err < 0)
		goto unref;

	/**
	 * TODO Register separate driver for memory and use DT node to
	 * read memory range
	 */
	node = of_parse_phandle(drm->dev->of_node, "memory-region", 0);
	if (node) {
		dev_info(drm->dev, "Get mem from memory-region\n");
		of_address_to_resource(node, 0, &res_cma);
		of_node_put(node);
		err = dma_declare_coherent_memory(drm->dev, res_cma.start,
						res_cma.start,
						resource_size(&res_cma));
	} else {
		dev_info(drm->dev, "NVDLA using the default mem.\n");
		err = dma_declare_coherent_memory(drm->dev, 0xC0000000,
						0xC0000000, 0x40000000);
	}
	if (err < 0)
		goto unregister;

	return 0;

unregister:
	drm_dev_unregister(drm);
unref:
	drm_dev_put(drm);
	return err;
}
/*
 * Tear down the DRM device created by nvdla_drm_probe(): unregister it
 * from userspace first, then drop the final reference.
 *
 * NOTE(review): the coherent memory declared in probe is not released
 * here — confirm whether dma_release_coherent_memory() is handled
 * elsewhere or is needed.
 */
void nvdla_drm_remove(struct nvdla_device *nvdla_dev)
{
	drm_dev_unregister(nvdla_dev->drm);
	drm_dev_put(nvdla_dev->drm);
}

528
drivers/nvdla/pdp.c Normal file
View file

@ -0,0 +1,528 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "common.h"
#include "dla_engine_internal.h"
#include "engine_debug.h"
/* Upper bound on the width-wise split count a PDP op may request
 * (checked in vaildate_pdp_configs()). */
#define MAX_SPLIT_NUM 64
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a[0])))
#endif
/* Descriptor memory type index -> PDP-RDMA source RAM selector (MC/CV). */
static const uint8_t map_ram[] = {
	FIELD_ENUM(PDP_RDMA_D_SRC_RAM_CFG_0, SRC_RAM_TYPE, MC),
	FIELD_ENUM(PDP_RDMA_D_SRC_RAM_CFG_0, SRC_RAM_TYPE, CV),
};
/* pdp_op->pool_mode index -> hardware pooling-method field
 * (average, max, min — in that order). */
static const uint8_t map_pool[] = {
	FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0,
			POOLING_METHOD, POOLING_METHOD_AVERAGE),
	FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0,
			POOLING_METHOD, POOLING_METHOD_MAX),
	FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0,
			POOLING_METHOD, POOLING_METHOD_MIN),
};
/* pdp_op->precision index -> input data format field (INT8/INT16/FP16). */
static const uint8_t map_precision[] = {
	FIELD_ENUM(PDP_D_DATA_FORMAT_0, INPUT_DATA, INT8),
	FIELD_ENUM(PDP_D_DATA_FORMAT_0, INPUT_DATA, INT16),
	FIELD_ENUM(PDP_D_DATA_FORMAT_0, INPUT_DATA, FP16),
};
/* Zero-based kernel size index -> KERNEL_WIDTH field value (1..8).
 * The same table is reused for kernel height. */
static const uint8_t map_pool_kernel[] = {
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_1),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_2),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_3),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_4),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_5),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_6),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_7),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_8),
};
/* The reciprocal of kernel width: 1/1, 1/2, 1/3, ... */
/* Row 0 is used for INT8/INT16, row 1 for FP16 (selected in
 * processor_pdp_program()).  Row 1 values appear to be an FP16-style
 * hardware encoding of the same reciprocals — confirm against HW spec. */
static const uint32_t recip_kernel_size[2][8] = {
	/*
	 * INT8/16
	 * 1      1/2     1/3     1/4     1/5     1/6     1/7     1/8
	 */
	{0x10000, 0x8000, 0x5555, 0x4000, 0x3333, 0x2aaa, 0x2492, 0x2000},
	{0x7c00, 0x7800, 0x7555, 0x7400, 0x7266, 0x7155, 0x7092, 0x7000},
};
#if STAT_ENABLE
/*
 * Capture end-of-operation performance data for PDP: write-stall cycle
 * count from the perf register and wall-clock runtime measured from
 * group->start_time (set when the op was enabled).
 */
void
dla_pdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group)
{
	uint64_t end_time = 0;
	struct dla_pdp_stat_desc *pdp_stat;
	pdp_stat = &processor->stat_data_desc->pdp_stat;
	end_time = dla_get_time_us();
	pdp_stat->write_stall = pdp_reg_read(D_PERF_WRITE_STALL);
	pdp_stat->runtime = (uint32_t)(end_time - group->start_time);
}
/* Emit the statistics gathered by dla_pdp_stat_data() to the debug log. */
void
dla_pdp_dump_stat(struct dla_processor *processor)
{
	struct dla_pdp_stat_desc *pdp_stat;
	pdp_stat = &processor->stat_data_desc->pdp_stat;
	dla_debug_pdp_stats(pdp_stat);
}
#endif /* STAT_ENABLE */
/*
 * Translate a memory type into the PDP flying-mode field: hardware
 * (on-the-fly from SDP) input uses ON_FLYING, memory input OFF_FLYING.
 */
static uint32_t
get_fly_mode(uint8_t type)
{
	if (type == DLA_MEM_HW)
		return FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0,
					FLYING_MODE, ON_FLYING);

	return FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0,
				FLYING_MODE, OFF_FLYING);
}
/*
 * Select which ping-pong register group subsequent PDP (and PDP-RDMA)
 * programming writes target.
 */
void
dla_pdp_set_producer(int32_t group_id, int32_t rdma_group_id)
{
	dla_trace("Enter: %s", __func__);
	dla_debug("group id %d rdma id %d\n", group_id, rdma_group_id);

	pdp_reg_write(S_POINTER,
			group_id << SHIFT(PDP_S_POINTER_0, PRODUCER));
	pdp_rdma_reg_write(S_POINTER,
			rdma_group_id << SHIFT(PDP_RDMA_S_POINTER_0, PRODUCER));

	dla_trace("Exit: %s", __func__);
}
int
dla_pdp_enable(struct dla_processor_group *group)
{
int32_t ret = 0;
uint32_t reg;
struct dla_engine *engine = dla_get_engine();
dla_trace("Enter: %s", __func__);
if (!group) {
ret = ERR(INVALID_INPUT);
goto exit;
}
if (engine->stat_enable == (uint32_t)1) {
reg = FIELD_ENUM(PDP_D_PERF_ENABLE_0, DMA_EN, ENABLE);
pdp_reg_write(D_PERF_ENABLE, reg);
group->start_time = dla_get_time_us();
}
dla_debug("rdma needed %u\n", group->is_rdma_needed);
/**
* enable all sub-modules
*/
if (group->is_rdma_needed) {
reg = FIELD_ENUM(PDP_RDMA_D_OP_ENABLE_0, OP_EN, ENABLE);
pdp_rdma_reg_write(D_OP_ENABLE, reg);
}
reg = FIELD_ENUM(PDP_D_OP_ENABLE_0, OP_EN, ENABLE);
pdp_reg_write(D_OP_ENABLE, reg);
exit:
dla_trace("Exit: %s", __func__);
RETURN(ret);
}
/*
 * Decide whether this PDP op needs the RDMA sub-unit: required exactly
 * when the input does not arrive on-the-fly from hardware.
 */
void
dla_pdp_rdma_check(struct dla_processor_group *group)
{
	struct dla_pdp_surface_desc *surf =
			&group->surface_desc->pdp_surface;

	group->is_rdma_needed =
			(surf->src_data.type != DLA_MEM_HW) ? 1 : 0;
}
/*
 * Check that both pooling strides fall in the hardware-supported
 * range 1..8; returns 0 or INVALID_INPUT.
 */
static int
validate_strides(uint8_t stride_x, uint8_t stride_y)
{
	if (stride_x >= 1 && stride_x <= 8 &&
			stride_y >= 1 && stride_y <= 8)
		RETURN(0);

	dla_error("Invalid Stride (x[%d], y[%d])\n", stride_x, stride_y);
	RETURN(ERR(INVALID_INPUT));
}
/*
 * Validate a PDP operation descriptor before any register is written:
 * destination type, data cubes, precision, strides, split count, and
 * the pool-kernel/pool-mode table indices.
 *
 * (The historical "vaildate" misspelling is kept: the function is
 * static and referenced by processor_pdp_program() below.)
 */
static int
vaildate_pdp_configs(struct dla_processor_group *group)
{
	int32_t ret = 0;
	struct dla_pdp_op_desc *pdp_op;
	struct dla_pdp_surface_desc *pdp_surface;
	dla_trace("Enter: %s", __func__);
	pdp_op = &group->operation_desc->pdp_op;
	pdp_surface = &group->surface_desc->pdp_surface;
	/* PDP cannot write on-the-fly; output must land in MC or CV memory. */
	if (pdp_surface->dst_data.type == DLA_MEM_HW) {
		dla_error("Destination buffer for PDP has to be either MC or CV");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	ret = validate_data_cube(pdp_surface->src_data, pdp_surface->dst_data,
				DLA_MEM_HW);
	if (ret)
		goto exit;
	ret = validate_precision(pdp_op->precision, ARRAY_SIZE(map_precision));
	if (ret)
		goto exit;
	ret = validate_strides(pdp_op->stride_x, pdp_op->stride_y);
	if (ret)
		goto exit;
	if (pdp_op->split_num > MAX_SPLIT_NUM) {
		dla_error("Invalid split_num: %u\n", pdp_op->split_num);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	/* pool_width/height/mode index the map_* tables; bound-check them
	 * so the programming step cannot read out of bounds. */
	if (pdp_op->pool_width >= ARRAY_SIZE(map_pool_kernel)) {
		dla_error("Invalid pool_width: %u\n", pdp_op->pool_width);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	if (pdp_op->pool_height >= ARRAY_SIZE(map_pool_kernel)) {
		dla_error("Invalid pool_height: %u\n", pdp_op->pool_height);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	if (pdp_op->pool_mode >= ARRAY_SIZE(map_pool)) {
		dla_error("Invalid pool_mode: %u\n", pdp_op->pool_mode);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Program every PDP (and, for memory-sourced input, PDP-RDMA) register
 * for one pooling operation: validate the descriptors, resolve source
 * and destination DMA addresses, then write cube geometry, operation
 * mode, partial widths, kernel/stride/padding configuration and
 * input/output addressing.  The enable step is separate
 * (dla_pdp_enable()).
 */
static int
processor_pdp_program(struct dla_processor_group *group)
{
	int32_t ret = 0;
	uint32_t reg, high, low;
	uint64_t input_address = 0;
	uint64_t output_address = 0;
	struct dla_engine *engine = dla_get_engine();
	struct dla_pdp_op_desc *pdp_op;
	struct dla_pdp_surface_desc *pdp_surface;
	dla_trace("Enter: %s", __func__);
	pdp_op = &group->operation_desc->pdp_op;
	pdp_surface = &group->surface_desc->pdp_surface;
	/* Reject bad descriptors before touching any registers. */
	ret = vaildate_pdp_configs(group);
	if (ret)
		goto exit;
	ret = dla_read_input_address(&pdp_surface->src_data,
					&input_address,
					group->op_desc->index,
					group->roi_index,
					1);
	if (ret)
		goto exit;
	/* address == -1 marks "no buffer"; otherwise resolve the
	 * handle+offset pair to a DMA address. */
	if (pdp_surface->dst_data.address != -1)
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					pdp_surface->dst_data.address,
					pdp_surface->dst_data.offset,
					(void *)&output_address,
					DESTINATION_DMA);
	if (pdp_surface->src_data.type != DLA_MEM_HW) {
		/* PDP RDMA: input is fetched from MC/CV memory, so the
		 * read-DMA unit needs the full input geometry too. */
		pdp_rdma_reg_write(D_DATA_CUBE_IN_WIDTH,
				pdp_surface->src_data.width - 1);
		pdp_rdma_reg_write(D_DATA_CUBE_IN_HEIGHT,
				pdp_surface->src_data.height - 1);
		pdp_rdma_reg_write(D_DATA_CUBE_IN_CHANNEL,
				pdp_surface->src_data.channel - 1);
		high = HIGH32BITS(input_address);
		low = LOW32BITS(input_address);
		pdp_rdma_reg_write(D_SRC_BASE_ADDR_HIGH, high);
		pdp_rdma_reg_write(D_SRC_BASE_ADDR_LOW, low);
		pdp_rdma_reg_write(D_SRC_LINE_STRIDE,
				pdp_surface->src_data.line_stride);
		pdp_rdma_reg_write(D_SRC_SURFACE_STRIDE,
				pdp_surface->src_data.surf_stride);
		reg = (map_precision[pdp_op->precision]
			<< SHIFT(PDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA));
		pdp_rdma_reg_write(D_DATA_FORMAT, reg);
		reg = map_ram[pdp_surface->src_data.type]
			<< SHIFT(PDP_RDMA_D_SRC_RAM_CFG_0, SRC_RAM_TYPE);
		pdp_rdma_reg_write(D_SRC_RAM_CFG, reg);
		reg = ((pdp_op->split_num - 1)
			<< SHIFT(PDP_RDMA_D_OPERATION_MODE_CFG_0, SPLIT_NUM));
		pdp_rdma_reg_write(D_OPERATION_MODE_CFG, reg);
		reg = (map_pool_kernel[pdp_op->pool_width]
			<< SHIFT(PDP_RDMA_D_POOLING_KERNEL_CFG_0,
						KERNEL_WIDTH)) |
			((pdp_op->stride_x - 1)
			<< SHIFT(PDP_RDMA_D_POOLING_KERNEL_CFG_0,
						KERNEL_STRIDE_WIDTH));
		pdp_rdma_reg_write(D_POOLING_KERNEL_CFG, reg);
		reg = (pdp_op->pad_left
			<< SHIFT(PDP_RDMA_D_POOLING_PADDING_CFG_0, PAD_WIDTH));
		pdp_rdma_reg_write(D_POOLING_PADDING_CFG, reg);
		/* Partial widths are programmed as value-1; 0 stays 0 to
		 * avoid underflow when a segment is absent. */
		reg = ((pdp_op->partial_in_width_first == 0 ? 0 :
				pdp_op->partial_in_width_first - 1)
			<< SHIFT(PDP_RDMA_D_PARTIAL_WIDTH_IN_0,
					PARTIAL_WIDTH_IN_FIRST)) |
			((pdp_op->partial_in_width_mid == 0 ? 0 :
				pdp_op->partial_in_width_mid - 1)
			<< SHIFT(PDP_RDMA_D_PARTIAL_WIDTH_IN_0,
					PARTIAL_WIDTH_IN_MID)) |
			((pdp_op->partial_in_width_last == 0 ? 0 :
				pdp_op->partial_in_width_last - 1)
			<< SHIFT(PDP_RDMA_D_PARTIAL_WIDTH_IN_0,
					PARTIAL_WIDTH_IN_LAST));
		pdp_rdma_reg_write(D_PARTIAL_WIDTH_IN, reg);
	} else {
		/* On-the-fly input cannot be split. */
		ASSERT_GOTO(pdp_op->split_num == 1, ret,
				ERR(INVALID_INPUT), exit);
	}
	/* Input / output cube geometry (all fields are value-1). */
	reg = ((pdp_surface->src_data.width - 1)
		<< SHIFT(PDP_D_DATA_CUBE_IN_WIDTH_0, CUBE_IN_WIDTH));
	pdp_reg_write(D_DATA_CUBE_IN_WIDTH, reg);
	reg = ((pdp_surface->src_data.height - 1)
		<< SHIFT(PDP_D_DATA_CUBE_IN_HEIGHT_0, CUBE_IN_HEIGHT));
	pdp_reg_write(D_DATA_CUBE_IN_HEIGHT, reg);
	reg = ((pdp_surface->src_data.channel - 1)
		<< SHIFT(PDP_D_DATA_CUBE_IN_CHANNEL_0, CUBE_IN_CHANNEL));
	pdp_reg_write(D_DATA_CUBE_IN_CHANNEL, reg);
	reg = ((pdp_surface->dst_data.width - 1)
		<< SHIFT(PDP_D_DATA_CUBE_OUT_WIDTH_0, CUBE_OUT_WIDTH));
	pdp_reg_write(D_DATA_CUBE_OUT_WIDTH, reg);
	reg = ((pdp_surface->dst_data.height - 1)
		<< SHIFT(PDP_D_DATA_CUBE_OUT_HEIGHT_0, CUBE_OUT_HEIGHT));
	pdp_reg_write(D_DATA_CUBE_OUT_HEIGHT, reg);
	reg = ((pdp_surface->dst_data.channel - 1)
		<< SHIFT(PDP_D_DATA_CUBE_OUT_CHANNEL_0, CUBE_OUT_CHANNEL));
	pdp_reg_write(D_DATA_CUBE_OUT_CHANNEL, reg);
	/* Pooling method, flying mode and split count. */
	reg = (map_pool[pdp_op->pool_mode]
		<< SHIFT(PDP_D_OPERATION_MODE_CFG_0, POOLING_METHOD)) |
		(get_fly_mode(pdp_surface->src_data.type)
		<< SHIFT(PDP_D_OPERATION_MODE_CFG_0, FLYING_MODE)) |
		((pdp_op->split_num - 1)
		<< SHIFT(PDP_D_OPERATION_MODE_CFG_0, SPLIT_NUM));
	pdp_reg_write(D_OPERATION_MODE_CFG, reg);
	reg = ((pdp_op->partial_in_width_first == 0 ? 0 :
			pdp_op->partial_in_width_first-1)
		<< SHIFT(PDP_D_PARTIAL_WIDTH_IN_0, PARTIAL_WIDTH_IN_FIRST)) |
		((pdp_op->partial_in_width_mid == 0 ? 0 :
			pdp_op->partial_in_width_mid-1)
		<< SHIFT(PDP_D_PARTIAL_WIDTH_IN_0, PARTIAL_WIDTH_IN_MID)) |
		((pdp_op->partial_in_width_last == 0 ? 0 :
			pdp_op->partial_in_width_last-1)
		<< SHIFT(PDP_D_PARTIAL_WIDTH_IN_0, PARTIAL_WIDTH_IN_LAST));
	pdp_reg_write(D_PARTIAL_WIDTH_IN, reg);
	reg = ((pdp_op->partial_width_first == 0 ? 0 :
			pdp_op->partial_width_first-1)
		<< SHIFT(PDP_D_PARTIAL_WIDTH_OUT_0, PARTIAL_WIDTH_OUT_FIRST)) |
		((pdp_op->partial_width_mid == 0 ? 0 :
			pdp_op->partial_width_mid-1)
		<< SHIFT(PDP_D_PARTIAL_WIDTH_OUT_0, PARTIAL_WIDTH_OUT_MID)) |
		((pdp_op->partial_width_last == 0 ? 0 :
			pdp_op->partial_width_last-1)
		<< SHIFT(PDP_D_PARTIAL_WIDTH_OUT_0, PARTIAL_WIDTH_OUT_LAST));
	pdp_reg_write(D_PARTIAL_WIDTH_OUT, reg);
	reg = (map_pool_kernel[pdp_op->pool_width]
		<< SHIFT(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH)) |
		(map_pool_kernel[pdp_op->pool_height]
		<< SHIFT(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_HEIGHT))|
		((pdp_op->stride_x - 1)
		<< SHIFT(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_STRIDE_WIDTH)) |
		((pdp_op->stride_y - 1)
		<< SHIFT(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_STRIDE_HEIGHT));
	pdp_reg_write(D_POOLING_KERNEL_CFG, reg);
	/* Reciprocals feed average pooling; row selected by precision. */
	pdp_reg_write(D_RECIP_KERNEL_WIDTH,
			recip_kernel_size[pdp_op->precision ==
					PRECISION_FP16][pdp_op->pool_width]);
	pdp_reg_write(D_RECIP_KERNEL_HEIGHT,
			recip_kernel_size[pdp_op->precision ==
					PRECISION_FP16][pdp_op->pool_height]);
	reg = (pdp_op->pad_left
		<< SHIFT(PDP_D_POOLING_PADDING_CFG_0, PAD_LEFT)) |
		(pdp_op->pad_right
		<< SHIFT(PDP_D_POOLING_PADDING_CFG_0, PAD_RIGHT)) |
		(pdp_op->pad_top
		<< SHIFT(PDP_D_POOLING_PADDING_CFG_0, PAD_TOP)) |
		(pdp_op->pad_bottom
		<< SHIFT(PDP_D_POOLING_PADDING_CFG_0, PAD_BOTTOM));
	/* FP16 requires all padding values to be zero. */
	if (pdp_op->precision == PRECISION_FP16) {
		int32_t i;
		for (i = 0; i < 7; i++)
			ASSERT_GOTO(pdp_op->padding_value[i] == 0, ret,
					ERR(INVALID_INPUT), exit);
	}
	pdp_reg_write(D_POOLING_PADDING_CFG, reg);
	pdp_reg_write(D_POOLING_PADDING_VALUE_1_CFG, pdp_op->padding_value[0]);
	pdp_reg_write(D_POOLING_PADDING_VALUE_2_CFG, pdp_op->padding_value[1]);
	pdp_reg_write(D_POOLING_PADDING_VALUE_3_CFG, pdp_op->padding_value[2]);
	pdp_reg_write(D_POOLING_PADDING_VALUE_4_CFG, pdp_op->padding_value[3]);
	pdp_reg_write(D_POOLING_PADDING_VALUE_5_CFG, pdp_op->padding_value[4]);
	pdp_reg_write(D_POOLING_PADDING_VALUE_6_CFG, pdp_op->padding_value[5]);
	pdp_reg_write(D_POOLING_PADDING_VALUE_7_CFG, pdp_op->padding_value[6]);
	if (pdp_surface->src_data.type != DLA_MEM_HW) {
		pdp_reg_write(D_SRC_LINE_STRIDE,
				pdp_surface->src_data.line_stride);
		pdp_reg_write(D_SRC_SURFACE_STRIDE,
				pdp_surface->src_data.surf_stride);
	}
	/* Destination addressing and output data format. */
	high = HIGH32BITS(output_address);
	low = LOW32BITS(output_address);
	pdp_reg_write(D_DST_BASE_ADDR_LOW, low);
	pdp_reg_write(D_DST_BASE_ADDR_HIGH, high);
	pdp_reg_write(D_DST_LINE_STRIDE, pdp_surface->dst_data.line_stride);
	pdp_reg_write(D_DST_SURFACE_STRIDE, pdp_surface->dst_data.surf_stride);
	reg = (map_ram[pdp_surface->dst_data.type]
		<< SHIFT(PDP_D_DST_RAM_CFG_0, DST_RAM_TYPE));
	pdp_reg_write(D_DST_RAM_CFG, reg);
	reg = (map_precision[pdp_op->precision]
		<< SHIFT(PDP_D_DATA_FORMAT_0, INPUT_DATA));
	pdp_reg_write(D_DATA_FORMAT, reg);
exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Readiness hook for the scheduler: PDP imposes no additional
 * programming preconditions, so always report ready.
 */
int
dla_pdp_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group)
{
	return 1;
}
/*
 * Log the surface and operation descriptors of a PDP group for
 * debugging.
 */
void
dla_pdp_dump_config(struct dla_processor_group *group)
{
	struct dla_pdp_op_desc *op = &group->operation_desc->pdp_op;
	struct dla_pdp_surface_desc *surf =
			&group->surface_desc->pdp_surface;

	dla_debug_pdp_surface_desc(surf, group->roi_index);
	dla_debug_pdp_op_desc(op, group->roi_index);
}
int
dla_pdp_program(struct dla_processor_group *group)
{
int32_t ret;
dla_trace("Enter: %s", __func__);
if (!group) {
ret = ERR(INVALID_INPUT);
goto exit;
}
dla_enable_intr(MASK(GLB_S_INTR_MASK_0, PDP_DONE_MASK1) |
MASK(GLB_S_INTR_MASK_0, PDP_DONE_MASK0));
ret = processor_pdp_program(group);
if (ret)
goto exit;
exit:
dla_trace("Exit: %s", __func__);
RETURN(ret);
}

292
drivers/nvdla/rubik.c Normal file
View file

@ -0,0 +1,292 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "common.h"
#include "dla_engine_internal.h"
#include "engine_debug.h"
/* rubik_op->mode index -> RUBIK_MODE field (contract/split/merge).
 * NOTE(review): these tables could be const, as in pdp.c/sdp.c. */
static uint8_t map_rubik_mode[] = {
	FIELD_ENUM(RBK_D_MISC_CFG_0, RUBIK_MODE, CONTRACT),
	FIELD_ENUM(RBK_D_MISC_CFG_0, RUBIK_MODE, SPLIT),
	FIELD_ENUM(RBK_D_MISC_CFG_0, RUBIK_MODE, MERGE),
};
/* Descriptor memory type index -> RUBIK RAM interface selector. */
static uint8_t map_ram_type[] = {
	FIELD_ENUM(RBK_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE, MCIF),
	FIELD_ENUM(RBK_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE, CVIF),
};
/* rubik_op->precision index -> input precision field (INT8/INT16/FP16). */
static uint8_t map_precision[] = {
	FIELD_ENUM(RBK_D_MISC_CFG_0, IN_PRECISION, INT8),
	FIELD_ENUM(RBK_D_MISC_CFG_0, IN_PRECISION, INT16),
	FIELD_ENUM(RBK_D_MISC_CFG_0, IN_PRECISION, FP16),
};
/* rubik_op->precision index -> bytes-per-element, used for the
 * contract-stride computation in processor_rubik_program(). */
static uint8_t map_bpe[] = {
	BPE_PRECISION_INT8,
	BPE_PRECISION_INT16,
	BPE_PRECISION_FP16,
};
#if STAT_ENABLE
/*
 * Capture end-of-operation performance data for RUBIK: read/write
 * stall cycle counters and wall-clock runtime measured from
 * group->start_time (set in dla_rubik_enable()).
 */
void
dla_rubik_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group)
{
	uint64_t end_time = 0;
	struct dla_rubik_stat_desc *rubik_stat;
	rubik_stat = &processor->stat_data_desc->rubik_stat;
	end_time = dla_get_time_us();
	rubik_stat->read_stall = rubik_reg_read(D_PERF_READ_STALL);
	rubik_stat->write_stall = rubik_reg_read(D_PERF_WRITE_STALL);
	rubik_stat->runtime = (uint32_t)(end_time - group->start_time);
}
/* Emit the statistics gathered by dla_rubik_stat_data() to the debug log. */
void
dla_rubik_dump_stat(struct dla_processor *processor)
{
	struct dla_rubik_stat_desc *rubik_stat;
	rubik_stat = &processor->stat_data_desc->rubik_stat;
	dla_debug_rubik_stats(rubik_stat);
}
#endif /* STAT_ENABLE */
/*
 * Select which ping-pong register group subsequent RUBIK programming
 * writes target.  RUBIK has no RDMA sub-module, so the second argument
 * is unused; it exists only to match the common set_producer signature.
 *
 * Fix: the unused parameter was named "__unused" — identifiers with a
 * leading double underscore are reserved for the implementation in C,
 * so it is renamed (callers are unaffected by a parameter rename).
 */
void
dla_rubik_set_producer(int32_t group_id, int32_t rdma_group_id_unused)
{
	uint32_t reg;

	/**
	 * set producer pointer for all sub-modules
	 */
	reg = group_id << SHIFT(RBK_S_POINTER_0, PRODUCER);
	rubik_reg_write(S_POINTER, reg);
}
/*
 * Kick off a programmed RUBIK operation: optionally arm the
 * performance counters, then enable the RUBIK core.  Always returns 0.
 */
int
dla_rubik_enable(struct dla_processor_group *group)
{
	struct dla_engine *engine = dla_get_engine();

	dla_trace("Enter: %s", __func__);

	if (engine->stat_enable == (uint32_t)1) {
		rubik_reg_write(D_PERF_ENABLE, 1);
		group->start_time = dla_get_time_us();
	}

	/* RUBIK has a single sub-module; enable it. */
	rubik_reg_write(D_OP_ENABLE,
			FIELD_ENUM(RBK_D_OP_ENABLE_0, OP_EN, ENABLE));

	dla_trace("Exit: %s", __func__);
	RETURN(0);
}
/*
 * RDMA-needed hook for the scheduler: RUBIK never uses the shared
 * RDMA path, so the flag is always cleared.
 */
void
dla_rubik_rdma_check(struct dla_processor_group *group)
{
	group->is_rdma_needed = 0;
}
/*
 * Program every RUBIK register for one operation: validate the memory
 * types, resolve source/destination DMA addresses, then write mode,
 * precision, input/output geometry, strides and addressing.  MERGE
 * mode uses a planar input stride, CONTRACT mode additionally needs
 * the contract/deconvolution stride registers, and SPLIT mode uses a
 * planar output stride.
 */
static int32_t
processor_rubik_program(struct dla_processor_group *group)
{
	int32_t ret = 0;
	uint32_t reg, high, low;
	uint64_t input_address = 0;
	uint64_t output_address = 0;
	struct dla_engine *engine = dla_get_engine();
	struct dla_rubik_op_desc *rubik_op;
	struct dla_rubik_surface_desc *rubik_surface;
	dla_trace("Enter: %s", __func__);
	rubik_op = &group->operation_desc->rubik_op;
	rubik_surface = &group->surface_desc->rubik_surface;
	/* Argument check */
	/* RUBIK cannot read or write on-the-fly; both ends must be memory. */
	ASSERT_GOTO((rubik_surface->src_data.type != DLA_MEM_HW),
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO((rubik_surface->dst_data.type != DLA_MEM_HW),
			ret, ERR(INVALID_INPUT), exit);
	/* get the addresses from task descriptor */
	ret = dla_read_input_address(&rubik_surface->src_data,
					&input_address,
					group->op_desc->index,
					group->roi_index,
					1);
	if (ret)
		goto exit;
	/* NOTE(review): return value ignored here, and dst_data.address is
	 * not checked against -1 as pdp.c does — confirm intent. */
	dla_get_dma_cube_address(engine->driver_context,
				engine->task->task_data,
				rubik_surface->dst_data.address,
				rubik_surface->dst_data.offset,
				(void *)&output_address,
				DESTINATION_DMA);
	/* config rubik */
	reg = (((uint32_t)map_rubik_mode[rubik_op->mode]) <<
			SHIFT(RBK_D_MISC_CFG_0, RUBIK_MODE)) |
		(((uint32_t)map_precision[rubik_op->precision]) <<
			SHIFT(RBK_D_MISC_CFG_0, IN_PRECISION));
	rubik_reg_write(D_MISC_CFG, reg);
	reg = (((uint32_t)map_ram_type[rubik_surface->src_data.type]) <<
			SHIFT(RBK_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE));
	rubik_reg_write(D_DAIN_RAM_TYPE, reg);
	/* Input geometry (fields are value-1). */
	reg = ((rubik_surface->src_data.width-1) <<
			SHIFT(RBK_D_DATAIN_SIZE_0_0, DATAIN_WIDTH)) |
		((rubik_surface->src_data.height-1) <<
			SHIFT(RBK_D_DATAIN_SIZE_0_0, DATAIN_HEIGHT));
	rubik_reg_write(D_DATAIN_SIZE_0, reg);
	reg = ((rubik_surface->src_data.channel-1) <<
			SHIFT(RBK_D_DATAIN_SIZE_1_0, DATAIN_CHANNEL));
	rubik_reg_write(D_DATAIN_SIZE_1, reg);
	high = HIGH32BITS(input_address);
	low = LOW32BITS(input_address);
	rubik_reg_write(D_DAIN_ADDR_LOW, low);
	rubik_reg_write(D_DAIN_ADDR_HIGH, high);
	if (rubik_op->mode == RUBIK_MODE_MERGE) {
		/* MERGE reads planar input; the plane stride must be a
		 * non-zero multiple of 32 bytes. */
		ASSERT_GOTO((rubik_surface->src_data.plane_stride != 0),
				ret, ERR(INVALID_INPUT), exit);
		ASSERT_GOTO(((rubik_surface->src_data.plane_stride&0x1F) == 0),
				ret, ERR(INVALID_INPUT), exit);
		rubik_reg_write(D_DAIN_PLANAR_STRIDE,
				rubik_surface->src_data.plane_stride);
	} else {
		rubik_reg_write(D_DAIN_SURF_STRIDE,
				rubik_surface->src_data.surf_stride);
	}
	rubik_reg_write(D_DAIN_LINE_STRIDE,
			rubik_surface->src_data.line_stride);
	reg = (((uint32_t)map_ram_type[rubik_surface->dst_data.type]) <<
			SHIFT(RBK_D_DAOUT_RAM_TYPE_0, DATAOUT_RAM_TYPE));
	rubik_reg_write(D_DAOUT_RAM_TYPE, reg);
	reg = ((rubik_surface->dst_data.channel-1) <<
			SHIFT(RBK_D_DATAOUT_SIZE_1_0, DATAOUT_CHANNEL));
	rubik_reg_write(D_DATAOUT_SIZE_1, reg);
	high = HIGH32BITS(output_address);
	low = LOW32BITS(output_address);
	rubik_reg_write(D_DAOUT_ADDR_LOW, low);
	rubik_reg_write(D_DAOUT_ADDR_HIGH, high);
	rubik_reg_write(D_DAOUT_LINE_STRIDE,
			rubik_surface->dst_data.line_stride);
	if (rubik_op->mode != RUBIK_MODE_SPLIT) {
		rubik_reg_write(D_DAOUT_SURF_STRIDE,
				rubik_surface->dst_data.surf_stride);
		if (rubik_op->mode == RUBIK_MODE_CONTRACT) {
			/* Surfaces per output atom: ceil(channels*bpe/32),
			 * times the input surface stride. */
			reg = ((rubik_surface->dst_data.channel *
				map_bpe[rubik_op->precision] + 31) >> 5) *
				rubik_surface->src_data.surf_stride;
			rubik_reg_write(D_CONTRACT_STRIDE_0, reg);
			reg = rubik_op->stride_y *
				rubik_surface->dst_data.line_stride;
			rubik_reg_write(D_CONTRACT_STRIDE_1, reg);
			reg = (((uint32_t)(rubik_op->stride_x-1)) <<
				SHIFT(RBK_D_DECONV_STRIDE_0, DECONV_X_STRIDE)) |
				(((uint32_t)(rubik_op->stride_y-1)) <<
				SHIFT(RBK_D_DECONV_STRIDE_0, DECONV_Y_STRIDE));
			rubik_reg_write(D_DECONV_STRIDE, reg);
		}
	} else {
		/* SPLIT writes planar output. */
		rubik_reg_write(D_DAOUT_PLANAR_STRIDE,
				rubik_surface->dst_data.plane_stride);
	}
exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Readiness hook for the scheduler: RUBIK imposes no additional
 * programming preconditions, so always report ready.
 */
int
dla_rubik_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group)
{
	return 1;
}
/*
 * Log the surface and operation descriptors of a RUBIK group for
 * debugging.
 */
void
dla_rubik_dump_config(struct dla_processor_group *group)
{
	struct dla_rubik_op_desc *op = &group->operation_desc->rubik_op;
	struct dla_rubik_surface_desc *surf =
			&group->surface_desc->rubik_surface;

	dla_debug_rubik_surface_desc(surf, group->roi_index);
	dla_debug_rubik_op_desc(op, group->roi_index);
}
/*
 * Top-level RUBIK programming entry: reject configurations without a
 * RUBIK unit, unmask the RUBIK completion interrupts for both register
 * groups, then program the operation.
 */
int
dla_rubik_program(struct dla_processor_group *group)
{
	struct dla_engine *engine = dla_get_engine();
	int32_t ret = 0;

	dla_trace("Enter: %s", __func__);

	if (!engine->config_data->rubik_enable) {
		dla_error("RUBIK is not supported for this configuration\n");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

	dla_enable_intr(MASK(GLB_S_INTR_MASK_0, RUBIK_DONE_MASK1) |
			MASK(GLB_S_INTR_MASK_0, RUBIK_DONE_MASK0));

	ret = processor_rubik_program(group);

exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}

1160
drivers/nvdla/scheduler.c Normal file

File diff suppressed because it is too large Load diff

817
drivers/nvdla/sdp.c Normal file
View file

@ -0,0 +1,817 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_interface.h>
#include "common.h"
#include "dla_engine_internal.h"
#include "engine_debug.h"
/* enable flag (0/1) -> BRDMA_DISABLE field; index 0 disables the unit.
 * The same encoding is reused for the N/E RDMA channels. */
static const uint8_t map_ena[] = {
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DISABLE, YES),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DISABLE, NO),
};
/* prelu flag (0/1) -> BS_MUL_PRELU field. */
static const uint8_t map_prelu[] = {
	FIELD_ENUM(SDP_D_DP_BS_CFG_0, BS_MUL_PRELU, NO),
	FIELD_ENUM(SDP_D_DP_BS_CFG_0, BS_MUL_PRELU, YES),
};
/* enable flag (0/1) -> BS_BYPASS field; enabling means "do not bypass". */
static const uint8_t map_bypass[] = {
	FIELD_ENUM(SDP_D_DP_BS_CFG_0, BS_BYPASS, YES),
	FIELD_ENUM(SDP_D_DP_BS_CFG_0, BS_BYPASS, NO),
};
/* ALU operation index -> EW_ALU_ALGO field (max/min/sum/eql). */
static const uint8_t map_alu_op[] = {
	FIELD_ENUM(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO, MAX),
	FIELD_ENUM(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO, MIN),
	FIELD_ENUM(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO, SUM),
	FIELD_ENUM(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO, EQL),
};
/* ALU operand source index -> BS_ALU_SRC field (memory or register). */
static const uint8_t map_alu_src[] = {
	FIELD_ENUM(SDP_D_DP_BS_ALU_CFG_0, BS_ALU_SRC, MEM),
	FIELD_ENUM(SDP_D_DP_BS_ALU_CFG_0, BS_ALU_SRC, REG),
};
/* flying flag (0/1) -> FLYING_MODE field (input from memory vs. CACC). */
static const uint8_t map_fly[] = {
	FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, FLYING_MODE, OFF),
	FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, FLYING_MODE, ON),
};
/* output destination index -> OUTPUT_DST field (memory or PDP). */
static const uint8_t map_dst[] = {
	FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, OUTPUT_DST, MEM),
	FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, OUTPUT_DST, PDP),
};
/* winograd flag (0/1) -> WINOGRAD field. */
static const uint8_t map_wg[] = {
	FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, WINOGRAD, OFF),
	FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, WINOGRAD, ON),
};
/* precision index -> IN_PRECISION field (INT8/INT16/FP16). */
static const uint8_t map_precision[] = {
	FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8),
	FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT16),
	FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, FP16),
};
/* [input precision][output precision] -> processing precision.
 * FP16 never mixes with integer pipelines; INT16 in with INT8 out
 * processes as INT8 (row 0, col 1). */
static const uint32_t map_proc_precision[3][3] = {
	{
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8),
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8),
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, FP16),
	},
	{
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8),
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT16),
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, FP16),
	},
	{
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8),
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT16),
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, FP16),
	},
};
/* op type index -> BRDMA_DATA_USE field; indices 0 and 1 both map to
 * MUL, then ALU, then BOTH. */
static const uint8_t map_op_type[] = {
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE, MUL),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE, MUL),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE, ALU),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE, BOTH),
};
/* precision index -> RDMA element size (INT8: 1 byte, else 2 bytes). */
static const uint8_t map_element_size[] = {
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_SIZE, ONE_BYTE),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_SIZE, TWO_BYTE),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_SIZE, TWO_BYTE),
};
/* operand mode index -> BRDMA_DATA_MODE field (per-element/per-kernel). */
static const uint8_t map_op_mode[] = {
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_MODE, PER_ELEMENT),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_MODE, PER_KERNEL),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_MODE, PER_ELEMENT),
};
/* memory type index -> BRDMA_RAM_TYPE field (MC/CV). */
static const uint8_t map_ram_type[] = {
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_RAM_TYPE, MC),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_RAM_TYPE, CV),
};
/* enable flags (0/1) -> perf-counter enable fields. */
static const uint8_t map_perf_dma[] = {
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_DMA_EN, NO),
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_DMA_EN, YES),
};
static const uint8_t map_perf_lut[] = {
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_LUT_EN, NO),
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_LUT_EN, YES),
};
static const uint8_t map_perf_sat[] = {
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_SAT_EN, NO),
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_SAT_EN, YES),
};
static const uint8_t map_perf_nan_inf[] = {
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_NAN_INF_COUNT_EN, NO),
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_NAN_INF_COUNT_EN, YES),
};
#if STAT_ENABLE
/*
 * Capture SDP statistics for a completed operation: NaN/Inf counters,
 * WDMA stall cycles, and the wall-clock runtime since the group was
 * enabled (see dla_sdp_enable()).
 */
void
dla_sdp_stat_data(struct dla_processor *processor,
		struct dla_processor_group *group)
{
	struct dla_sdp_stat_desc *stat = &processor->stat_data_desc->sdp_stat;
	uint64_t now = dla_get_time_us();

	stat->nan_input_num = sdp_reg_read(D_STATUS_NAN_INPUT_NUM);
	stat->inf_input_num = sdp_reg_read(D_STATUS_INF_INPUT_NUM);
	stat->nan_output_num = sdp_reg_read(D_STATUS_NAN_OUTPUT_NUM);
	stat->wdma_write_stall = sdp_reg_read(D_PERF_WDMA_WRITE_STALL);
	stat->runtime = (uint32_t)(now - group->start_time);
}
/* Dump the previously captured SDP statistics via the debug channel. */
void
dla_sdp_dump_stat(struct dla_processor *processor)
{
	dla_debug_sdp_stats(&processor->stat_data_desc->sdp_stat);
}
#endif /* STAT_ENABLE */
/*
 * Select the register group (ping-pong bank) that subsequent CPU writes
 * will target, for both the SDP core and its RDMA companion.
 */
void
dla_sdp_set_producer(int32_t group_id, int32_t rdma_group_id)
{
	sdp_reg_write(S_POINTER,
		      group_id << SHIFT(SDP_S_POINTER_0, PRODUCER));
	sdp_rdma_reg_write(S_POINTER,
			   rdma_group_id << SHIFT(SDP_RDMA_S_POINTER_0, PRODUCER));
}
/*
 * Kick off a fully programmed SDP operation.
 *
 * When statistics collection is requested, all four performance
 * counters are enabled first and the group start time is recorded
 * (consumed later by dla_sdp_stat_data()).  The RDMA side is enabled
 * before the SDP core itself.
 *
 * Returns 0 (via the RETURN trace macro).
 */
int
dla_sdp_enable(struct dla_processor_group *group)
{
	uint32_t val;
	struct dla_engine *engine = dla_get_engine();

	dla_trace("Enter: %s", __func__);

	if (engine->stat_enable == (uint32_t)1) {
		uint8_t perf = (map_perf_dma[1] <<
				SHIFT(SDP_D_PERF_ENABLE_0, PERF_DMA_EN)) |
			       (map_perf_lut[1] <<
				SHIFT(SDP_D_PERF_ENABLE_0, PERF_LUT_EN)) |
			       (map_perf_sat[1] <<
				SHIFT(SDP_D_PERF_ENABLE_0, PERF_SAT_EN)) |
			       (map_perf_nan_inf[1] <<
				SHIFT(SDP_D_PERF_ENABLE_0, PERF_NAN_INF_COUNT_EN));

		sdp_reg_write(D_PERF_ENABLE, perf);
		group->start_time = dla_get_time_us();
	}

	/* Enable RDMA first (when this op reads operands/input from
	 * memory), then the SDP core. */
	if (group->is_rdma_needed) {
		val = FIELD_ENUM(SDP_RDMA_D_OP_ENABLE_0, OP_EN, ENABLE);
		sdp_rdma_reg_write(D_OP_ENABLE, val);
	}
	val = FIELD_ENUM(SDP_D_OP_ENABLE_0, OP_EN, ENABLE);
	sdp_reg_write(D_OP_ENABLE, val);

	dla_trace("Exit: %s", __func__);
	RETURN(0);
}
/*
 * Decide whether this SDP operation needs its RDMA companion.
 *
 * RDMA is required when the input does not fly in from the convolution
 * core (src is a memory surface), or when any enabled x1/x2/y operand
 * must be fetched from memory (i.e. is not supplied per-layer through
 * registers).  The result is cached in group->is_rdma_needed.
 */
void
dla_sdp_rdma_check(struct dla_processor_group *group)
{
	struct dla_sdp_op_desc *op = &group->operation_desc->sdp_op;
	struct dla_sdp_surface_desc *surf = &group->surface_desc->sdp_surface;
	uint8_t fly = surf->src_data.type == DLA_MEM_HW;
	uint8_t x1_fetch = op->x1_op.enable & (op->x1_op.mode != SDP_OP_PER_LAYER);
	uint8_t x2_fetch = op->x2_op.enable & (op->x2_op.mode != SDP_OP_PER_LAYER);
	uint8_t y_fetch = op->y_op.enable & (op->y_op.mode != SDP_OP_PER_LAYER);

	group->is_rdma_needed = (!fly) || (x1_fetch || x2_fetch || y_fetch);
}
/**
 * processor_sdp_program() - program all SDP and SDP-RDMA registers for one op
 * @group: processor group carrying the SDP operation and surface descriptors.
 *
 * Resolves the DMA addresses for the source and destination cubes and for
 * the x1 (BS stage), x2 (BN stage) and y (EW stage) operand cubes, then:
 *   1. programs the RDMA side (feature mode, per-channel configs, strides),
 *   2. programs the optional LUT,
 *   3. programs the three datapath stages (BS, BN, EW),
 *   4. programs the output destination and final converter.
 *
 * Register write order follows the original NVDLA firmware sequence and
 * should not be rearranged.
 *
 * Return: 0 on success, or the error from dla_read_input_address().
 */
static int32_t
processor_sdp_program(struct dla_processor_group *group)
{
	int32_t ret = 0;
	/* Addresses default to all-ones so an unprogrammed channel is
	 * recognizable; only channels enabled below are resolved. */
	uint64_t src_addr = -1, x1_addr = -1, x2_addr = -1;
	uint64_t y_addr = -1, dst_addr = -1;
	uint32_t reg, high, low;
	uint8_t fly;
	uint32_t atom_size;
	struct dla_sdp_op *x1_op;
	struct dla_sdp_op *x2_op;
	struct dla_sdp_op *y_op;
	uint8_t x1_rdma_ena;
	uint8_t x2_rdma_ena;
	uint8_t y_rdma_ena;
	uint8_t out_dma_ena;
	struct dla_lut_param lut;
	struct dla_engine *engine = dla_get_engine();
	struct dla_sdp_op_desc *sdp_op;
	struct dla_sdp_surface_desc *sdp_surface;

	dla_trace("Enter: %s", __func__);

	atom_size = engine->config_data->atom_size;

	sdp_op = &group->operation_desc->sdp_op;
	sdp_surface = &group->surface_desc->sdp_surface;

	/* "Flying" mode: input streams directly from the conv core, so no
	 * source DMA read is performed. */
	fly = sdp_surface->src_data.type == DLA_MEM_HW;
	/* Output DMA is used unless the result is forwarded on-chip (PDP). */
	out_dma_ena = sdp_surface->dst_data.type != DLA_MEM_HW;

	x1_op = &sdp_op->x1_op;
	x2_op = &sdp_op->x2_op;
	y_op = &sdp_op->y_op;
	/* A channel fetches data only if its op is enabled and consumes
	 * operand data at all. */
	x1_rdma_ena = x1_op->enable && x1_op->type != SDP_OP_NONE;
	x2_rdma_ena = x2_op->enable && x2_op->type != SDP_OP_NONE;
	y_rdma_ena = y_op->enable && y_op->type != SDP_OP_NONE;

	/* load address */
	if (!fly) {
		ret = dla_read_input_address(&sdp_surface->src_data,
						&src_addr,
						group->op_desc->index,
						group->roi_index,
						1);
		if (ret)
			goto exit;
		CHECK_ALIGN(src_addr, atom_size);
	}

	if (out_dma_ena) {
		/* NOTE(review): the return value of dla_get_dma_cube_address()
		 * is ignored here and for the x1/x2/y lookups below — confirm
		 * the call cannot fail for descriptor-supplied addresses. */
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					sdp_surface->dst_data.address,
					sdp_surface->dst_data.offset,
					(void *)&dst_addr,
					DESTINATION_DMA);
		CHECK_ALIGN(dst_addr, atom_size);
	}

	/* Load the LUT descriptor now; it is written to hardware after the
	 * RDMA setup (see update_lut() below). */
	if (sdp_op->lut_index >= 0) {
		group->lut_index = sdp_op->lut_index;
		dla_read_lut(engine, sdp_op->lut_index, (void *)&lut);
		dla_debug_lut_params(&lut);
	}

	/* Per-layer operands come from registers, not memory, so they never
	 * need an RDMA fetch. */
	x1_rdma_ena &= (x1_op->mode != SDP_OP_PER_LAYER);
	x2_rdma_ena &= (x2_op->mode != SDP_OP_PER_LAYER);
	y_rdma_ena &= (y_op->mode != SDP_OP_PER_LAYER);

	if (x1_rdma_ena) {
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					sdp_surface->x1_data.address,
					sdp_surface->x1_data.offset,
					(void *)&x1_addr,
					DESTINATION_DMA);
		CHECK_ALIGN(x1_addr, atom_size);
	}

	if (x2_rdma_ena) {
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					sdp_surface->x2_data.address,
					sdp_surface->x2_data.offset,
					(void *)&x2_addr,
					DESTINATION_DMA);
		CHECK_ALIGN(x2_addr, atom_size);
	}

	if (y_rdma_ena) {
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					sdp_surface->y_data.address,
					sdp_surface->y_data.offset,
					(void *)&y_addr,
					DESTINATION_DMA);
		CHECK_ALIGN(y_addr, atom_size);
	}

	/* Pre-set the RDMA feature mode and the three channel configs before
	 * the full programming below.  NOTE(review): map_ena is defined
	 * earlier in this file; index 1 appears to encode the *_DISABLE
	 * field value used for an active channel — confirm against the
	 * map_ena definition. */
	reg = (map_fly[0] << SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, FLYING_MODE));
	sdp_rdma_reg_write(D_FEATURE_MODE_CFG, reg);
	reg = (map_ena[1] << SHIFT(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DISABLE));
	sdp_rdma_reg_write(D_BRDMA_CFG, reg);
	reg = (map_ena[1] << SHIFT(SDP_RDMA_D_NRDMA_CFG_0, NRDMA_DISABLE));
	sdp_rdma_reg_write(D_NRDMA_CFG, reg);
	reg = (map_ena[1] << SHIFT(SDP_RDMA_D_ERDMA_CFG_0, ERDMA_DISABLE));
	sdp_rdma_reg_write(D_ERDMA_CFG, reg);

	/* RDMA feature mode: flying mode, winograd, in/out/processing
	 * precisions and the batch count (register holds batch_num - 1). */
	reg = (map_fly[fly] <<
			SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, FLYING_MODE)) |
		(map_wg[sdp_op->conv_mode == CONV_MODE_WINOGRAD] <<
			SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, WINOGRAD)) |
		(map_precision[sdp_op->src_precision] <<
			SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION)) |
		(map_precision[sdp_op->dst_precision] <<
			SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, OUT_PRECISION)) |
		(map_proc_precision[sdp_op->dst_precision][sdp_op->src_precision] <<
			SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, PROC_PRECISION)) |
		((sdp_op->batch_num-1) <<
			SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, BATCH_NUMBER));
	sdp_rdma_reg_write(D_FEATURE_MODE_CFG, reg);

	if (group->is_rdma_needed) {
		/* Cube dimensions are programmed as value - 1 per the
		 * register definitions. */
		sdp_rdma_reg_write(D_DATA_CUBE_WIDTH,
					sdp_surface->src_data.width - 1);
		sdp_rdma_reg_write(D_DATA_CUBE_HEIGHT,
					sdp_surface->src_data.height - 1);
		sdp_rdma_reg_write(D_DATA_CUBE_CHANNEL,
					sdp_surface->src_data.channel - 1);

		/* config SDP source info */
		if (!fly) {
			/**
			 * if not on-the-fly, we have to config
			 * the source cube info
			 */
			high = HIGH32BITS(src_addr);
			low = LOW32BITS(src_addr);
			sdp_rdma_reg_write(D_SRC_BASE_ADDR_LOW, low);
			sdp_rdma_reg_write(D_SRC_BASE_ADDR_HIGH, high);
			sdp_rdma_reg_write(D_SRC_LINE_STRIDE,
					sdp_surface->src_data.line_stride);
			sdp_rdma_reg_write(D_SRC_SURFACE_STRIDE,
					sdp_surface->src_data.surf_stride);
			sdp_rdma_reg_write(D_SRC_DMA_CFG,
					map_ram_type[sdp_surface->src_data.type]);
		}

		/* config x1 source info (BRDMA channel, feeds BS stage) */
		reg = (map_ena[x1_rdma_ena] <<
				SHIFT(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DISABLE)) |
			(map_op_type[x1_op->type] <<
				SHIFT(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE)) |
			(map_element_size[x1_op->precision] <<
				SHIFT(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_SIZE)) |
			(map_op_mode[x1_op->mode] <<
				SHIFT(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_MODE)) |
			(map_ram_type[sdp_surface->x1_data.type] <<
				SHIFT(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_RAM_TYPE));
		sdp_rdma_reg_write(D_BRDMA_CFG, reg);

		if (x1_rdma_ena) {
			high = HIGH32BITS(x1_addr);
			low = LOW32BITS(x1_addr);
			sdp_rdma_reg_write(D_BS_BASE_ADDR_LOW,
					low);
			sdp_rdma_reg_write(D_BS_BASE_ADDR_HIGH,
					high);
			sdp_rdma_reg_write(D_BS_LINE_STRIDE,
					sdp_surface->x1_data.line_stride);
			sdp_rdma_reg_write(D_BS_SURFACE_STRIDE,
					sdp_surface->x1_data.surf_stride);
		}

		/* config x2 source info (NRDMA channel, feeds BN stage) */
		reg = (map_ena[x2_rdma_ena] <<
				SHIFT(SDP_RDMA_D_NRDMA_CFG_0, NRDMA_DISABLE)) |
			(map_op_type[x2_op->type] <<
				SHIFT(SDP_RDMA_D_NRDMA_CFG_0, NRDMA_DATA_USE)) |
			(map_element_size[x2_op->precision] <<
				SHIFT(SDP_RDMA_D_NRDMA_CFG_0, NRDMA_DATA_SIZE)) |
			(map_op_mode[x2_op->mode] <<
				SHIFT(SDP_RDMA_D_NRDMA_CFG_0, NRDMA_DATA_MODE)) |
			(map_ram_type[sdp_surface->x2_data.type] <<
				SHIFT(SDP_RDMA_D_NRDMA_CFG_0, NRDMA_RAM_TYPE));
		sdp_rdma_reg_write(D_NRDMA_CFG, reg);

		if (x2_rdma_ena) {
			high = HIGH32BITS(x2_addr);
			low = LOW32BITS(x2_addr);
			sdp_rdma_reg_write(D_BN_BASE_ADDR_LOW,
					low);
			sdp_rdma_reg_write(D_BN_BASE_ADDR_HIGH,
					high);
			sdp_rdma_reg_write(D_BN_LINE_STRIDE,
					sdp_surface->x2_data.line_stride);
			sdp_rdma_reg_write(D_BN_SURFACE_STRIDE,
					sdp_surface->x2_data.surf_stride);
		}

		/* config y source info (ERDMA channel, feeds EW stage) */
		reg = (map_ena[y_rdma_ena] <<
				SHIFT(SDP_RDMA_D_ERDMA_CFG_0, ERDMA_DISABLE)) |
			(map_op_type[y_op->type] <<
				SHIFT(SDP_RDMA_D_ERDMA_CFG_0, ERDMA_DATA_USE)) |
			(map_element_size[y_op->precision] <<
				SHIFT(SDP_RDMA_D_ERDMA_CFG_0, ERDMA_DATA_SIZE)) |
			(map_op_mode[y_op->mode] <<
				SHIFT(SDP_RDMA_D_ERDMA_CFG_0, ERDMA_DATA_MODE)) |
			(map_ram_type[sdp_surface->y_data.type] <<
				SHIFT(SDP_RDMA_D_ERDMA_CFG_0, ERDMA_RAM_TYPE));
		sdp_rdma_reg_write(D_ERDMA_CFG, reg);

		if (y_rdma_ena) {
			high = HIGH32BITS(y_addr);
			low = LOW32BITS(y_addr);
			sdp_rdma_reg_write(D_EW_BASE_ADDR_LOW,
					low);
			sdp_rdma_reg_write(D_EW_BASE_ADDR_HIGH,
					high);
			sdp_rdma_reg_write(D_EW_LINE_STRIDE,
					sdp_surface->y_data.line_stride);
			sdp_rdma_reg_write(D_EW_SURFACE_STRIDE,
					sdp_surface->y_data.surf_stride);
		}
	}

	/* Program the LUT after RDMA setup; lut was filled above when
	 * lut_index >= 0. */
	if (sdp_op->lut_index >= 0)
		update_lut(SDP_S_LUT_ACCESS_CFG_0, &lut,
				sdp_op->src_precision);

	sdp_reg_write(D_DATA_CUBE_WIDTH, sdp_surface->src_data.width - 1);
	sdp_reg_write(D_DATA_CUBE_HEIGHT, sdp_surface->src_data.height - 1);
	sdp_reg_write(D_DATA_CUBE_CHANNEL, sdp_surface->src_data.channel - 1);

	if (out_dma_ena) {
		high = HIGH32BITS(dst_addr);
		low = LOW32BITS(dst_addr);
		sdp_reg_write(D_DST_BASE_ADDR_HIGH,
				high);
		sdp_reg_write(D_DST_BASE_ADDR_LOW,
				low);
		sdp_reg_write(D_DST_LINE_STRIDE,
				sdp_surface->dst_data.line_stride);
		sdp_reg_write(D_DST_SURFACE_STRIDE,
				sdp_surface->dst_data.surf_stride);
	}

	/* Config BS module */
	/* ALU is used (not bypassed) for ADD/BOTH ops, MUL for MUL/BOTH;
	 * map_bypass is defined earlier in this file. */
	reg = (map_bypass[x1_op->enable] <<
			SHIFT(SDP_D_DP_BS_CFG_0, BS_BYPASS)) |
		(map_bypass[x1_op->type != SDP_OP_MUL &&
				x1_op->type != SDP_OP_NONE] <<
			SHIFT(SDP_D_DP_BS_CFG_0, BS_ALU_BYPASS)) |
		(map_alu_op[x1_op->alu_type] <<
			SHIFT(SDP_D_DP_BS_CFG_0, BS_ALU_ALGO)) |
		(map_bypass[x1_op->type != SDP_OP_ADD &&
				x1_op->type != SDP_OP_NONE] <<
			SHIFT(SDP_D_DP_BS_CFG_0, BS_MUL_BYPASS)) |
		(map_prelu[x1_op->act == ACTIVATION_PRELU]
			<< SHIFT(SDP_D_DP_BS_CFG_0, BS_MUL_PRELU)) |
		(map_bypass[x1_op->act == ACTIVATION_RELU] <<
			SHIFT(SDP_D_DP_BS_CFG_0, BS_RELU_BYPASS));
	sdp_reg_write(D_DP_BS_CFG, reg);

	if (x1_op->enable) {
		if (x1_op->type == SDP_OP_ADD ||
				x1_op->type == SDP_OP_BOTH) {
			/* Per-layer operands come from the *_SRC_VALUE
			 * registers (REG source), otherwise from memory. */
			reg = (map_alu_src[x1_op->mode == SDP_OP_PER_LAYER] <<
					SHIFT(SDP_D_DP_BS_ALU_CFG_0,
							BS_ALU_SRC)) |
				(x1_op->shift_value <<
					SHIFT(SDP_D_DP_BS_ALU_CFG_0,
							BS_ALU_SHIFT_VALUE));
			sdp_reg_write(D_DP_BS_ALU_CFG, reg);
		}

		if (x1_op->mode == SDP_OP_PER_LAYER) {
			sdp_reg_write(D_DP_BS_ALU_SRC_VALUE,
					x1_op->alu_operand);
			sdp_reg_write(D_DP_BS_MUL_SRC_VALUE,
					x1_op->mul_operand);
		}

		/**
		 * MUL truncate will take effect no matter
		 * MUL is bypassed or not
		 */
		reg = (map_alu_src[x1_op->mode == SDP_OP_PER_LAYER] <<
				SHIFT(SDP_D_DP_BS_MUL_CFG_0,
						BS_MUL_SRC)) |
			(x1_op->truncate <<
				SHIFT(SDP_D_DP_BS_MUL_CFG_0,
						BS_MUL_SHIFT_VALUE));
		sdp_reg_write(D_DP_BS_MUL_CFG, reg);
	}

	/* Config BN module (same layout as BS, driven by x2_op) */
	reg = (map_bypass[x2_op->enable] <<
			SHIFT(SDP_D_DP_BN_CFG_0, BN_BYPASS)) |
		(map_bypass[x2_op->type != SDP_OP_MUL &&
				x2_op->type != SDP_OP_NONE] <<
			SHIFT(SDP_D_DP_BN_CFG_0, BN_ALU_BYPASS)) |
		(map_alu_op[x2_op->alu_type] <<
			SHIFT(SDP_D_DP_BN_CFG_0, BN_ALU_ALGO)) |
		(map_bypass[x2_op->type != SDP_OP_ADD &&
				x2_op->type != SDP_OP_NONE] <<
			SHIFT(SDP_D_DP_BN_CFG_0, BN_MUL_BYPASS)) |
		(map_prelu[x2_op->act == ACTIVATION_PRELU]
			<< SHIFT(SDP_D_DP_BN_CFG_0, BN_MUL_PRELU)) |
		(map_bypass[x2_op->act == ACTIVATION_RELU]
			<< SHIFT(SDP_D_DP_BN_CFG_0, BN_RELU_BYPASS));
	sdp_reg_write(D_DP_BN_CFG, reg);

	if (x2_op->enable) {
		if (x2_op->type == SDP_OP_ADD ||
				x2_op->type == SDP_OP_BOTH) {
			reg = (map_alu_src[x2_op->mode == SDP_OP_PER_LAYER] <<
					SHIFT(SDP_D_DP_BN_ALU_CFG_0,
							BN_ALU_SRC)) |
				(x2_op->shift_value <<
					SHIFT(SDP_D_DP_BN_ALU_CFG_0,
							BN_ALU_SHIFT_VALUE));
			sdp_reg_write(D_DP_BN_ALU_CFG, reg);
		}

		if (x2_op->mode == SDP_OP_PER_LAYER) {
			sdp_reg_write(D_DP_BN_ALU_SRC_VALUE,
					x2_op->alu_operand);
			sdp_reg_write(D_DP_BN_MUL_SRC_VALUE,
					x2_op->mul_operand);
		}

		/* MUL truncate applies regardless of MUL bypass (as in BS). */
		reg = (map_alu_src[x2_op->mode == SDP_OP_PER_LAYER] <<
				SHIFT(SDP_D_DP_BN_MUL_CFG_0,
						BN_MUL_SRC)) |
			(x2_op->truncate <<
				SHIFT(SDP_D_DP_BN_MUL_CFG_0,
						BN_MUL_SHIFT_VALUE));
		sdp_reg_write(D_DP_BN_MUL_CFG, reg);
	}

	/* Config EW module (y_op); unlike BS/BN the EW stage has
	 * per-operand converters and an optional LUT activation. */
	reg = (map_bypass[y_op->enable] <<
			SHIFT(SDP_D_DP_EW_CFG_0, EW_BYPASS)) |
		(map_bypass[y_op->type != SDP_OP_MUL &&
				y_op->type != SDP_OP_NONE] <<
			SHIFT(SDP_D_DP_EW_CFG_0, EW_ALU_BYPASS)) |
		(map_alu_op[y_op->alu_type] <<
			SHIFT(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO)) |
		(map_bypass[y_op->type != SDP_OP_ADD &&
				y_op->type != SDP_OP_NONE] <<
			SHIFT(SDP_D_DP_EW_CFG_0, EW_MUL_BYPASS)) |
		((map_prelu[y_op->act == ACTIVATION_PRELU]) <<
			SHIFT(SDP_D_DP_EW_CFG_0, EW_MUL_PRELU)) |
		(map_bypass[y_op->act == ACTIVATION_LUT] <<
			SHIFT(SDP_D_DP_EW_CFG_0, EW_LUT_BYPASS));
	sdp_reg_write(D_DP_EW_CFG, reg);

	if (y_op->enable) {
		if (y_op->type == SDP_OP_ADD || y_op->type == SDP_OP_BOTH) {
			reg = (map_alu_src[y_op->mode == SDP_OP_PER_LAYER] <<
					SHIFT(SDP_D_DP_EW_ALU_CFG_0,
							EW_ALU_SRC)) |
				(map_bypass[y_op->cvt.alu_cvt.enable] <<
					SHIFT(SDP_D_DP_EW_ALU_CFG_0,
							EW_ALU_CVT_BYPASS));
			sdp_reg_write(D_DP_EW_ALU_CFG, reg);

			/* Per-layer: operand via register; otherwise program
			 * the ALU input converter (offset/scale/truncate). */
			if (y_op->mode == SDP_OP_PER_LAYER) {
				sdp_reg_write(D_DP_EW_ALU_SRC_VALUE,
						y_op->alu_operand);
			} else {
				sdp_reg_write(D_DP_EW_ALU_CVT_OFFSET_VALUE,
						y_op->cvt.alu_cvt.offset);
				sdp_reg_write(D_DP_EW_ALU_CVT_SCALE_VALUE,
						y_op->cvt.alu_cvt.scale);
				sdp_reg_write(D_DP_EW_ALU_CVT_TRUNCATE_VALUE,
						y_op->cvt.alu_cvt.truncate);
			}
		}

		if (y_op->type == SDP_OP_MUL || y_op->type == SDP_OP_BOTH) {
			reg = (map_alu_src[y_op->mode == SDP_OP_PER_LAYER] <<
					SHIFT(SDP_D_DP_EW_MUL_CFG_0,
							EW_MUL_SRC)) |
				(map_bypass[y_op->cvt.mul_cvt.enable] <<
					SHIFT(SDP_D_DP_EW_MUL_CFG_0,
							EW_MUL_CVT_BYPASS));
			sdp_reg_write(D_DP_EW_MUL_CFG, reg);

			if (y_op->mode == SDP_OP_PER_LAYER) {
				sdp_reg_write(D_DP_EW_MUL_SRC_VALUE,
						y_op->mul_operand);
			} else {
				sdp_reg_write(D_DP_EW_MUL_CVT_OFFSET_VALUE,
						y_op->cvt.mul_cvt.offset);
				sdp_reg_write(D_DP_EW_MUL_CVT_SCALE_VALUE,
						y_op->cvt.mul_cvt.scale);
				sdp_reg_write(D_DP_EW_MUL_CVT_TRUNCATE_VALUE,
						y_op->cvt.mul_cvt.truncate);
			}
		}

		sdp_reg_write(D_DP_EW_TRUNCATE_VALUE, y_op->truncate);
	}

	/* SDP core feature mode: flying input, output destination (MEM or
	 * PDP), winograd, and batch count (batch_num - 1). */
	reg = (map_fly[sdp_surface->src_data.type == DLA_MEM_HW] <<
			SHIFT(SDP_D_FEATURE_MODE_CFG_0, FLYING_MODE)) |
		(map_dst[sdp_surface->dst_data.type == DLA_MEM_HW] <<
			SHIFT(SDP_D_FEATURE_MODE_CFG_0, OUTPUT_DST)) |
		(map_wg[sdp_op->conv_mode == CONV_MODE_WINOGRAD] <<
			SHIFT(SDP_D_FEATURE_MODE_CFG_0, WINOGRAD)) |
		((sdp_op->batch_num - 1) <<
			SHIFT(SDP_D_FEATURE_MODE_CFG_0, BATCH_NUMBER));
	sdp_reg_write(D_FEATURE_MODE_CFG, reg);

	sdp_reg_write(D_DST_DMA_CFG,
			map_ram_type[sdp_surface->dst_data.type]);
	if (sdp_op->batch_num > 1)
		sdp_reg_write(D_DST_BATCH_STRIDE, sdp_op->batch_stride);

	reg =
	(map_proc_precision[sdp_op->dst_precision][sdp_op->src_precision] <<
			SHIFT(SDP_D_DATA_FORMAT_0, PROC_PRECISION)) |
		(map_precision[sdp_op->dst_precision] <<
			SHIFT(SDP_D_DATA_FORMAT_0, OUT_PRECISION));
	sdp_reg_write(D_DATA_FORMAT, reg);

	/* Final output converter (offset/scale/shift). */
	sdp_reg_write(D_CVT_OFFSET, sdp_op->out_cvt.offset);
	sdp_reg_write(D_CVT_SCALE, sdp_op->out_cvt.scale);
	sdp_reg_write(D_CVT_SHIFT, sdp_op->out_cvt.truncate);

exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Decide whether this SDP group can be programmed right now.
 *
 * A single LUT is shared between the two SDP groups, and LUT write
 * access is locked while the SDP sub-engine is active.  Programming is
 * therefore deferred when the other group holds (or will hold) a
 * different LUT and is still busy.
 *
 * Returns 1 when programming may proceed, 0 when it must wait.
 */
int
dla_sdp_is_ready(struct dla_processor *processor,
		struct dla_processor_group *group)
{
	struct dla_sdp_op_desc *sdp_op = &group->operation_desc->sdp_op;
	struct dla_processor_group *peer = &processor->groups[!group->id];

	/* No LUT required: nothing to arbitrate. */
	if (sdp_op->lut_index == -1)
		return 1;

	/* Both groups use the same LUT: whichever group programmed it,
	 * the table is (or will be) correct. */
	if (peer->lut_index == sdp_op->lut_index)
		return 1;

	/* The other group holds a different LUT: do not overwrite it. */
	if (peer->lut_index != -1)
		return 0;

	/* Peer has no LUT programmed: safe only while the peer is idle,
	 * since LUT writes are locked during execution. */
	return !peer->active;
}
/* Dump the SDP surface and operation descriptors for this group. */
void
dla_sdp_dump_config(struct dla_processor_group *group)
{
	struct dla_sdp_surface_desc *surf = &group->surface_desc->sdp_surface;
	struct dla_sdp_op_desc *op = &group->operation_desc->sdp_op;

	dla_debug_sdp_surface_desc(surf, group->roi_index);
	dla_debug_sdp_op_desc(op, group->roi_index);
}
/*
 * Top-level SDP programming entry point.
 *
 * Unmasks the SDP-done interrupts for both groups, then programs all
 * SDP/SDP-RDMA registers via processor_sdp_program().
 *
 * Fix: the original body had `if (ret) goto exit;` jumping to a label
 * placed on the very next statement — dead control flow with no effect.
 * The redundant goto and label are removed; behavior is unchanged.
 *
 * Return: 0 on success, error code from processor_sdp_program().
 */
int
dla_sdp_program(struct dla_processor_group *group)
{
	int32_t ret;

	dla_trace("Enter: %s", __func__);
	dla_enable_intr(MASK(GLB_S_INTR_MASK_0, SDP_DONE_MASK1) |
			MASK(GLB_S_INTR_MASK_0, SDP_DONE_MASK0));

	ret = processor_sdp_program(group);

	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}