nvdla: add NVDLA driver

Additional update from Prashant Gaikwad <pgaikwad@nvidia.com>
Adapted for Linux 5.13 and the BeagleV Starlight board by
<cybergaszcz@gmail.com>
This commit is contained in:
Farzad Farshchi 2018-09-20 19:08:27 -05:00 committed by Emil Renner Berthing
parent 1aaa011e7e
commit 29e676e7fa
33 changed files with 32588 additions and 0 deletions

View file

@ -236,4 +236,6 @@ source "drivers/interconnect/Kconfig"
source "drivers/counter/Kconfig"
source "drivers/most/Kconfig"
source "drivers/nvdla/Kconfig"
endmenu

View file

@ -189,3 +189,4 @@ obj-$(CONFIG_GNSS) += gnss/
obj-$(CONFIG_INTERCONNECT) += interconnect/
obj-$(CONFIG_COUNTER) += counter/
obj-$(CONFIG_MOST) += most/
obj-$(CONFIG_NVDLA) += nvdla/

5
drivers/nvdla/Kconfig Normal file
View file

@ -0,0 +1,5 @@
# NVDLA: driver for the NVIDIA Deep Learning Accelerator.
# Depends on DRM because buffer management is built on the DRM GEM CMA
# helpers (see the select below and nvdla_gem.o in the Makefile).
config NVDLA
bool "The NVIDIA Deep Learning Accelerator"
default n
depends on DRM
select DRM_GEM_CMA_HELPER

19
drivers/nvdla/Makefile Normal file
View file

@ -0,0 +1,19 @@
# Make the driver's private headers visible to every object below:
# the directory itself and its include/ subdirectory (register and
# interface definitions).
ccflags-$(CONFIG_NVDLA) += -I$(srctree)/$(src)
ccflags-$(CONFIG_NVDLA) += -I$(srctree)/$(src)/include
# Engine/firmware-style core code (per-processor programming units)
# plus the Linux glue objects (nvdla_core_callbacks.o, nvdla_gem.o).
obj-$(CONFIG_NVDLA) += scheduler.o
obj-$(CONFIG_NVDLA) += engine.o
obj-$(CONFIG_NVDLA) += bdma.o
obj-$(CONFIG_NVDLA) += conv.o
obj-$(CONFIG_NVDLA) += sdp.o
obj-$(CONFIG_NVDLA) += cdp.o
obj-$(CONFIG_NVDLA) += pdp.o
obj-$(CONFIG_NVDLA) += rubik.o
obj-$(CONFIG_NVDLA) += cache.o
obj-$(CONFIG_NVDLA) += common.o
obj-$(CONFIG_NVDLA) += engine_data.o
obj-$(CONFIG_NVDLA) += engine_isr.o
obj-$(CONFIG_NVDLA) += engine_debug.o
obj-$(CONFIG_NVDLA) += nvdla_core_callbacks.o
obj-$(CONFIG_NVDLA) += nvdla_gem.o

280
drivers/nvdla/bdma.c Normal file
View file

@ -0,0 +1,280 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "dla_engine_internal.h"
#include "engine_debug.h"
/*
 * Map the interface-level memory-type index (0 = external memory via
 * the MC, 1 = on-chip CVSRAM) to the BDMA_CFG_CMD_0 RAM-type field
 * encoding used by the hardware.
 */
static const uint8_t map_mem[] = {
	FIELD_ENUM(BDMA_CFG_CMD_0, SRC_RAM_TYPE, MC),
	FIELD_ENUM(BDMA_CFG_CMD_0, SRC_RAM_TYPE, CVSRAM),
};
#if STAT_ENABLE
/*
 * Capture BDMA performance counters for the group that just finished:
 * read/write stall counts for the group's register set and the wall
 * clock runtime since the group was enabled.
 */
void
dla_bdma_stat_data(struct dla_processor *processor,
		   struct dla_processor_group *group)
{
	struct dla_bdma_stat_desc *stat =
		&processor->stat_data_desc->bdma_stat;
	uint64_t now = dla_get_time_us();

	if (group->id != (uint32_t)0) {
		stat->read_stall = bdma_reg_read(STATUS_GRP1_READ_STALL);
		stat->write_stall = bdma_reg_read(STATUS_GRP1_WRITE_STALL);
	} else {
		stat->read_stall = bdma_reg_read(STATUS_GRP0_READ_STALL);
		stat->write_stall = bdma_reg_read(STATUS_GRP0_WRITE_STALL);
	}
	stat->runtime = (uint32_t)(now - group->start_time);
}
/* Print the most recently captured BDMA statistics. */
void
dla_bdma_dump_stat(struct dla_processor *processor)
{
	dla_debug_bdma_stats(&processor->stat_data_desc->bdma_stat);
}
#endif /* STAT_ENABLE */
/**
 * BDMA has no producer pointer register: the interrupt pointer decides
 * which outstanding request is used for a BDMA operation, so there is
 * nothing to program here.  The empty implementation keeps the
 * processor-ops interface uniform across sub-engines.
 */
void
dla_bdma_set_producer(int32_t group_id, int32_t rdma_group_id)
{
	/* Explicitly consume the parameters so builds with
	 * -Wunused-parameter stay warning-free. */
	(void)group_id;
	(void)rdma_group_id;
}
/*
 * Kick off a previously programmed BDMA transfer for this group.
 *
 * If nothing was programmed (zero transfers) the operation is reported
 * complete immediately.  Otherwise stall counting is optionally armed
 * for statistics and the launch bit for the group is written.
 * Always returns 0.
 */
int
dla_bdma_enable(struct dla_processor_group *group)
{
	struct dla_engine *engine = dla_get_engine();

	dla_debug("Enter: %s\n", __func__);

	/* Empty transfer list: nothing to launch, signal completion. */
	if (group->surface_desc->bdma_surface.num_transfers == (uint16_t)0) {
		group->events |= ((uint8_t)1 << DLA_EVENT_OP_COMPLETED);
		goto exit;
	}

	if (engine->stat_enable == (uint32_t)1) {
		bdma_reg_write(CFG_STATUS, FIELD_ENUM(BDMA_CFG_STATUS_0,
						STALL_COUNT_EN, YES));
		group->start_time = dla_get_time_us();
	}

	/* Fire the launch bit for whichever group is being enabled. */
	if (group->id != 0)
		bdma_reg_write(CFG_LAUNCH1, FIELD_ENUM(BDMA_CFG_LAUNCH1_0,
						GRP1_LAUNCH, YES));
	else
		bdma_reg_write(CFG_LAUNCH0, FIELD_ENUM(BDMA_CFG_LAUNCH0_0,
						GRP0_LAUNCH, YES));
exit:
	dla_debug("Exit: %s\n", __func__);
	return 0;
}
/* BDMA has no separate RDMA sub-module, so no RDMA group is needed. */
void
dla_bdma_rdma_check(struct dla_processor_group *group)
{
	group->is_rdma_needed = 0;
}
/**
 * Program one BDMA transfer into a free hardware slot.
 *
 * Spin-waits for a free slot, resolves the source and destination DMA
 * addresses from the task descriptor, validates the transfer geometry
 * and writes the per-slot configuration registers, finishing with the
 * CFG_OP enable that commits the slot.
 *
 * Returns 0 on success, ERR(INVALID_INPUT) when a geometry constraint
 * is violated.
 */
static int32_t
processor_bdma_program_slot(struct dla_bdma_surface_desc *bdma_surface,
			struct dla_bdma_transfer_desc *transfer)
{
	int32_t ret = 0;
	uint64_t source_addr = 0;
	uint64_t destination_addr = 0;
	uint32_t high, low, reg;
	uint8_t bdma_free_slots = 0;
	struct dla_engine *engine = dla_get_engine();
	dla_debug("Enter: %s\n", __func__);
	/* make sure there're enough free slots */
	/* NOTE(review): bdma_free_slots starts at 0, so this branch is
	 * always taken and the FREE_SLOT field is re-polled on every
	 * call; the local count is never carried between calls. */
	if (bdma_free_slots <= 0) {
		do {
			reg = bdma_reg_read(STATUS);
			reg = (reg & MASK(BDMA_STATUS_0, FREE_SLOT)) >>
					SHIFT(BDMA_STATUS_0, FREE_SLOT);
		} while (reg == 0);
		bdma_free_slots = (uint8_t)reg;
	}
	/* Translate task-relative addresses into DMA addresses. */
	dla_get_dma_address(engine->driver_context, engine->task->task_data,
			transfer->source_address,
			(void *)&source_addr,
			DESTINATION_DMA);
	dla_get_dma_address(engine->driver_context, engine->task->task_data,
			transfer->destination_address,
			(void *)&destination_addr,
			DESTINATION_DMA);
	/* Geometry limits: repeats fit the hardware counters, line size
	 * is a multiple of 32 bytes, and strides are large enough that
	 * lines/surfaces do not overlap. */
	ASSERT_GOTO((transfer->line_repeat <= 8192),
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO((transfer->surface_repeat <= 8192),
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO((transfer->line_size % 32) == 0,
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO(transfer->source_line >= transfer->line_size,
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO(transfer->destination_line >= transfer->line_size,
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO(transfer->source_surface >=
			(transfer->source_line * transfer->line_repeat),
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO(transfer->destination_surface >=
			(transfer->destination_line * transfer->line_repeat),
			ret, ERR(INVALID_INPUT), exit);
	/* config registers */
	high = HIGH32BITS(source_addr);
	low = LOW32BITS(source_addr);
	bdma_reg_write(CFG_SRC_ADDR_LOW, low);
	bdma_reg_write(CFG_SRC_ADDR_HIGH, high);
	high = HIGH32BITS(destination_addr);
	low = LOW32BITS(destination_addr);
	bdma_reg_write(CFG_DST_ADDR_LOW, low);
	bdma_reg_write(CFG_DST_ADDR_HIGH, high);
	/* Hardware encodes line size in 32-byte units, minus one. */
	bdma_reg_write(CFG_LINE, (transfer->line_size >> 5) - 1);
	reg = (map_mem[bdma_surface->source_type] <<
			SHIFT(BDMA_CFG_CMD_0, SRC_RAM_TYPE)) |
		(map_mem[bdma_surface->destination_type] <<
			SHIFT(BDMA_CFG_CMD_0, DST_RAM_TYPE));
	bdma_reg_write(CFG_CMD, reg);
	/* Repeat counters are programmed minus one as well. */
	bdma_reg_write(CFG_LINE_REPEAT, transfer->line_repeat - 1);
	bdma_reg_write(CFG_SRC_LINE, transfer->source_line);
	bdma_reg_write(CFG_DST_LINE, transfer->destination_line);
	bdma_reg_write(CFG_SURF_REPEAT, transfer->surface_repeat - 1);
	bdma_reg_write(CFG_SRC_SURF, transfer->source_surface);
	bdma_reg_write(CFG_DST_SURF, transfer->destination_surface);
	/* Commit the slot. */
	bdma_reg_write(CFG_OP, FIELD_ENUM(BDMA_CFG_OP_0, EN, ENABLE));
	/* NOTE(review): this trace is skipped on the ASSERT_GOTO error
	 * paths because it sits before the exit label. */
	dla_debug("Exit: %s\n", __func__);
exit:
	RETURN(ret);
}
/*
 * Decide whether this BDMA group may be programmed now.
 *
 * BDMA has no real shadow register copies per group: if the other
 * group has been programmed but not yet enabled, programming now would
 * overwrite it, so wait until the other group becomes active.
 */
int
dla_bdma_is_ready(struct dla_processor *processor,
		  struct dla_processor_group *group)
{
	struct dla_processor_group *other = &processor->groups[!group->id];
	int other_programmed = (processor->group_status &
				(1 << other->id)) != 0;

	if (other_programmed && !other->active)
		return 0;

	return 1;
}
/* Dump the BDMA surface and operation descriptors for this group. */
void
dla_bdma_dump_config(struct dla_processor_group *group)
{
	struct dla_bdma_surface_desc *surf =
		&group->surface_desc->bdma_surface;
	struct dla_bdma_op_desc *op = &group->operation_desc->bdma_op;

	dla_debug_bdma_surface_desc(surf, group->roi_index);
	dla_debug_bdma_op_desc(op, group->roi_index);
}
/*
 * Program every transfer of this group's BDMA surface into hardware
 * slots and unmask the BDMA done interrupts.
 *
 * Returns 0 on success (including the zero-transfer case) or
 * ERR(INVALID_INPUT) when BDMA is unsupported, the transfer count is
 * out of range, or a slot fails to program.
 */
int
dla_bdma_program(struct dla_processor_group *group)
{
	int32_t ret = 0;
	int32_t xfer;
	struct dla_bdma_surface_desc *surface;
	struct dla_engine *engine = dla_get_engine();

	dla_debug("Enter: %s\n", __func__);

	if (!engine->config_data->bdma_enable) {
		dla_error("BDMA is not supported for this configuration\n");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

	surface = &group->surface_desc->bdma_surface;
	dla_debug("Num of transfers %u\n", surface->num_transfers);

	if (surface->num_transfers == (uint16_t)0)
		goto exit;

	if (surface->num_transfers > NUM_MAX_BDMA_OPS) {
		dla_error("Invalid number of transfers\n");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

	for (xfer = 0; xfer < surface->num_transfers; xfer++) {
		ret = processor_bdma_program_slot(surface,
						&surface->transfers[xfer]);
		if (ret)
			goto exit;
	}

	dla_enable_intr(MASK(GLB_S_INTR_MASK_0, BDMA_DONE_MASK1) |
			MASK(GLB_S_INTR_MASK_0, BDMA_DONE_MASK0));
exit:
	dla_debug("Exit: %s\n", __func__);
	RETURN(ret);
}

253
drivers/nvdla/cache.c Normal file
View file

@ -0,0 +1,253 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_engine.h>
#include <dla_interface.h>
#include "dla_engine_internal.h"
/* Number of cache slots per op type; sized from the group count and
 * op count so concurrent groups always find a free slot. */
#define DLA_OP_CACHE_SIZE (DLA_NUM_GROUPS * ((DLA_OP_NUM + 2) * 2))
/* Software cache of common op descriptors read from DRAM, one row per
 * op type.  A slot with index == -1 is free. */
static struct dla_common_op_desc desc_cache[DLA_OP_NUM][DLA_OP_CACHE_SIZE];
/* Reference count for the cache entry at the same [op][slot]. */
static int32_t desc_refcount[DLA_OP_NUM][DLA_OP_CACHE_SIZE];
/*
 * Take an additional reference on an already-cached descriptor.
 * No-ops for NULL descriptors or descriptors without a valid index.
 */
void
dla_get_refcount(struct dla_common_op_desc *op_desc)
{
	int32_t slot;
	struct dla_common_op_desc *entry;

	if (op_desc == NULL || op_desc->index == -1)
		return;

	entry = &desc_cache[op_desc->op_type][0];
	for (slot = 0; slot < DLA_OP_CACHE_SIZE; slot++, entry++) {
		if (entry->index == op_desc->index &&
				entry->roi_index == op_desc->roi_index) {
			desc_refcount[op_desc->op_type][slot]++;
			return;
		}
	}
}
/**
 * Look up (or fault in) the common op descriptor for @index/@roi_index.
 *
 * First scans the per-op-type cache for a matching entry and bumps its
 * refcount on a hit.  On a miss, the first free slot (index == -1) is
 * filled by reading the descriptor from the task's dependency graph in
 * DRAM; the entry's op_type read back from DRAM must match @op_type or
 * the slot is invalidated and NULL is returned.
 *
 * Returns the cached descriptor, or NULL on index == -1, read failure,
 * op_type mismatch, or (implicitly) a full cache.
 */
struct dla_common_op_desc *
dla_get_op_desc(struct dla_task *task, int16_t index,
			uint8_t op_type, uint8_t roi_index)
{
	int32_t i;
	int32_t ret;
	uint64_t op_base;
	uint64_t dep_graph_addr;
	struct dla_common_op_desc *desc = NULL;
	struct dla_engine *engine = dla_get_engine();
	if (index == -1) {
		dla_debug("no desc get due to index==-1\n");
		goto exit;
	}
	/* Byte offset of this ROI's slice of the dependency graph. */
	dep_graph_addr = (sizeof(struct dla_common_op_desc) *
				engine->network->num_operations * roi_index);
	/* Fast path: the descriptor is already cached. */
	desc = &desc_cache[op_type][0];
	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
		if (desc->index == index && desc->roi_index == roi_index) {
			if (desc->op_type != op_type) {
				/* A stale/corrupt entry; keep scanning. */
				dla_error("op_cache[op=%u] contains incorrect "
					"entry of op[%u]\n", op_type,
					desc->op_type);
				continue;
			}
			desc_refcount[op_type][i]++;
			goto exit;
		}
	}
	/* Slow path: fill the first free slot from DRAM. */
	desc = &desc_cache[op_type][0];
	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
		if (desc->index == -1) {
			op_base = dep_graph_addr +
					(sizeof(struct dla_common_op_desc) *
					(uint64_t)index);
			ret = dla_data_read(engine->driver_context,
					task->task_data,
					task->dependency_graph_addr,
					(void *)(desc),
					sizeof(struct dla_common_op_desc),
					op_base);
			if (ret) {
				desc = NULL;
				goto exit;
			}
			if (op_type != desc->op_type) {
				/*
				 * op_type of entry read from DRAM should not
				 * mismatch with given op_type. If they
				 * mismatches, then wrong entry is fetched, so
				 * report this issue by throwing error.
				 */
				dla_error("Fetched [op_type=%u] from DRAM doesn't "
						"match with op_type[%u]\n",
						desc->op_type,
						op_type);
				/* Re-mark the slot free before bailing out. */
				desc->op_type = op_type;
				desc->index = -1;
				desc->roi_index = -1;
				desc = NULL;
				goto exit;
			}
			desc->index = index;
			desc->roi_index = roi_index;
			/**
			 * Refcount must be 0 if we are reading it first time
			 * from DRAM
			 */
			assert(desc_refcount[op_type][i] == 0);
			desc_refcount[op_type][i]++;
			goto exit;
		}
	}
	/* NOTE(review): falling out of both loops (cache full) returns
	 * a non-NULL pointer to the last scanned entry — presumably the
	 * cache is sized so this cannot happen; verify against callers. */
exit:
	return desc;
}
/**
 * Flush a cached op descriptor back to DRAM and release its slot.
 *
 * Called by dla_put_op_desc() when the refcount drops to zero.  The
 * descriptor is written back to its position in the task's dependency
 * graph, then the slot is marked free (index/roi_index = -1).
 *
 * Fix vs. original: the NULL check was performed only AFTER op_desc
 * had already been dereferenced by the entry trace and the
 * op_desc->index test, so it could never prevent a NULL dereference.
 * The guard now runs first.
 */
static void
dla_free_op_desc(struct dla_common_op_desc *op_desc)
{
	uint64_t op_base;
	uint64_t dep_graph_addr;
	struct dla_task *task;
	struct dla_engine *engine = dla_get_engine();

	/* Guard before any dereference of op_desc. */
	if (op_desc == NULL)
		return;

	dla_debug("Enter: %s op desc index %u ROI %d\n", __func__,
			op_desc->index, op_desc->roi_index);

	if (op_desc->index == -1)
		goto exit;

	task = engine->task;
	dep_graph_addr = (sizeof(struct dla_common_op_desc) *
				engine->network->num_operations *
				op_desc->roi_index);

	/**
	 * TODO: keeping the depth value hardcoded as 0 for now,
	 * need to replace it once corresponding implementation is done.
	 */
	op_base = (dep_graph_addr +
			(sizeof(struct dla_common_op_desc) *
			(uint64_t)op_desc->index));

	/* Flush descriptor to DRAM. */
	dla_data_write(engine->driver_context,
			task->task_data,
			(void *)op_desc,
			task->dependency_graph_addr,
			sizeof(struct dla_common_op_desc),
			op_base);

	/* Release the cache slot. */
	op_desc->index = -1;
	op_desc->roi_index = -1;
exit:
	dla_debug("Exit: %s\n", __func__);
}
/*
 * Drop one reference on a cached op descriptor; when the refcount for
 * its cache slot reaches zero the descriptor is flushed back to DRAM
 * and the slot is released.
 */
void
dla_put_op_desc(struct dla_common_op_desc *op_desc)
{
	int32_t slot;
	int32_t type;
	struct dla_common_op_desc *entry;

	if (op_desc == NULL || op_desc->index == -1)
		return;

	type = op_desc->op_type;
	entry = &desc_cache[type][0];
	for (slot = 0; slot < DLA_OP_CACHE_SIZE; slot++, entry++) {
		if (entry->index != op_desc->index ||
				entry->roi_index != op_desc->roi_index)
			continue;

		/* Refcount can't be 0 while someone still holds it. */
		assert(desc_refcount[type][slot] > 0);
		desc_refcount[type][slot]--;

		/* Last reference gone: flush and free the slot. */
		if (desc_refcount[type][slot] == 0)
			dla_free_op_desc(op_desc);
		return;
	}
}
/*
 * Reset the descriptor cache: zero both tables, then mark every slot
 * free (index/roi_index = -1) and stamp it with its row's op type.
 */
void
dla_init_op_cache(struct dla_engine *engine)
{
	int32_t op, slot;

	dla_memset((uint8_t *)&desc_cache[0][0], 0, sizeof(desc_cache));
	dla_memset((uint8_t *)&desc_refcount[0][0], 0, sizeof(desc_refcount));

	for (op = 0; op < DLA_OP_NUM; op++) {
		for (slot = 0; slot < DLA_OP_CACHE_SIZE; slot++) {
			desc_cache[op][slot].index = -1;
			desc_cache[op][slot].roi_index = -1;
			desc_cache[op][slot].op_type = (uint8_t)op;
		}
	}
}

384
drivers/nvdla/cdp.c Normal file
View file

@ -0,0 +1,384 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "common.h"
#include "dla_engine_internal.h"
#include "engine_debug.h"
/* Interface memory-type index (0 = MC/DRAM, 1 = CVSRAM) to the CDP
 * RDMA RAM-type field encoding. */
static const uint8_t map_ram[] = {
	FIELD_ENUM(CDP_RDMA_D_SRC_DMA_CFG_0, SRC_RAM_TYPE, MC),
	FIELD_ENUM(CDP_RDMA_D_SRC_DMA_CFG_0, SRC_RAM_TYPE, CV),
};
/* Interface precision index to the RDMA input-data field encoding. */
static const uint8_t map_precision[] = {
	FIELD_ENUM(CDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA, INT8),
	FIELD_ENUM(CDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA, INT16),
	FIELD_ENUM(CDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA, FP16),
};
/* Boolean (0/1) to the DMA perf-counter enable field encoding. */
static const uint8_t map_perf_dma[] = {
	FIELD_ENUM(CDP_D_PERF_ENABLE_0, DMA_EN, DISABLE),
	FIELD_ENUM(CDP_D_PERF_ENABLE_0, DMA_EN, ENABLE),
};
/* Boolean (0/1) to the LUT perf-counter enable field encoding. */
static const uint8_t map_perf_lut[] = {
	FIELD_ENUM(CDP_D_PERF_ENABLE_0, LUT_EN, DISABLE),
	FIELD_ENUM(CDP_D_PERF_ENABLE_0, LUT_EN, ENABLE),
};
#if STAT_ENABLE
/*
 * Capture CDP performance counters (write stalls and LUT hit/overflow
 * statistics) plus the wall-clock runtime for the finished group.
 */
void
dla_cdp_stat_data(struct dla_processor *processor,
		  struct dla_processor_group *group)
{
	struct dla_cdp_stat_desc *stat =
		&processor->stat_data_desc->cdp_stat;
	uint64_t now = dla_get_time_us();

	stat->write_stall = cdp_reg_read(D_PERF_WRITE_STALL);
	stat->lut_uflow = cdp_reg_read(D_PERF_LUT_UFLOW);
	stat->lut_oflow = cdp_reg_read(D_PERF_LUT_OFLOW);
	stat->lut_hybrid = cdp_reg_read(D_PERF_LUT_HYBRID);
	stat->lut_le_hit = cdp_reg_read(D_PERF_LUT_LE_HIT);
	stat->lut_lo_hit = cdp_reg_read(D_PERF_LUT_LO_HIT);
	stat->runtime = (uint32_t)(now - group->start_time);
}
/* Print the most recently captured CDP statistics. */
void
dla_cdp_dump_stat(struct dla_processor *processor)
{
	dla_debug_cdp_stats(&processor->stat_data_desc->cdp_stat);
}
#endif /* STAT_ENABLE */
/*
 * Convert the LRN local_size parameter (odd values: 3, 5, 7, 9) to the
 * hardware NORMALZ_LEN encoding (0, 1, 2, 3 respectively).
 */
static uint32_t
map_local_size(uint8_t local_size)
{
	int32_t half = ((int32_t)local_size - 1) / 2;

	return (uint32_t)(half - 1);
}
/*
 * Point both CDP sub-modules (core and RDMA) at the register group
 * that is about to be programmed.
 */
void
dla_cdp_set_producer(int32_t group_id, int32_t rdma_group_id)
{
	cdp_reg_write(S_POINTER,
			group_id << SHIFT(CDP_S_POINTER_0, PRODUCER));
	cdp_rdma_reg_write(S_POINTER,
			group_id << SHIFT(CDP_RDMA_S_POINTER_0, PRODUCER));
}
/*
 * Enable the programmed CDP operation: optionally arm the DMA/LUT
 * performance counters, then set the op-enable bit on the RDMA
 * sub-module followed by the CDP core.  Always returns 0.
 */
int
dla_cdp_enable(struct dla_processor_group *group)
{
	struct dla_engine *engine = dla_get_engine();

	dla_debug("Enter: %s\n", __func__);

	if (engine->stat_enable == (uint32_t)1) {
		uint8_t perf = (map_perf_dma[1] <<
				SHIFT(CDP_D_PERF_ENABLE_0, DMA_EN)) |
				(map_perf_lut[1] <<
				SHIFT(CDP_D_PERF_ENABLE_0, LUT_EN));

		cdp_reg_write(D_PERF_ENABLE, perf);
		group->start_time = dla_get_time_us();
	}

	/* RDMA first, then the CDP core itself. */
	cdp_rdma_reg_write(D_OP_ENABLE,
			FIELD_ENUM(CDP_RDMA_D_OP_ENABLE_0, OP_EN, ENABLE));
	cdp_reg_write(D_OP_ENABLE,
			FIELD_ENUM(CDP_D_OP_ENABLE_0, OP_EN, ENABLE));

	dla_debug("Exit: %s\n", __func__);
	RETURN(0);
}
/* CDP always reads its input through its RDMA sub-module. */
void
dla_cdp_rdma_check(struct dla_processor_group *group)
{
	group->is_rdma_needed = 1;
}
/**
 * Program one CDP (cross-channel LRN) operation into the selected
 * register group.
 *
 * Validates the surface/precision arguments, resolves input and output
 * DMA addresses from the task descriptor, optionally loads the LUT,
 * then writes the CDP RDMA registers (input cube geometry, strides,
 * source RAM type, data format) followed by the CDP core registers
 * (output address/strides, LRN length, input/output converters and
 * function bypasses).
 *
 * Returns 0 on success or ERR(INVALID_INPUT) on a bad argument.
 */
static int32_t
processor_cdp_program(struct dla_processor_group *group)
{
	int32_t ret = 0;
	uint32_t reg, high, low;
	uint64_t input_address = 0;
	uint64_t output_address = 0;
	struct dla_lut_param lut;
	struct dla_engine *engine = dla_get_engine();
	struct dla_cdp_op_desc *cdp_op;
	struct dla_cdp_surface_desc *cdp_surface;
	dla_debug("Enter: %s\n", __func__);
	cdp_op = &group->operation_desc->cdp_op;
	cdp_surface = &group->surface_desc->cdp_surface;
	/* Argument check */
	if (cdp_surface->src_data.type == DLA_MEM_HW) {
		dla_error("Invalid source memory type\n");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	if (cdp_surface->dst_data.type == DLA_MEM_HW) {
		dla_error("Invalid destination memory type\n");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	if (cdp_op->in_precision != cdp_op->out_precision) {
		dla_error("CDP does not support precision conversion\n");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	/* get the addresses from task descriptor */
	ret = dla_read_input_address(&cdp_surface->src_data,
						&input_address,
						group->op_desc->index,
						group->roi_index,
						1);
	if (ret)
		goto exit;
	dla_get_dma_cube_address(engine->driver_context,
				engine->task->task_data,
				cdp_surface->dst_data.address,
				cdp_surface->dst_data.offset,
				(void *)&output_address,
				DESTINATION_DMA);
	/* A non-negative lut_index means this op needs a LUT; fetch its
	 * parameters now, program it into hardware further below. */
	if (cdp_op->lut_index >= 0) {
		group->lut_index = cdp_op->lut_index;
		dla_read_lut(engine, cdp_op->lut_index, (void *)&lut);
		dla_debug_lut_params(&lut);
	}
	/* config CDP RDMA registers */
	/* Cube dimensions are programmed minus one. */
	reg = ((cdp_surface->src_data.width - 1)
		<< SHIFT(CDP_RDMA_D_DATA_CUBE_WIDTH_0, WIDTH));
	cdp_rdma_reg_write(D_DATA_CUBE_WIDTH, reg);
	reg = ((cdp_surface->src_data.height - 1)
		<< SHIFT(CDP_RDMA_D_DATA_CUBE_HEIGHT_0, HEIGHT));
	cdp_rdma_reg_write(D_DATA_CUBE_HEIGHT, reg);
	reg = ((cdp_surface->src_data.channel - 1)
		<< SHIFT(CDP_RDMA_D_DATA_CUBE_CHANNEL_0, CHANNEL));
	cdp_rdma_reg_write(D_DATA_CUBE_CHANNEL, reg);
	high = HIGH32BITS(input_address);
	low = LOW32BITS(input_address);
	cdp_rdma_reg_write(D_SRC_BASE_ADDR_LOW, low);
	cdp_rdma_reg_write(D_SRC_BASE_ADDR_HIGH, high);
	cdp_rdma_reg_write(D_SRC_LINE_STRIDE,
			cdp_surface->src_data.line_stride);
	cdp_rdma_reg_write(D_SRC_SURFACE_STRIDE,
			cdp_surface->src_data.surf_stride);
	reg = (map_ram[cdp_surface->src_data.type]
		<< SHIFT(CDP_RDMA_D_SRC_DMA_CFG_0, SRC_RAM_TYPE));
	cdp_rdma_reg_write(D_SRC_DMA_CFG, reg);
	reg = (map_precision[cdp_op->in_precision]
		<< SHIFT(CDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA));
	cdp_rdma_reg_write(D_DATA_FORMAT, reg);
	/* config CDP */
	if (cdp_op->lut_index >= 0)
		update_lut(CDP_S_LUT_ACCESS_CFG_0, &lut, cdp_op->in_precision);
	high = HIGH32BITS(output_address);
	low = LOW32BITS(output_address);
	cdp_reg_write(D_DST_BASE_ADDR_LOW, low);
	cdp_reg_write(D_DST_BASE_ADDR_HIGH, high);
	cdp_reg_write(D_DST_LINE_STRIDE, cdp_surface->dst_data.line_stride);
	cdp_reg_write(D_DST_SURFACE_STRIDE, cdp_surface->dst_data.surf_stride);
	reg = (map_ram[cdp_surface->dst_data.type]
		<< SHIFT(CDP_D_DST_DMA_CFG_0, DST_RAM_TYPE));
	cdp_reg_write(D_DST_DMA_CFG, reg);
	reg = (map_precision[cdp_op->in_precision]
		<< SHIFT(CDP_D_DATA_FORMAT_0, INPUT_DATA_TYPE));
	cdp_reg_write(D_DATA_FORMAT, reg);
	reg = (map_local_size(cdp_op->local_size)
		<< SHIFT(CDP_D_LRN_CFG_0, NORMALZ_LEN));
	cdp_reg_write(D_LRN_CFG, reg);
	/* Input converter: offset, scale, truncate shift. */
	reg = (cdp_op->in_cvt.offset
		<< SHIFT(CDP_D_DATIN_OFFSET_0, DATIN_OFFSET));
	cdp_reg_write(D_DATIN_OFFSET, reg);
	reg = (cdp_op->in_cvt.scale
		<< SHIFT(CDP_D_DATIN_SCALE_0, DATIN_SCALE));
	cdp_reg_write(D_DATIN_SCALE, reg);
	reg = (cdp_op->in_cvt.truncate
		<< SHIFT(CDP_D_DATIN_SHIFTER_0, DATIN_SHIFTER));
	cdp_reg_write(D_DATIN_SHIFTER, reg);
	/* Output converter: offset, scale, truncate shift. */
	reg = (cdp_op->out_cvt.offset
		<< SHIFT(CDP_D_DATOUT_OFFSET_0, DATOUT_OFFSET));
	cdp_reg_write(D_DATOUT_OFFSET, reg);
	reg = (cdp_op->out_cvt.scale
		<< SHIFT(CDP_D_DATOUT_SCALE_0, DATOUT_SCALE));
	cdp_reg_write(D_DATOUT_SCALE, reg);
	reg = (cdp_op->out_cvt.truncate
		<< SHIFT(CDP_D_DATOUT_SHIFTER_0, DATOUT_SHIFTER));
	cdp_reg_write(D_DATOUT_SHIFTER, reg);
	/* Optional bypasses for the square-sum and output-multiply
	 * stages of the LRN function. */
	reg = ((cdp_op->bypass_sqsum ?
		FIELD_ENUM(CDP_D_FUNC_BYPASS_0, SQSUM_BYPASS, ENABLE) :
		FIELD_ENUM(CDP_D_FUNC_BYPASS_0, SQSUM_BYPASS, DISABLE)) <<
		SHIFT(CDP_D_FUNC_BYPASS_0, SQSUM_BYPASS)) |
		((cdp_op->bypass_out_mul ?
		FIELD_ENUM(CDP_D_FUNC_BYPASS_0, MUL_BYPASS, ENABLE) :
		FIELD_ENUM(CDP_D_FUNC_BYPASS_0, MUL_BYPASS, DISABLE)) <<
		SHIFT(CDP_D_FUNC_BYPASS_0, MUL_BYPASS));
	cdp_reg_write(D_FUNC_BYPASS, reg);
exit:
	dla_debug("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Decide whether this CDP group may be programmed now.
 *
 * A single LUT is shared between the two CDP groups, and LUT write
 * access is locked while the CDP sub-engine is active, so programming
 * must be delayed whenever it would require reloading the LUT under
 * the other group's feet.
 */
int
dla_cdp_is_ready(struct dla_processor *processor,
		 struct dla_processor_group *group)
{
	struct dla_processor_group *other = &processor->groups[!group->id];
	struct dla_cdp_op_desc *cdp_op = &group->operation_desc->cdp_op;

	/* No LUT needed: safe to program unconditionally. */
	if (cdp_op->lut_index == -1)
		return 1;

	/*
	 * Both groups use the same LUT: even if the other group is
	 * active and the LUT is locked, it already holds the right
	 * contents.
	 */
	if (other->lut_index == cdp_op->lut_index)
		return 1;

	/*
	 * The other group has some different LUT programmed (we know
	 * ours is not -1 and not equal to it) — do not overwrite it.
	 */
	if (other->lut_index != -1)
		return 0;

	/*
	 * Different LUT than the other group: program only while that
	 * group is idle, otherwise wait for it to finish.
	 */
	return other->active ? 0 : 1;
}
/* Dump the CDP surface and operation descriptors for this group. */
void
dla_cdp_dump_config(struct dla_processor_group *group)
{
	struct dla_cdp_surface_desc *surf =
		&group->surface_desc->cdp_surface;
	struct dla_cdp_op_desc *op = &group->operation_desc->cdp_op;

	dla_debug_cdp_surface_desc(surf, group->roi_index);
	dla_debug_cdp_op_desc(op, group->roi_index);
}
/**
 * Program a CDP operation: unmask the CDP done interrupts, then write
 * the full register configuration for the group.
 *
 * Fixes vs. original: the `if (ret) goto exit;` immediately preceding
 * the exit label was a no-op and has been removed, and the debug
 * format strings were missing the trailing newline used by every
 * other Enter/Exit trace in this driver.
 *
 * Returns the result of processor_cdp_program().
 */
int
dla_cdp_program(struct dla_processor_group *group)
{
	int32_t ret;

	dla_debug("Enter: %s\n", __func__);
	dla_enable_intr(MASK(GLB_S_INTR_MASK_0, CDP_DONE_MASK1) |
			MASK(GLB_S_INTR_MASK_0, CDP_DONE_MASK0));
	ret = processor_cdp_program(group);
	dla_debug("Exit: %s\n", __func__);
	RETURN(ret);
}

324
drivers/nvdla/common.c Normal file
View file

@ -0,0 +1,324 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "common.h"
#include "dla_engine_internal.h"
/* Interface LUT method (0/1) to the LE-function field encoding. */
static const uint8_t map_lut_method[] = {
	FIELD_ENUM(CDP_S_LUT_CFG_0, LUT_LE_FUNCTION, EXPONENT),
	FIELD_ENUM(CDP_S_LUT_CFG_0, LUT_LE_FUNCTION, LINEAR)
};
/* Interface priority (0/1) to the LE/LO table-select encoding. */
static const uint8_t map_lut_out[] = {
	FIELD_ENUM(CDP_S_LUT_CFG_0, LUT_UFLOW_PRIORITY, LE),
	FIELD_ENUM(CDP_S_LUT_CFG_0, LUT_UFLOW_PRIORITY, LO)
};
/*
 * CDP and SDP expose the same LUT register block at different bases
 * and with slightly different register spacing.  Each table below
 * holds the offset of one LUT register relative to the engine's
 * LUT_ACCESS_CFG base; index 0 is CDP, index 1 is SDP (selected by
 * the is_sdp flag computed in update_lut()).
 */
static const uint16_t access_data_offset[] = {
	CDP_S_LUT_ACCESS_DATA_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_ACCESS_DATA_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t lut_cfg_offset[] = {
	CDP_S_LUT_CFG_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_CFG_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t lut_info_offset[] = {
	CDP_S_LUT_INFO_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_INFO_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t le_start_offset[] = {
	CDP_S_LUT_LE_START_LOW_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LE_START_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t le_end_offset[] = {
	CDP_S_LUT_LE_END_LOW_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LE_END_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t lo_start_offset[] = {
	CDP_S_LUT_LO_START_LOW_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LO_START_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t lo_end_offset[] = {
	CDP_S_LUT_LO_END_LOW_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LO_END_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t le_slope_scale_offset[] = {
	CDP_S_LUT_LE_SLOPE_SCALE_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LE_SLOPE_SCALE_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t le_slope_shift_offset[] = {
	CDP_S_LUT_LE_SLOPE_SHIFT_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LE_SLOPE_SHIFT_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t lo_slope_scale_offset[] = {
	CDP_S_LUT_LO_SLOPE_SCALE_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LO_SLOPE_SCALE_0 - SDP_S_LUT_ACCESS_CFG_0,
};
static const uint16_t lo_slope_shift_offset[] = {
	CDP_S_LUT_LO_SLOPE_SHIFT_0 - CDP_S_LUT_ACCESS_CFG_0,
	SDP_S_LUT_LO_SLOPE_SHIFT_0 - SDP_S_LUT_ACCESS_CFG_0,
};
/*
 * update_lut() - program an LE/LO lookup table into the CDP or SDP block.
 * @reg_base:  base address of the target block's S_LUT_ACCESS_CFG_0
 *             register; selecting SDP_S_LUT_ACCESS_CFG_0 switches all
 *             relative register offsets to the SDP variants.
 * @lut:       table entries, index configuration and out-of-range slopes.
 * @precision: PRECISION_FP16 selects the float slope encoding; any other
 *             value uses the integer scale+shift encoding.
 *
 * Streams both raw tables through the access-data window, then programs
 * function/priority configuration, index info, input ranges and the
 * out-of-range slopes.
 *
 * Fix: the first access-config write used the bare reg_write() macro
 * instead of dla_reg_write(engine->driver_context, ...) like every other
 * register access in this function (including the parallel LO-table
 * setup), bypassing the driver-context-aware accessor.
 */
void update_lut(uint32_t reg_base, struct dla_lut_param *lut,
		uint8_t precision)
{
	int32_t i;
	uint32_t reg;
	uint32_t high, low;
	/* One routine serves both blocks; the base address tells them apart
	 * and indexes the per-block offset tables above. */
	int32_t is_sdp = reg_base == SDP_S_LUT_ACCESS_CFG_0;
	struct dla_engine *engine = dla_get_engine();

	/* program raw table (LE) */
	reg = (FIELD_ENUM(CDP_S_LUT_ACCESS_CFG_0, LUT_TABLE_ID, LE)
		<< SHIFT(CDP_S_LUT_ACCESS_CFG_0, LUT_TABLE_ID)) |
		(FIELD_ENUM(CDP_S_LUT_ACCESS_CFG_0, LUT_ACCESS_TYPE, WRITE)
		<< SHIFT(CDP_S_LUT_ACCESS_CFG_0, LUT_ACCESS_TYPE));
	dla_reg_write(engine->driver_context, reg_base, reg);
	for (i = 0; i < (1<<LUT_LINEAR_EXP_TABLE_ENTRY_LOG2)+1; i++) {
		dla_reg_write(engine->driver_context,
				reg_base + access_data_offset[is_sdp],
				lut->linear_exp_table[i]);
	}

	/* program density table (LO) */
	reg = (FIELD_ENUM(CDP_S_LUT_ACCESS_CFG_0, LUT_TABLE_ID, LO)
		<< SHIFT(CDP_S_LUT_ACCESS_CFG_0, LUT_TABLE_ID)) |
		(FIELD_ENUM(CDP_S_LUT_ACCESS_CFG_0, LUT_ACCESS_TYPE, WRITE)
		<< SHIFT(CDP_S_LUT_ACCESS_CFG_0, LUT_ACCESS_TYPE));
	dla_reg_write(engine->driver_context, reg_base, reg);
	for (i = 0; i < (1<<LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2)+1; i++) {
		dla_reg_write(engine->driver_context,
				reg_base + access_data_offset[is_sdp],
				lut->linear_only_table[i]);
	}

	/* program other configurations: LE function and table priorities */
	reg = (map_lut_method[lut->method] <<
			SHIFT(CDP_S_LUT_CFG_0, LUT_LE_FUNCTION)) |
		(map_lut_out[lut->hybrid_priority] <<
			SHIFT(CDP_S_LUT_CFG_0, LUT_HYBRID_PRIORITY)) |
		(map_lut_out[lut->underflow_priority] <<
			SHIFT(CDP_S_LUT_CFG_0, LUT_UFLOW_PRIORITY)) |
		(map_lut_out[lut->overflow_priority] <<
			SHIFT(CDP_S_LUT_CFG_0, LUT_OFLOW_PRIORITY));
	dla_reg_write(engine->driver_context,
			reg_base + lut_cfg_offset[is_sdp], reg);

	/* Index info: an EXPONENT LE function takes an index offset,
	 * otherwise a fractional-bits select is programmed. */
	if (lut->method == FIELD_ENUM(CDP_S_LUT_CFG_0,
					LUT_LE_FUNCTION, EXPONENT)) {
		reg = ((((uint32_t)lut->linear_exp_offset.exp_offset) <<
			SHIFT(CDP_S_LUT_INFO_0, LUT_LE_INDEX_OFFSET))&
			MASK(CDP_S_LUT_INFO_0, LUT_LE_INDEX_OFFSET)) |
			((((uint32_t)lut->linear_only_offset.frac_bits) <<
			SHIFT(CDP_S_LUT_INFO_0, LUT_LO_INDEX_SELECT))&
			MASK(CDP_S_LUT_INFO_0, LUT_LO_INDEX_SELECT));
	} else {
		reg = ((((uint32_t)lut->linear_exp_offset.frac_bits) <<
			SHIFT(CDP_S_LUT_INFO_0, LUT_LE_INDEX_SELECT))&
			MASK(CDP_S_LUT_INFO_0, LUT_LE_INDEX_SELECT)) |
			((((uint32_t)lut->linear_only_offset.frac_bits) <<
			SHIFT(CDP_S_LUT_INFO_0, LUT_LO_INDEX_SELECT))&
			MASK(CDP_S_LUT_INFO_0, LUT_LO_INDEX_SELECT));
	}
	dla_reg_write(engine->driver_context,
			reg_base + lut_info_offset[is_sdp], reg);

	/* Input ranges. The CDP registers are 64-bit LOW/HIGH pairs; the
	 * SDP ones are 32-bit only, so the high word is skipped there. */
	high = HIGH32BITS(lut->linear_exp_start);
	low = LOW32BITS(lut->linear_exp_start);
	dla_reg_write(engine->driver_context,
			reg_base + le_start_offset[is_sdp], low);
	if (!is_sdp)
		dla_reg_write(engine->driver_context,
				reg_base + le_start_offset[is_sdp] + 4, high);

	high = HIGH32BITS(lut->linear_exp_end);
	low = LOW32BITS(lut->linear_exp_end);
	dla_reg_write(engine->driver_context,
			reg_base + le_end_offset[is_sdp], low);
	if (!is_sdp)
		dla_reg_write(engine->driver_context,
				reg_base + le_end_offset[is_sdp] + 4, high);

	high = HIGH32BITS(lut->linear_only_start);
	low = LOW32BITS(lut->linear_only_start);
	dla_reg_write(engine->driver_context,
			reg_base + lo_start_offset[is_sdp], low);
	if (!is_sdp)
		dla_reg_write(engine->driver_context,
				reg_base + lo_start_offset[is_sdp] + 4, high);

	high = HIGH32BITS(lut->linear_only_end);
	low = LOW32BITS(lut->linear_only_end);
	dla_reg_write(engine->driver_context,
			reg_base + lo_end_offset[is_sdp], low);
	if (!is_sdp)
		dla_reg_write(engine->driver_context,
				reg_base + lo_end_offset[is_sdp] + 4, high);

	/* Out-of-range slopes: FP16 packs the raw float bits, everything
	 * else packs integer scale + shifter pairs. */
	if (precision == PRECISION_FP16) {
		reg = (lut->linear_exp_underflow_slope.data_f <<
			SHIFT(CDP_S_LUT_LE_SLOPE_SCALE_0,
				LUT_LE_SLOPE_UFLOW_SCALE)) |
			(lut->linear_exp_overflow_slope.data_f <<
			SHIFT(CDP_S_LUT_LE_SLOPE_SCALE_0,
				LUT_LE_SLOPE_OFLOW_SCALE));
		dla_reg_write(engine->driver_context,
				reg_base + le_slope_scale_offset[is_sdp], reg);
		reg = (lut->linear_only_underflow_slope.data_f <<
			SHIFT(CDP_S_LUT_LO_SLOPE_SCALE_0,
				LUT_LO_SLOPE_UFLOW_SCALE)) |
			(lut->linear_only_overflow_slope.data_f <<
			SHIFT(CDP_S_LUT_LO_SLOPE_SCALE_0,
				LUT_LO_SLOPE_OFLOW_SCALE));
		dla_reg_write(engine->driver_context,
				reg_base + lo_slope_scale_offset[is_sdp], reg);
	} else {
		union dla_slope *oslope;
		union dla_slope *uslope;

		uslope = &lut->linear_exp_underflow_slope;
		oslope = &lut->linear_exp_overflow_slope;
		reg = ((((uint32_t)uslope->data_i.scale)
			<< SHIFT(CDP_S_LUT_LE_SLOPE_SCALE_0,
				LUT_LE_SLOPE_UFLOW_SCALE))&
			MASK(CDP_S_LUT_LE_SLOPE_SCALE_0,
				LUT_LE_SLOPE_UFLOW_SCALE)) |
			((((uint32_t)oslope->data_i.scale)
			<< SHIFT(CDP_S_LUT_LE_SLOPE_SCALE_0,
				LUT_LE_SLOPE_OFLOW_SCALE))&
			MASK(CDP_S_LUT_LE_SLOPE_SCALE_0,
				LUT_LE_SLOPE_OFLOW_SCALE));
		dla_reg_write(engine->driver_context,
				reg_base + le_slope_scale_offset[is_sdp], reg);
		reg = ((((uint32_t)uslope->data_i.shifter) <<
			SHIFT(CDP_S_LUT_LE_SLOPE_SHIFT_0,
				LUT_LE_SLOPE_UFLOW_SHIFT))&
			MASK(CDP_S_LUT_LE_SLOPE_SHIFT_0,
				LUT_LE_SLOPE_UFLOW_SHIFT)) |
			((((uint32_t)oslope->data_i.shifter) <<
			SHIFT(CDP_S_LUT_LE_SLOPE_SHIFT_0,
				LUT_LE_SLOPE_OFLOW_SHIFT))&
			MASK(CDP_S_LUT_LE_SLOPE_SHIFT_0,
				LUT_LE_SLOPE_OFLOW_SHIFT));
		dla_reg_write(engine->driver_context,
				reg_base + le_slope_shift_offset[is_sdp], reg);

		uslope = &lut->linear_only_underflow_slope;
		oslope = &lut->linear_only_overflow_slope;
		reg = ((((uint32_t)uslope->data_i.scale) <<
			SHIFT(CDP_S_LUT_LO_SLOPE_SCALE_0,
				LUT_LO_SLOPE_UFLOW_SCALE))&
			MASK(CDP_S_LUT_LO_SLOPE_SCALE_0,
				LUT_LO_SLOPE_UFLOW_SCALE)) |
			((((uint32_t)oslope->data_i.scale) <<
			SHIFT(CDP_S_LUT_LO_SLOPE_SCALE_0,
				LUT_LO_SLOPE_OFLOW_SCALE))&
			MASK(CDP_S_LUT_LO_SLOPE_SCALE_0,
				LUT_LO_SLOPE_OFLOW_SCALE));
		dla_reg_write(engine->driver_context,
				reg_base + lo_slope_scale_offset[is_sdp], reg);
		reg = ((((uint32_t)uslope->data_i.shifter) <<
			SHIFT(CDP_S_LUT_LO_SLOPE_SHIFT_0,
				LUT_LO_SLOPE_UFLOW_SHIFT))&
			MASK(CDP_S_LUT_LO_SLOPE_SHIFT_0,
				LUT_LO_SLOPE_UFLOW_SHIFT)) |
			((((uint32_t)oslope->data_i.shifter) <<
			SHIFT(CDP_S_LUT_LO_SLOPE_SHIFT_0,
				LUT_LO_SLOPE_OFLOW_SHIFT))&
			MASK(CDP_S_LUT_LO_SLOPE_SHIFT_0,
				LUT_LO_SLOPE_OFLOW_SHIFT));
		dla_reg_write(engine->driver_context,
				reg_base + lo_slope_shift_offset[is_sdp], reg);
	}
}
/*
 * validate_data_cube() - sanity-check a source/destination cube pair.
 * @src_data_cube: source cube (dimensions and memory type are checked).
 * @dst_data_cube: destination cube (same checks).
 * @mem_type:      highest memory type value allowed for either cube.
 *
 * Returns 0 on success or ERR(INVALID_INPUT) when any dimension exceeds
 * the DCUBE_MAX_* limits or a cube's memory type exceeds @mem_type.
 *
 * Fix: error messages spelled "Cube" as "Cude" and were missing the
 * trailing newline used by the other dla_error() calls in this function.
 */
int
validate_data_cube(struct dla_data_cube src_data_cube,
			struct dla_data_cube dst_data_cube,
			uint8_t mem_type)
{
	int32_t ret = 0;

	dla_trace("Enter: %s", __func__);

	if ((src_data_cube.width > DCUBE_MAX_WIDTH) ||
		(src_data_cube.height > DCUBE_MAX_HEIGHT) ||
		(src_data_cube.channel > DCUBE_MAX_CHANNEL)) {
		dla_error("Invalid SrcInput Cube[W: %u, H: %u, C: %u]\n",
				src_data_cube.width, src_data_cube.height,
				src_data_cube.channel);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

	if ((dst_data_cube.width > DCUBE_MAX_WIDTH) ||
		(dst_data_cube.height > DCUBE_MAX_HEIGHT) ||
		(dst_data_cube.channel > DCUBE_MAX_CHANNEL)) {
		dla_error("Invalid DstInput Cube[W: %u, H: %u, C: %u]\n",
				dst_data_cube.width, dst_data_cube.height,
				dst_data_cube.channel);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

	if (src_data_cube.type > mem_type) {
		dla_error("Invalid src_data.mem_type: %u\n", src_data_cube.type);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

	if (dst_data_cube.type > mem_type) {
		dla_error("Invalid dst_data.mem_type: %u\n", dst_data_cube.type);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Check that a precision selector is within the valid mapping range.
 * @precision:     requested precision value.
 * @map_precision: number of valid precision entries (exclusive bound).
 *
 * Returns 0 when @precision < @map_precision, ERR(INVALID_INPUT)
 * otherwise (and logs the offending value).
 */
int
validate_precision(uint8_t precision, uint8_t map_precision)
{
	int32_t err;

	err = (precision < map_precision) ? 0 : ERR(INVALID_INPUT);
	if (err)
		dla_error("Invalid precision: %u\n", precision);
	RETURN(err);
}

47
drivers/nvdla/common.h Normal file
View file

@ -0,0 +1,47 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Firmware helpers shared by the LUT-capable sub-engines (CDP/SDP) and
 * the data-cube validators used across processors. */
#ifndef __FIRMWARE_COMMON_H_
#define __FIRMWARE_COMMON_H_
#include <dla_interface.h>
/* Maximum cube dimensions accepted by validate_data_cube(). */
#define DCUBE_MAX_WIDTH 8192
#define DCUBE_MAX_HEIGHT 8192
#define DCUBE_MAX_CHANNEL 8192
/* Program an LE/LO lookup table into the block whose
 * S_LUT_ACCESS_CFG_0 register sits at reg_base. */
void update_lut(uint32_t reg_base,
		struct dla_lut_param *lut,
		uint8_t precision);
/* NOTE(review): the definitions in common.c use plain `int` while the
 * prototypes here say int32_t — identical on Linux targets, but the
 * types should be unified. */
int32_t validate_data_cube(struct dla_data_cube src_data_cube,
				struct dla_data_cube dst_data_cube,
				uint8_t mem_type);
int32_t validate_precision(uint8_t precision,
				uint8_t map_precision);
#endif /* __FIRMWARE_COMMON_H_ */

779
drivers/nvdla/conv.c Normal file
View file

@ -0,0 +1,779 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "common.h"
#include "dla_engine_internal.h"
#include "engine_debug.h"
/*
 * Translation tables from the interface-level convolution descriptor
 * enums to the hardware register field encodings. Each table is indexed
 * by the corresponding dla_conv_op_desc field.
 */

/* Operand precision (INT8/INT16/FP16) -> register field encoding. */
static const uint8_t map_precision[] = {
	FIELD_ENUM(CDMA_D_MISC_CFG_0, IN_PRECISION, INT8),
	FIELD_ENUM(CDMA_D_MISC_CFG_0, IN_PRECISION, INT16),
	FIELD_ENUM(CDMA_D_MISC_CFG_0, IN_PRECISION, FP16),
};
/* Convolution mode (direct vs Winograd). */
static const uint8_t map_conv[] = {
	FIELD_ENUM(CACC_D_MISC_CFG_0, CONV_MODE, DIRECT),
	FIELD_ENUM(CACC_D_MISC_CFG_0, CONV_MODE, WINOGRAD),
};
/* Weight storage format. */
static const uint8_t map_weight_fmt[] = {
	FIELD_ENUM(CSC_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT, UNCOMPRESSED),
	FIELD_ENUM(CSC_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT, COMPRESSED),
};
/* Input data format. [0] = PIXEL_FORMAT register encoding,
 * [1] = per-element byte width — presumably bytes per pixel, it is
 * passed to dla_read_input_address() in processor_conv_program();
 * confirm against the address-resolution helper. */
static const uint8_t map_img_fmt[][2] = {
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R8), 1},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R10), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R12), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R16), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R16_I), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R16_F), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A16B16G16R16), 8},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_X16B16G16R16), 8},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A16B16G16R16_F), 8},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A16Y16U16V16), 8},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_V16U16Y16A16), 8},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A16Y16U16V16_F), 8},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A8B8G8R8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A8R8G8B8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_B8G8R8A8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R8G8B8A8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_X8B8G8R8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_X8R8G8B8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_B8G8R8X8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R8G8B8X8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A2B10G10R10), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A2R10G10B10), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_B10G10R10A2), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_R10G10B10A2), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A2Y10U10V10), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_V10U10Y10A2), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_A8Y8U8V8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_V8U8Y8A8), 4},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y8___U8V8_N444), 1},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y8___V8U8_N444), 1},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y10___U10V10_N444), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y10___V10U10_N444), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y12___U12V12_N444), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y12___V12U12_N444), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y16___U16V16_N444), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		PIXEL_FORMAT, T_Y16___V16U16_N444), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		DATAIN_FORMAT, FEATURE), 2},
	{FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
		DATAIN_FORMAT, PIXEL), 1},
};
/* Pixel-to-memory mapping (only pitch-linear is supported here). */
static const uint8_t map_pixel[] = {
	FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, PIXEL_MAPPING, PITCH_LINEAR),
};
/* Source RAM interface selector (external MCIF vs on-chip CVIF). */
static const uint8_t map_ram[] = {
	FIELD_ENUM(CDMA_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE, MCIF),
	FIELD_ENUM(CDMA_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE, CVIF),
};
/* Global mean-subtraction enable. */
static const uint8_t map_mean[] = {
	FIELD_ENUM(CDMA_D_MEAN_FORMAT_0, MEAN_FORMAT, DISABLE),
	FIELD_ENUM(CDMA_D_MEAN_FORMAT_0, MEAN_FORMAT, ENABLE),
};
#if STAT_ENABLE
/*
 * Capture convolution performance counters into the processor's stat
 * descriptor after an operation completes: read stalls/latencies and
 * NaN/Inf input counts from CDMA, the saturation count from CACC, and
 * the wall-clock runtime since dla_conv_enable() recorded start_time.
 */
void
dla_conv_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group)
{
	uint64_t end_time = 0;
	struct dla_conv_stat_desc *conv_stat;

	conv_stat = &processor->stat_data_desc->conv_stat;
	end_time = dla_get_time_us();
	conv_stat->data_read_stall = cdma_reg_read(D_PERF_DAT_READ_STALL);
	conv_stat->weight_read_stall = cdma_reg_read(D_PERF_WT_READ_STALL);
	conv_stat->data_read_latency = cdma_reg_read(D_PERF_DAT_READ_LATENCY);
	conv_stat->weight_read_latency = cdma_reg_read(D_PERF_WT_READ_LATENCY);
	conv_stat->nan_data_num = cdma_reg_read(D_NAN_INPUT_DATA_NUM);
	conv_stat->nan_weight_num = cdma_reg_read(D_NAN_INPUT_WEIGHT_NUM);
	conv_stat->inf_data_num = cdma_reg_read(D_INF_INPUT_DATA_NUM);
	conv_stat->inf_weight_num = cdma_reg_read(D_INF_INPUT_WEIGHT_NUM);
	conv_stat->saturation_count = cacc_reg_read(D_OUT_SATURATION);
	conv_stat->runtime = (uint32_t)(end_time - group->start_time);
}

/* Dump the most recently captured convolution statistics via the
 * debug helper. */
void
dla_conv_dump_stat(struct dla_processor *processor)
{
	struct dla_conv_stat_desc *conv_stat;

	conv_stat = &processor->stat_data_desc->conv_stat;
	dla_debug_conv_stats(conv_stat);
}
#endif /* STAT_ENABLE */
/*
 * Translate an interface-level data format into the CDMA DATAIN_FORMAT
 * field: FORMAT_FEATURE selects the feature-data path, anything in
 * [FORMAT_T_R8, FORMAT_FEATURE) is a pixel format. Any other value is
 * a programming error (asserts).
 */
static uint32_t
get_in_format(uint8_t format)
{
	uint32_t in_format = 0;

	if (format == FORMAT_FEATURE)
		in_format = FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
					DATAIN_FORMAT, FEATURE);
	else if (format >= FORMAT_T_R8 && format < FORMAT_FEATURE)
		in_format = FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0,
					DATAIN_FORMAT, PIXEL);
	else
		assert(0);

	return in_format;
}
/*
 * Select the active register group ("producer" pointer) that subsequent
 * programming writes will target, on every convolution sub-module.
 * @group_id:      register group index (the hardware ping-pongs between
 *                 two groups selected by this pointer).
 * @rdma_group_id: unused here — the conv pipeline reports no RDMA need
 *                 (see dla_conv_rdma_check()).
 */
void
dla_conv_set_producer(int32_t group_id, int32_t rdma_group_id)
{
	uint32_t reg;

	/* set producer pointer for all sub-modules */
	reg = group_id << SHIFT(CACC_S_POINTER_0, PRODUCER);
	cacc_reg_write(S_POINTER, reg);
	cmac_a_reg_write(S_POINTER, reg);
	cmac_b_reg_write(S_POINTER, reg);
	csc_reg_write(S_POINTER, reg);
	cdma_reg_write(S_POINTER, reg);
}
/*
 * Kick off a previously programmed convolution operation.
 *
 * Busy-waits until the convolution-buffer flush reports done, arms the
 * CDMA performance counters when stats are enabled (recording the start
 * timestamp consumed by dla_conv_stat_data()), then sets OP_EN on every
 * sub-module. Returns 0.
 *
 * NOTE(review): sub-modules are enabled back-to-front (CACC first, CDMA
 * last) — presumably so no stage starts producing before its consumer
 * is ready; confirm against the hardware documentation before reordering.
 */
int
dla_conv_enable(struct dla_processor_group *group)
{
	uint32_t reg;
	struct dla_engine *engine = dla_get_engine();

	dla_trace("Enter: %s", __func__);

	/* spin until the CBUF flush has completed */
	do {
		reg = cdma_reg_read(S_CBUF_FLUSH_STATUS);
	} while (!(reg & MASK(CDMA_S_CBUF_FLUSH_STATUS_0, FLUSH_DONE)));

	if (engine->stat_enable == (uint32_t)1) {
		cdma_reg_write(D_PERF_ENABLE, 1);
		group->start_time = dla_get_time_us();
	}

	/* enable all sub-modules */
	reg = FIELD_ENUM(CACC_D_OP_ENABLE_0, OP_EN, ENABLE);
	cacc_reg_write(D_OP_ENABLE, reg);
	cmac_a_reg_write(D_OP_ENABLE, reg);
	cmac_b_reg_write(D_OP_ENABLE, reg);
	csc_reg_write(D_OP_ENABLE, reg);
	cdma_reg_write(D_OP_ENABLE, reg);

	dla_trace("Exit: %s", __func__);
	RETURN(0);
}
/*
 * Report whether this processor needs a separate RDMA programming pass.
 * The convolution pipeline fetches its own data through CDMA, so no
 * external RDMA engine is ever required.
 */
void
dla_conv_rdma_check(struct dla_processor_group *group)
{
	group->is_rdma_needed = 0;
}
/*
 * Program every convolution-core sub-module (CACC, CMAC A/B, CSC, CDMA)
 * for one operation described by the group's conv descriptors.
 *
 * Steps: resolve and alignment-check all DMA addresses (weights,
 * optional compressed-weight WMB/WGS side tables, input, output),
 * validate the output converter, verify the target register group of
 * each sub-module is idle, then write the configuration registers
 * back-to-front through the pipeline.
 *
 * Returns 0 on success or ERR(INVALID_INPUT) on any failed check
 * (via the ASSERT_GOTO/CHECK_ALIGN macros, which jump to exit).
 */
static int32_t
processor_conv_program(struct dla_processor_group *group)
{
	int32_t ret = 0;
	uint32_t reg, high, low, shift, mask;
	uint32_t stride_x, stride_y, pad_x, pad_y;
	uint64_t weight_address = 0;
	uint64_t wmb_address = 0;
	uint64_t wgs_address = 0;
	uint64_t input_address = 0;
	uint64_t output_address = 0;
	uint32_t atom_size = 0;
	bool weight_compress_support = false;
	struct dla_engine *engine = dla_get_engine();
	struct dla_conv_op_desc *conv_op;
	struct dla_conv_surface_desc *conv_surface;

	dla_trace("Enter: %s", __func__);

	weight_compress_support = engine->config_data->weight_compress_support;
	atom_size = engine->config_data->atom_size;
	conv_op = &group->operation_desc->conv_op;
	conv_surface = &group->surface_desc->conv_surface;

	/* Compressed weights need the WMB (mask) and WGS (group size)
	 * side tables; resolve and alignment-check both. */
	if (conv_op->weight_format == WEIGHT_FORMAT_COMPRESSED) {
		ASSERT_GOTO((weight_compress_support), ret, ERR(INVALID_INPUT), exit);
		ASSERT_GOTO((conv_surface->wmb_data.address != -1),
				ret, ERR(INVALID_INPUT), exit);
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->wmb_data.address,
					conv_surface->wmb_data.offset,
					(void *)&wmb_address,
					DESTINATION_DMA);
		CHECK_ALIGN(wmb_address, atom_size);
		CHECK_ALIGN(conv_surface->wmb_data.size, 128);

		ASSERT_GOTO((conv_surface->wgs_data.address != -1),
				ret, ERR(INVALID_INPUT), exit);
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->wgs_data.address,
					conv_surface->wgs_data.offset,
					(void *)&wgs_address,
					DESTINATION_DMA);
		CHECK_ALIGN(wgs_address, atom_size);
		CHECK_ALIGN(conv_surface->wgs_data.size, 4);
	}

	/* Weight cube (address -1 means "not present"). */
	if (conv_surface->weight_data.address != -1) {
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->weight_data.address,
					conv_surface->weight_data.offset,
					(void *)&weight_address,
					DESTINATION_DMA);
		CHECK_ALIGN(weight_address, atom_size);
		CHECK_ALIGN(conv_surface->weight_data.size, 128);
	}

	/* Output cube: address plus all strides must be atom-aligned. */
	if (conv_surface->dst_data.address != -1) {
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->dst_data.address,
					conv_surface->dst_data.offset,
					(void *)&output_address,
					DESTINATION_DMA);
		CHECK_ALIGN(output_address, atom_size);
		CHECK_ALIGN(conv_surface->dst_data.size, atom_size);
		CHECK_ALIGN(conv_surface->dst_data.line_stride, atom_size);
		CHECK_ALIGN(conv_surface->dst_data.surf_stride, atom_size);
	}

	/* Input address, resolved with the per-pixel byte width from
	 * map_img_fmt. */
	ret = dla_read_input_address(&conv_surface->src_data, &input_address,
					group->op_desc->index,
					group->roi_index,
					map_img_fmt[conv_op->data_format][1]);
	if (ret)
		goto exit;
	CHECK_ALIGN(input_address, atom_size);

	/* Output converter must be pass-through (scale 1, offset 0). */
	ASSERT_GOTO((conv_op->out_cvt.scale == 1),
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO((conv_op->out_cvt.offset == 0),
			ret, ERR(INVALID_INPUT), exit);

	/* check if the register group is idle in every sub-module */
	reg = cacc_reg_read(S_STATUS);
	mask = group->id ? MASK(CACC_S_STATUS_0, STATUS_1) :
				MASK(CACC_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CACC_S_STATUS_0, STATUS_1) :
				SHIFT(CACC_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CACC_S_STATUS_0, STATUS_0, IDLE)),
			ret, ERR(INVALID_INPUT), exit);

	reg = cmac_a_reg_read(S_STATUS);
	mask = group->id ? MASK(CMAC_A_S_STATUS_0, STATUS_1) :
				MASK(CMAC_A_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CMAC_A_S_STATUS_0, STATUS_1) :
				SHIFT(CMAC_A_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CMAC_A_S_STATUS_0, STATUS_0, IDLE)),
			ret, ERR(INVALID_INPUT), exit);

	reg = cmac_b_reg_read(S_STATUS);
	mask = group->id ? MASK(CMAC_B_S_STATUS_0, STATUS_1) :
				MASK(CMAC_B_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CMAC_B_S_STATUS_0, STATUS_1) :
				SHIFT(CMAC_B_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CMAC_B_S_STATUS_0, STATUS_0, IDLE)),
			ret, ERR(INVALID_INPUT), exit);

	reg = csc_reg_read(S_STATUS);
	mask = group->id ? MASK(CSC_S_STATUS_0, STATUS_1) :
				MASK(CSC_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CSC_S_STATUS_0, STATUS_1) :
				SHIFT(CSC_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CSC_S_STATUS_0, STATUS_0, IDLE)),
			ret, ERR(INVALID_INPUT), exit);

	reg = cdma_reg_read(S_STATUS);
	mask = group->id ? MASK(CDMA_S_STATUS_0, STATUS_1) :
				MASK(CDMA_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CDMA_S_STATUS_0, STATUS_1) :
				SHIFT(CDMA_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CDMA_S_STATUS_0, STATUS_0, IDLE)),
			ret, ERR(INVALID_INPUT), exit);

	/* reverse config each sub-module in CC (back of pipeline first) */
	/* CACC */
	reg = (map_conv[conv_op->conv_mode]
		<< SHIFT(CACC_D_MISC_CFG_0, CONV_MODE)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CACC_D_MISC_CFG_0, PROC_PRECISION));
	cacc_reg_write(D_MISC_CFG, reg);
	reg = ((conv_surface->dst_data.width - 1)
		<< SHIFT(CACC_D_DATAOUT_SIZE_0_0, DATAOUT_WIDTH)) |
		((conv_surface->dst_data.height - 1)
		<< SHIFT(CACC_D_DATAOUT_SIZE_0_0, DATAOUT_HEIGHT));
	cacc_reg_write(D_DATAOUT_SIZE_0, reg);
	reg = ((conv_surface->dst_data.channel - 1)
		<< SHIFT(CACC_D_DATAOUT_SIZE_1_0, DATAOUT_CHANNEL));
	cacc_reg_write(D_DATAOUT_SIZE_1, reg);
	low = LOW32BITS(output_address);
	cacc_reg_write(D_DATAOUT_ADDR, low);
	cacc_reg_write(D_BATCH_NUMBER, conv_op->batch - 1);
	cacc_reg_write(D_LINE_STRIDE, conv_surface->dst_data.line_stride);
	cacc_reg_write(D_SURF_STRIDE, conv_surface->dst_data.surf_stride);
	/* 1x1 output can use the packed layout; otherwise unpacked. */
	if (conv_surface->dst_data.width == 1 &&
		conv_surface->dst_data.height == 1) {
		ASSERT_GOTO((((uint32_t)conv_surface->dst_data.line_stride ==
			(uint32_t)(conv_surface->dst_data.width * atom_size))),
			ret, ERR(INVALID_INPUT), exit);
		reg = (CACC_D_DATAOUT_MAP_0_LINE_PACKED_TRUE <<
			SHIFT(CACC_D_DATAOUT_MAP_0, LINE_PACKED));
		reg |= (CACC_D_DATAOUT_MAP_0_SURF_PACKED_TRUE <<
			SHIFT(CACC_D_DATAOUT_MAP_0, SURF_PACKED));
	} else {
		reg = (FIELD_ENUM(CACC_D_DATAOUT_MAP_0, LINE_PACKED, FALSE) <<
			SHIFT(CACC_D_DATAOUT_MAP_0, LINE_PACKED));
		reg |= (FIELD_ENUM(CACC_D_DATAOUT_MAP_0, SURF_PACKED, FALSE) <<
			SHIFT(CACC_D_DATAOUT_MAP_0, SURF_PACKED));
	}
	cacc_reg_write(D_DATAOUT_MAP, reg);
	cacc_reg_write(D_CLIP_CFG, conv_op->out_cvt.truncate);

	/* CMAC (both halves get identical configuration) */
	reg = (map_conv[conv_op->conv_mode]
		<< SHIFT(CMAC_A_D_MISC_CFG_0, CONV_MODE)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CMAC_A_D_MISC_CFG_0, PROC_PRECISION));
	cmac_a_reg_write(D_MISC_CFG, reg);
	cmac_b_reg_write(D_MISC_CFG, reg);

	/* CSC */
	reg = (map_conv[conv_op->conv_mode]
		<< SHIFT(CSC_D_MISC_CFG_0, CONV_MODE)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CSC_D_MISC_CFG_0, IN_PRECISION)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CSC_D_MISC_CFG_0, PROC_PRECISION)) |
		(conv_op->data_reuse
		<< SHIFT(CSC_D_MISC_CFG_0, DATA_REUSE)) |
		(conv_op->weight_reuse
		<< SHIFT(CSC_D_MISC_CFG_0, WEIGHT_REUSE)) |
		(conv_op->skip_data_rls
		<< SHIFT(CSC_D_MISC_CFG_0, SKIP_DATA_RLS)) |
		(conv_op->skip_weight_rls
		<< SHIFT(CSC_D_MISC_CFG_0, SKIP_WEIGHT_RLS));
	csc_reg_write(D_MISC_CFG, reg);
	reg = (get_in_format(conv_op->data_format) <<
		SHIFT(CSC_D_DATAIN_FORMAT_0, DATAIN_FORMAT));
	csc_reg_write(D_DATAIN_FORMAT, reg);
	reg = ((conv_op->input_width_csc - 1)
		<< SHIFT(CSC_D_DATAIN_SIZE_EXT_0_0, DATAIN_WIDTH_EXT)) |
		((conv_op->input_height_csc - 1)
		<< SHIFT(CSC_D_DATAIN_SIZE_EXT_0_0, DATAIN_HEIGHT_EXT));
	csc_reg_write(D_DATAIN_SIZE_EXT_0, reg);
	reg = ((conv_op->input_channel_csc - 1)
		<< SHIFT(CSC_D_DATAIN_SIZE_EXT_1_0, DATAIN_CHANNEL_EXT));
	csc_reg_write(D_DATAIN_SIZE_EXT_1, reg);
	reg = ((conv_op->batch - 1)
		<< SHIFT(CSC_D_BATCH_NUMBER_0, BATCHES));
	csc_reg_write(D_BATCH_NUMBER, reg);
	reg = ((conv_op->post_extension)
		<< SHIFT(CSC_D_POST_Y_EXTENSION_0, Y_EXTENSION));
	csc_reg_write(D_POST_Y_EXTENSION, reg);
	reg = ((conv_op->entry_per_slice - 1)
		<< SHIFT(CSC_D_ENTRY_PER_SLICE_0, ENTRIES));
	csc_reg_write(D_ENTRY_PER_SLICE, reg);
	reg = (map_weight_fmt[conv_op->weight_format]
		<< SHIFT(CSC_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT));
	csc_reg_write(D_WEIGHT_FORMAT, reg);
	reg = ((conv_op->kernel_width_csc - 1)
		<< SHIFT(CSC_D_WEIGHT_SIZE_EXT_0_0, WEIGHT_WIDTH_EXT)) |
		((conv_op->kernel_height_csc - 1)
		<< SHIFT(CSC_D_WEIGHT_SIZE_EXT_0_0, WEIGHT_HEIGHT_EXT));
	csc_reg_write(D_WEIGHT_SIZE_EXT_0, reg);
	reg = ((conv_op->kernel_channel_csc - 1)
		<< SHIFT(CSC_D_WEIGHT_SIZE_EXT_1_0, WEIGHT_CHANNEL_EXT)) |
		((conv_surface->dst_data.channel - 1)
		<< SHIFT(CSC_D_WEIGHT_SIZE_EXT_1_0, WEIGHT_KERNEL));
	csc_reg_write(D_WEIGHT_SIZE_EXT_1, reg);
	csc_reg_write(D_WEIGHT_BYTES, conv_surface->weight_data.size);
	csc_reg_write(D_WMB_BYTES, conv_surface->wmb_data.size);
	reg = ((conv_op->input_width_cmac - 1)
		<< SHIFT(CSC_D_DATAOUT_SIZE_0_0, DATAOUT_WIDTH)) |
		((conv_op->input_height_cmac - 1)
		<< SHIFT(CSC_D_DATAOUT_SIZE_0_0, DATAOUT_HEIGHT));
	csc_reg_write(D_DATAOUT_SIZE_0, reg);
	reg = ((conv_surface->dst_data.channel - 1)
		<< SHIFT(CSC_D_DATAOUT_SIZE_1_0, DATAOUT_CHANNEL));
	csc_reg_write(D_DATAOUT_SIZE_1, reg);
	reg = ((conv_surface->dst_data.width *
		conv_surface->dst_data.height - 1)
		<< SHIFT(CSC_D_ATOMICS_0, ATOMICS));
	csc_reg_write(D_ATOMICS, reg);
	reg = ((conv_op->release - 1)
		<< SHIFT(CSC_D_RELEASE_0, RLS_SLICES));
	csc_reg_write(D_RELEASE, reg);
	/* Winograd mode programs zero stride/padding in CSC — presumably
	 * handled earlier in the pipeline; confirm against HW docs. */
	if (conv_op->conv_mode == CONV_MODE_DIRECT) {
		stride_x = conv_op->conv_stride_x - 1;
		stride_y = conv_op->conv_stride_y - 1;
		pad_x = conv_op->pad_x_left;
		pad_y = conv_op->pad_y_top;
	} else {
		stride_x = 0;
		stride_y = 0;
		pad_x = 0;
		pad_y = 0;
	}
	reg = (stride_x
		<< SHIFT(CSC_D_CONV_STRIDE_EXT_0, CONV_X_STRIDE_EXT)) |
		(stride_y
		<< SHIFT(CSC_D_CONV_STRIDE_EXT_0, CONV_Y_STRIDE_EXT));
	csc_reg_write(D_CONV_STRIDE_EXT, reg);
	reg = ((conv_op->dilation_x - 1)
		<< SHIFT(CSC_D_DILATION_EXT_0, X_DILATION_EXT)) |
		((conv_op->dilation_y - 1)
		<< SHIFT(CSC_D_DILATION_EXT_0, Y_DILATION_EXT));
	csc_reg_write(D_DILATION_EXT, reg);
	reg = (pad_x
		<< SHIFT(CSC_D_ZERO_PADDING_0, PAD_LEFT)) |
		(pad_y
		<< SHIFT(CSC_D_ZERO_PADDING_0, PAD_TOP));
	csc_reg_write(D_ZERO_PADDING, reg);
	reg = (conv_op->pad_val
		<< SHIFT(CSC_D_ZERO_PADDING_VALUE_0, PAD_VALUE)) &
		MASK(CSC_D_ZERO_PADDING_VALUE_0, PAD_VALUE);
	csc_reg_write(D_ZERO_PADDING_VALUE, reg);
	reg = ((conv_op->data_bank - 1)
		<< SHIFT(CSC_D_BANK_0, DATA_BANK)) |
		((conv_op->weight_bank - 1)
		<< SHIFT(CSC_D_BANK_0, WEIGHT_BANK));
	csc_reg_write(D_BANK, reg);
	csc_reg_write(D_PRA_CFG, conv_op->pra_truncate);

	/* CBUF */
	/* there's no CBUF register */

	/* CDMA */
	reg = (map_conv[conv_op->conv_mode]
		<< SHIFT(CDMA_D_MISC_CFG_0, CONV_MODE)) |
		(map_precision[conv_op->in_precision]
		<< SHIFT(CDMA_D_MISC_CFG_0, IN_PRECISION)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CDMA_D_MISC_CFG_0, PROC_PRECISION)) |
		(conv_op->data_reuse
		<< SHIFT(CDMA_D_MISC_CFG_0, DATA_REUSE)) |
		(conv_op->weight_reuse
		<< SHIFT(CDMA_D_MISC_CFG_0, WEIGHT_REUSE)) |
		(conv_op->skip_data_rls
		<< SHIFT(CDMA_D_MISC_CFG_0, SKIP_DATA_RLS)) |
		(conv_op->skip_weight_rls
		<< SHIFT(CDMA_D_MISC_CFG_0, SKIP_WEIGHT_RLS));
	cdma_reg_write(D_MISC_CFG, reg);
	reg = (get_in_format(conv_op->data_format) <<
		SHIFT(CDMA_D_DATAIN_FORMAT_0, DATAIN_FORMAT)) |
		(map_img_fmt[conv_op->data_format][0]
		<< SHIFT(CDMA_D_DATAIN_FORMAT_0, PIXEL_FORMAT)) |
		(map_pixel[conv_op->pixel_mapping]
		<< SHIFT(CDMA_D_DATAIN_FORMAT_0, PIXEL_MAPPING)) |
		(conv_op->pixel_override
		<< SHIFT(CDMA_D_DATAIN_FORMAT_0, PIXEL_SIGN_OVERRIDE));
	cdma_reg_write(D_DATAIN_FORMAT, reg);
	reg = ((conv_surface->src_data.width - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_0_0, DATAIN_WIDTH)) |
		((conv_surface->src_data.height - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_0_0, DATAIN_HEIGHT));
	cdma_reg_write(D_DATAIN_SIZE_0, reg);
	reg = ((conv_surface->src_data.channel - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_1_0, DATAIN_CHANNEL));
	cdma_reg_write(D_DATAIN_SIZE_1, reg);
	reg = ((conv_op->input_width_csc - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_EXT_0_0, DATAIN_WIDTH_EXT)) |
		((conv_op->input_height_csc - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_EXT_0_0, DATAIN_HEIGHT_EXT));
	cdma_reg_write(D_DATAIN_SIZE_EXT_0, reg);
	reg = (map_ram[conv_surface->src_data.type]
		<< SHIFT(CDMA_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE));
	cdma_reg_write(D_DAIN_RAM_TYPE, reg);
	/* Plane 0 address, then plane 1 at offset_u (UV data for the
	 * semi-planar YUV formats). */
	high = HIGH32BITS(input_address);
	low = LOW32BITS(input_address);
	cdma_reg_write(D_DAIN_ADDR_HIGH_0, high);
	cdma_reg_write(D_DAIN_ADDR_LOW_0, low);
	high = HIGH32BITS((input_address + conv_surface->offset_u));
	low = LOW32BITS(input_address + conv_surface->offset_u);
	cdma_reg_write(D_DAIN_ADDR_HIGH_1, high);
	cdma_reg_write(D_DAIN_ADDR_LOW_1, low);
	cdma_reg_write(D_LINE_STRIDE, conv_surface->src_data.line_stride);
	cdma_reg_write(D_SURF_STRIDE, conv_surface->src_data.surf_stride);
	cdma_reg_write(D_LINE_UV_STRIDE, conv_surface->in_line_uv_stride);
	/* Packed flags are derived by comparing the actual strides
	 * against the fully-packed stride values. */
	reg = ((conv_surface->src_data.line_stride ==
		((uint32_t)conv_surface->src_data.width * atom_size))
		<< SHIFT(CDMA_D_DAIN_MAP_0, LINE_PACKED));
	reg |= ((conv_surface->src_data.surf_stride ==
		((uint32_t)(conv_surface->src_data.width *
		conv_surface->src_data.height) * atom_size))
		<< SHIFT(CDMA_D_DAIN_MAP_0, SURF_PACKED));
	cdma_reg_write(D_DAIN_MAP, reg);
	reg = ((conv_op->batch - 1)
		<< SHIFT(CDMA_D_BATCH_NUMBER_0, BATCHES));
	cdma_reg_write(D_BATCH_NUMBER, reg);
	cdma_reg_write(D_BATCH_STRIDE, conv_op->batch_stride);
	reg = ((conv_op->entry_per_slice - 1)
		<< SHIFT(CDMA_D_ENTRY_PER_SLICE_0, ENTRIES));
	cdma_reg_write(D_ENTRY_PER_SLICE, reg);
	reg = ((conv_op->fetch_grain - 1)
		<< SHIFT(CDMA_D_FETCH_GRAIN_0, GRAINS));
	cdma_reg_write(D_FETCH_GRAIN, reg);
	reg = (map_weight_fmt[conv_op->weight_format]
		<< SHIFT(CDMA_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT));
	cdma_reg_write(D_WEIGHT_FORMAT, reg);
	reg = ((conv_op->bytes_per_kernel - 1)
		<< SHIFT(CDMA_D_WEIGHT_SIZE_0_0, BYTE_PER_KERNEL));
	cdma_reg_write(D_WEIGHT_SIZE_0, reg);
	reg = ((conv_surface->dst_data.channel - 1)
		<< SHIFT(CDMA_D_WEIGHT_SIZE_1_0, WEIGHT_KERNEL));
	cdma_reg_write(D_WEIGHT_SIZE_1, reg);
	reg = (map_ram[conv_surface->weight_data.type]
		<< SHIFT(CDMA_D_WEIGHT_RAM_TYPE_0, WEIGHT_RAM_TYPE));
	cdma_reg_write(D_WEIGHT_RAM_TYPE, reg);
	high = HIGH32BITS(weight_address);
	low = LOW32BITS(weight_address);
	cdma_reg_write(D_WEIGHT_ADDR_HIGH, high);
	cdma_reg_write(D_WEIGHT_ADDR_LOW, low);
	cdma_reg_write(D_WEIGHT_BYTES, conv_surface->weight_data.size);
	/* Compressed-weight side tables (resolved above). */
	if (conv_op->weight_format == WEIGHT_FORMAT_COMPRESSED) {
		high = HIGH32BITS(wgs_address);
		low = LOW32BITS(wgs_address);
		cdma_reg_write(D_WGS_ADDR_HIGH, high);
		cdma_reg_write(D_WGS_ADDR_LOW, low);
		high = HIGH32BITS(wmb_address);
		low = LOW32BITS(wmb_address);
		cdma_reg_write(D_WMB_ADDR_HIGH, high);
		cdma_reg_write(D_WMB_ADDR_LOW, low);
		cdma_reg_write(D_WMB_BYTES, conv_surface->wmb_data.size);
	}
	reg = (map_mean[conv_op->mean_format]
		<< SHIFT(CDMA_D_MEAN_FORMAT_0, MEAN_FORMAT));
	cdma_reg_write(D_MEAN_FORMAT, reg);
	if (conv_op->mean_format == MEAN_FORMAT_ENABLE) {
		reg = ((conv_op->mean_ry
			<< SHIFT(CDMA_D_MEAN_GLOBAL_0_0, MEAN_RY)) &
			MASK(CDMA_D_MEAN_GLOBAL_0_0, MEAN_RY)) |
			((conv_op->mean_gu
			<< SHIFT(CDMA_D_MEAN_GLOBAL_0_0, MEAN_GU)) &
			MASK(CDMA_D_MEAN_GLOBAL_0_0, MEAN_GU));
		cdma_reg_write(D_MEAN_GLOBAL_0, reg);
		reg = ((conv_op->mean_bv
			<< SHIFT(CDMA_D_MEAN_GLOBAL_1_0, MEAN_BV))&
			MASK(CDMA_D_MEAN_GLOBAL_1_0, MEAN_BV)) |
			((conv_op->mean_ax
			<< SHIFT(CDMA_D_MEAN_GLOBAL_1_0, MEAN_AX))&
			MASK(CDMA_D_MEAN_GLOBAL_1_0, MEAN_AX));
		cdma_reg_write(D_MEAN_GLOBAL_1, reg);
	}
	/* Optional input converter. */
	if (conv_op->in_cvt.enable) {
		reg = ((FIELD_ENUM(CDMA_D_CVT_CFG_0, CVT_EN, ENABLE))
			<< SHIFT(CDMA_D_CVT_CFG_0, CVT_EN)) |
			(conv_op->in_cvt.truncate
			<< SHIFT(CDMA_D_CVT_CFG_0, CVT_TRUNCATE));
		cdma_reg_write(D_CVT_CFG, reg);
		cdma_reg_write(D_CVT_OFFSET, conv_op->in_cvt.offset);
		cdma_reg_write(D_CVT_SCALE, conv_op->in_cvt.scale);
	} else {
		reg = ((FIELD_ENUM(CDMA_D_CVT_CFG_0, CVT_EN, DISABLE))
			<< SHIFT(CDMA_D_CVT_CFG_0, CVT_EN));
		cdma_reg_write(D_CVT_CFG, reg);
	}
	reg = ((conv_op->conv_stride_x - 1)
		<< SHIFT(CDMA_D_CONV_STRIDE_0, CONV_X_STRIDE)) |
		((conv_op->conv_stride_y - 1)
		<< SHIFT(CDMA_D_CONV_STRIDE_0, CONV_Y_STRIDE));
	cdma_reg_write(D_CONV_STRIDE, reg);
	reg = (conv_op->pad_x_left <<
		SHIFT(CDMA_D_ZERO_PADDING_0, PAD_LEFT)) |
		(conv_op->pad_x_right
		<< SHIFT(CDMA_D_ZERO_PADDING_0, PAD_RIGHT)) |
		(conv_op->pad_y_top
		<< SHIFT(CDMA_D_ZERO_PADDING_0, PAD_TOP)) |
		(conv_op->pad_y_bottom
		<< SHIFT(CDMA_D_ZERO_PADDING_0, PAD_BOTTOM));
	cdma_reg_write(D_ZERO_PADDING, reg);
	reg = conv_op->pad_val <<
		SHIFT(CDMA_D_ZERO_PADDING_VALUE_0, PAD_VALUE) &
		MASK(CDMA_D_ZERO_PADDING_VALUE_0, PAD_VALUE);
	cdma_reg_write(D_ZERO_PADDING_VALUE, reg);
	reg = ((conv_op->weight_bank - 1)
		<< SHIFT(CDMA_D_BANK_0, WEIGHT_BANK)) |
		((conv_op->data_bank - 1)
		<< SHIFT(CDMA_D_BANK_0, DATA_BANK));
	cdma_reg_write(D_BANK, reg);

exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Readiness hook for the convolution processor.
 * Programming can always proceed, so this unconditionally reports
 * ready; both arguments are unused.
 */
int
dla_conv_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group)
{
	(void)processor;
	(void)group;

	return 1;
}
void
dla_conv_dump_config(struct dla_processor_group *group)
{
	/* Dump the convolution surface and operation descriptors
	 * for this group's current ROI to the debug log. */
	struct dla_conv_surface_desc *surf_desc =
					&group->surface_desc->conv_surface;
	struct dla_conv_op_desc *op_desc =
					&group->operation_desc->conv_op;

	dla_debug_conv_surface_desc(surf_desc, group->roi_index);
	dla_debug_conv_op_desc(op_desc, group->roi_index);
}
int
dla_conv_program(struct dla_processor_group *group)
{
	int32_t ret;

	dla_trace("Enter: %s", __func__);

	/*
	 * processor_conv_program() performs all of the convolution
	 * sub-unit programming; its status is returned to the caller
	 * unchanged.  The original "if (ret) goto exit;" jumped to the
	 * statement immediately following it, so that dead branch (and
	 * the label) are dropped.
	 */
	ret = processor_conv_program(group);

	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}

View file

@ -0,0 +1,361 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FIRMWARE_DLA_ENGINE_INTERNAL_H_
#define __FIRMWARE_DLA_ENGINE_INTERNAL_H_

#include <opendla.h>
#include <dla_engine.h>
#include <dla_interface.h>
#include <dla_debug.h>

#include "nvdla_interface.h"

/*
 * Extract a bit field from @num.  @range is written as "high:low"; the
 * "(1 ? range)" / "(0 ? range)" ternary trick evaluates to the high
 * and low bound respectively.  @num is parenthesized so that
 * expression arguments (e.g. "a | b") are masked as a whole instead of
 * being split by operator precedence.
 */
#define BITS(num, range) ((((0xFFFFFFFF >> (31 - (1 ? range))) & \
			(0xFFFFFFFF << (0 ? range))) & (num)) >> \
			(0 ? range))

/*
 * Split a 64-bit value into its high/low 32-bit halves.  The argument
 * is parenthesized so expressions such as "base + offset" are shifted
 * as a whole.
 */
#define HIGH32BITS(val64bit) ((uint32_t)((val64bit) >> 32))
#define LOW32BITS(val64bit) ((uint32_t)(val64bit))

#ifdef MIN
#undef MIN
#endif /* MIN */

#ifdef MAX
#undef MAX
#endif /* MAX */

/* NOTE: both arguments are evaluated twice -- no side effects here. */
#define MIN(a, b) ((a) > (b) ? (b) : (a))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/*********************************************************/
/******************** Utilities **************************/
/*********************************************************/
#ifdef DEBUG
#define CHECK_ALIGN(val, align) assert((val&(align-1)) == 0)
#else
#define CHECK_ALIGN(val, align)
#endif /* DEBUG */

/* Token-pasting helpers for the auto-generated register definitions:
 * <REG>_<FIELD>_FIELD is the mask, <REG>_<FIELD>_SHIFT the bit shift,
 * <REG>_<FIELD>_<ENUM> a named field value. */
#define MASK(reg, field) (reg##_##field##_FIELD)
#define FIELD_ENUM(r, f, e) (r##_##f##_##e)
#define SHIFT(reg, field) (reg##_##field##_SHIFT)

/* Expand a short register name to its per-sub-module "<UNIT>_<name>_0"
 * address macro. */
#define GLB_REG(name) GLB_##name##_0
#define MCIF_REG(name) MCIF_##name##_0
#define CVIF_REG(name) CVIF_##name##_0
#define BDMA_REG(name) BDMA_##name##_0
#define CDMA_REG(name) CDMA_##name##_0
#define CSC_REG(name) CSC_##name##_0
#define CMAC_A_REG(name) CMAC_A_##name##_0
#define CMAC_B_REG(name) CMAC_B_##name##_0
#define CACC_REG(name) CACC_##name##_0
#define SDP_RDMA_REG(name) SDP_RDMA_##name##_0
#define SDP_REG(name) SDP_##name##_0
#define PDP_RDMA_REG(name) PDP_RDMA_##name##_0
#define PDP_REG(name) PDP_##name##_0
#define CDP_RDMA_REG(name) CDP_RDMA_##name##_0
#define CDP_REG(name) CDP_##name##_0
#define RBK_REG(name) RBK_##name##_0

/* alias for register read for each sub-module */
#define glb_reg_read(reg) reg_read(GLB_REG(reg))
#define bdma_reg_read(reg) reg_read(BDMA_REG(reg))
#define cdma_reg_read(reg) reg_read(CDMA_REG(reg))
#define csc_reg_read(reg) reg_read(CSC_REG(reg))
#define cmac_a_reg_read(reg) reg_read(CMAC_A_REG(reg))
#define cmac_b_reg_read(reg) reg_read(CMAC_B_REG(reg))
#define cacc_reg_read(reg) reg_read(CACC_REG(reg))
#define sdp_rdma_reg_read(reg) reg_read(SDP_RDMA_REG(reg))
#define sdp_reg_read(reg) reg_read(SDP_REG(reg))
#define pdp_rdma_reg_read(reg) reg_read(PDP_RDMA_REG(reg))
#define pdp_reg_read(reg) reg_read(PDP_REG(reg))
#define cdp_rdma_reg_read(reg) reg_read(CDP_RDMA_REG(reg))
#define cdp_reg_read(reg) reg_read(CDP_REG(reg))
#define rubik_reg_read(reg) reg_read(RBK_REG(reg))

/* alias for register write for each sub-module */
#define glb_reg_write(reg, val) reg_write(GLB_REG(reg), val)
#define bdma_reg_write(reg, val) reg_write(BDMA_REG(reg), val)
#define cdma_reg_write(reg, val) reg_write(CDMA_REG(reg), val)
#define csc_reg_write(reg, val) reg_write(CSC_REG(reg), val)
#define cmac_a_reg_write(reg, val) reg_write(CMAC_A_REG(reg), val)
#define cmac_b_reg_write(reg, val) reg_write(CMAC_B_REG(reg), val)
#define cacc_reg_write(reg, val) reg_write(CACC_REG(reg), val)
#define sdp_rdma_reg_write(reg, val) reg_write(SDP_RDMA_REG(reg), val)
#define sdp_reg_write(reg, val) reg_write(SDP_REG(reg), val)
#define pdp_rdma_reg_write(reg, val) reg_write(PDP_RDMA_REG(reg), val)
#define pdp_reg_write(reg, val) reg_write(PDP_REG(reg), val)
#define cdp_rdma_reg_write(reg, val) reg_write(CDP_RDMA_REG(reg), val)
#define cdp_reg_write(reg, val) reg_write(CDP_REG(reg), val)
#define rubik_reg_write(reg, val) reg_write(RBK_REG(reg), val)

/* Raw register accessors, implemented in engine_data.c on top of the
 * portability layer. */
void reg_write(uint32_t addr, uint32_t reg);
uint32_t reg_read(uint32_t addr);

/**
 * Operation descriptor cache functions
 */
void
dla_put_op_desc(struct dla_common_op_desc *op_desc);
struct dla_common_op_desc
*dla_get_op_desc(struct dla_task *task,
			int16_t index,
			uint8_t op_type,
			uint8_t roi_index);
void
dla_dump_op_desc(struct dla_common_op_desc *desc);
void
dla_get_refcount(struct dla_common_op_desc *op_desc);
void
dla_init_op_cache(struct dla_engine *engine);

/**
 * Operation completion handler
 */
int
dla_op_completion(struct dla_processor *processor,
			struct dla_processor_group *group);

int32_t
dla_read_lut(struct dla_engine *engine, int16_t index, void *dst);

/*
 * Interrupt mask control.  Declared int32_t to match the definitions
 * in engine.c (they previously disagreed as plain "int").
 */
int32_t
dla_enable_intr(uint32_t mask);
int32_t
dla_disable_intr(uint32_t mask);

int
utils_get_free_group(struct dla_processor *processor,
			uint8_t *group_id,
			uint8_t *rdma_id);

int32_t
dla_get_dma_cube_address(void *driver_context,
			void *task_data,
			int16_t index,
			uint32_t offset,
			void *dst_ptr,
			uint32_t destination);

int
dla_read_input_address(struct dla_data_cube *data,
			uint64_t *address,
			int16_t op_index,
			uint8_t roi_index,
			uint8_t bpp);

/**
 * BDMA operations
 */
void
dla_bdma_set_producer(int32_t group_id, int32_t rdma_group_id);
int
dla_bdma_enable(struct dla_processor_group *group);
int
dla_bdma_program(struct dla_processor_group *group);
int
dla_bdma_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_bdma_dump_config(struct dla_processor_group *group);
void
dla_bdma_rdma_check(struct dla_processor_group *group);
#if STAT_ENABLE
void
dla_bdma_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_bdma_dump_stat(struct dla_processor *processor);
#else
static inline void
dla_bdma_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group) {}
static inline void
dla_bdma_dump_stat(struct dla_processor *processor) {}
#endif /* STAT_ENABLE */

/**
 * Convolution operations
 */
void
dla_conv_set_producer(int32_t group_id, int32_t rdma_group_id);
int
dla_conv_enable(struct dla_processor_group *group);
int
dla_conv_program(struct dla_processor_group *group);
int
dla_conv_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_conv_dump_config(struct dla_processor_group *group);
void
dla_conv_rdma_check(struct dla_processor_group *group);
#if STAT_ENABLE
void
dla_conv_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_conv_dump_stat(struct dla_processor *processor);
#else
static inline void
dla_conv_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group) {}
static inline void
dla_conv_dump_stat(struct dla_processor *processor) {}
#endif /* STAT_ENABLE */

/**
 * SDP operations
 */
void
dla_sdp_set_producer(int32_t group_id, int32_t rdma_group_id);
int
dla_sdp_enable(struct dla_processor_group *group);
int
dla_sdp_program(struct dla_processor_group *group);
int
dla_sdp_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_sdp_dump_config(struct dla_processor_group *group);
void
dla_sdp_rdma_check(struct dla_processor_group *group);
#if STAT_ENABLE
void
dla_sdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_sdp_dump_stat(struct dla_processor *processor);
#else
static inline void
dla_sdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group) {}
static inline void
dla_sdp_dump_stat(struct dla_processor *processor) {}
#endif /* STAT_ENABLE */

/**
 * PDP operations
 */
void
dla_pdp_set_producer(int32_t group_id, int32_t rdma_group_id);
int
dla_pdp_enable(struct dla_processor_group *group);
int
dla_pdp_program(struct dla_processor_group *group);
int
dla_pdp_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_pdp_dump_config(struct dla_processor_group *group);
void
dla_pdp_rdma_check(struct dla_processor_group *group);
#if STAT_ENABLE
void
dla_pdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_pdp_dump_stat(struct dla_processor *processor);
#else
static inline void
dla_pdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group) {}
static inline void
dla_pdp_dump_stat(struct dla_processor *processor) {}
#endif /* STAT_ENABLE */

/**
 * CDP operations
 */
void
dla_cdp_set_producer(int32_t group_id, int32_t rdma_group_id);
int
dla_cdp_enable(struct dla_processor_group *group);
int
dla_cdp_program(struct dla_processor_group *group);
int
dla_cdp_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_cdp_dump_config(struct dla_processor_group *group);
void
dla_cdp_rdma_check(struct dla_processor_group *group);
#if STAT_ENABLE
void
dla_cdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_cdp_dump_stat(struct dla_processor *processor);
#else
static inline void
dla_cdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group) {}
static inline void
dla_cdp_dump_stat(struct dla_processor *processor) {}
#endif /* STAT_ENABLE */

/**
 * RUBIK operations
 */
void
dla_rubik_set_producer(int32_t group_id, int32_t rdma_group_id);
int
dla_rubik_enable(struct dla_processor_group *group);
int
dla_rubik_program(struct dla_processor_group *group);
int
dla_rubik_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_rubik_dump_config(struct dla_processor_group *group);
void
dla_rubik_rdma_check(struct dla_processor_group *group);
#if STAT_ENABLE
void
dla_rubik_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group);
void
dla_rubik_dump_stat(struct dla_processor *processor);
#else
static inline void
dla_rubik_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group) {}
static inline void
dla_rubik_dump_stat(struct dla_processor *processor) {}
#endif /* STAT_ENABLE */

#endif /* __FIRMWARE_DLA_ENGINE_INTERNAL_H_ */

262
drivers/nvdla/engine.c Normal file
View file

@ -0,0 +1,262 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "dla_engine_internal.h"
#include "common.h"
/*
 * Per-processor register address tables, indexed by the processor's
 * op_type (see the reads in utils_get_free_group() below).  The entry
 * order must match the DLA_OP_* enumeration -- presumably BDMA, CONV,
 * SDP, PDP, CDP, RUBIK (TODO confirm against dla_interface.h).
 *
 * RDMA consumer-pointer registers; 0xFFFFFFFF marks processors that
 * have no separate RDMA pointer register.
 */
static const uint32_t map_rdma_ptr_addr[] = {
	0xFFFFFFFF,
	0xFFFFFFFF,
	SDP_REG(RDMA_S_POINTER),
	PDP_REG(RDMA_S_POINTER),
	CDP_REG(RDMA_S_POINTER),
	0xFFFFFFFF,
};

/* Per-processor status registers. */
static const uint32_t map_sts_addr[] = {
	BDMA_REG(STATUS),
	CACC_REG(S_STATUS),
	SDP_REG(S_STATUS),
	PDP_REG(S_STATUS),
	CDP_REG(S_STATUS),
	RBK_REG(S_STATUS),
};

/*
 * Per-processor consumer-pointer registers.  BDMA has no S_POINTER;
 * its STATUS register is listed here and decoded specially (GRP0_BUSY)
 * in utils_get_free_group().
 */
static const uint32_t map_ptr_addr[] = {
	BDMA_REG(STATUS),
	CACC_REG(S_POINTER),
	SDP_REG(S_POINTER),
	PDP_REG(S_POINTER),
	CDP_REG(S_POINTER),
	RBK_REG(S_POINTER),
};
int32_t dla_enable_intr(uint32_t mask)
{
	/* Enabling an interrupt means clearing its bit in S_INTR_MASK. */
	uint32_t intr_mask = glb_reg_read(S_INTR_MASK);

	intr_mask &= ~mask;
	glb_reg_write(S_INTR_MASK, intr_mask);

	RETURN(0);
}
int32_t dla_disable_intr(uint32_t mask)
{
	/* Disabling an interrupt means setting its bit in S_INTR_MASK. */
	uint32_t intr_mask = glb_reg_read(S_INTR_MASK);

	intr_mask |= mask;
	glb_reg_write(S_INTR_MASK, intr_mask);

	RETURN(0);
}
/*
 * Per-group BDMA idle/busy status, initialized to IDLE.
 * NOTE(review): deliberately non-static -- presumably shared with the
 * BDMA processor code in another translation unit; confirm before
 * narrowing the linkage.
 */
uint8_t bdma_grp_sts[2] = {
	FIELD_ENUM(BDMA_STATUS_0, IDLE, YES),
	FIELD_ENUM(BDMA_STATUS_0, IDLE, YES)
};

/* Scratch ROI descriptor filled by dla_read_input_address(). */
struct dla_roi_desc roi_desc;
/**
 * Resolve a DMA data-cube address.
 *
 * Looks up address-list entry @index through dla_get_dma_address() and
 * adds @offset to the resolved 64-bit address stored at @dst_ptr.
 * Returns 0 on success or the lookup error unchanged.
 */
int32_t
dla_get_dma_cube_address(void *driver_context, void *task_data,
				int16_t index, uint32_t offset, void *dst_ptr,
				uint32_t destination)
{
	int32_t err;

	err = dla_get_dma_address(driver_context, task_data, index,
					dst_ptr, destination);
	if (err == 0)
		*(uint64_t *)dst_ptr += offset;

	return err;
}
/**
 * Read input buffer address
 *
 * For input layer, in case of static ROI this address is read
 * from address list and index is specified in data cube. In case
 * dynamic ROI, it has to be read depending on ROI information
 * and using surface address
 *
 * For all other layers, this address is read from address list
 * using index specified in data cube
 *
 * Returns 0 on success, ERR(INVALID_INPUT) when no address can be
 * resolved, or the error from the address-list / data read helpers.
 */
int
dla_read_input_address(struct dla_data_cube *data,
			uint64_t *address,
			int16_t op_index,
			uint8_t roi_index,
			uint8_t bpp)
{
	uint64_t roi_desc_addr;
	int32_t ret = ERR(INVALID_INPUT);
	struct dla_engine *en = dla_get_engine();

	/**
	 * If memory type is HW then no address required
	 */
	if (data->type == DLA_MEM_HW) {
		ret = 0;
		goto exit;
	}

	/**
	 * If address list index is not -1 means this address has to
	 * be read from address list
	 */
	if (data->address != -1) {
		/**
		 * But if other parameters indicate that this is input layer
		 * for dynamic ROI then it is an error
		 */
		if (en->network->dynamic_roi &&
			en->network->input_layer == op_index)
			goto exit;
		ret = dla_get_dma_cube_address(en->driver_context,
						en->task->task_data,
						data->address,
						data->offset,
						(void *)address,
						DESTINATION_DMA);
		goto exit;
	}

	/**
	 * Check if it is dynamic ROI and this is input layer
	 */
	if (en->network->dynamic_roi && en->network->input_layer == op_index) {
		/* Dynamic ROI requires a surface base address. */
		if (!en->task->surface_addr)
			goto exit;
		/* Calculate address of ROI descriptor in array */
		roi_desc_addr = en->task->roi_array_addr;
		/*
		 * Read ROI descriptor.  The destination is the file-scope
		 * scratch variable roi_desc, so this path is not reentrant
		 * -- NOTE(review): confirm single-threaded use by the
		 * scheduler.  The offset skips the dla_roi_array_desc
		 * header plus the preceding ROI entries.
		 */
		ret = dla_data_read(en->driver_context,
				en->task->task_data,
				roi_desc_addr,
				(void *)&roi_desc,
				sizeof(roi_desc),
				sizeof(struct dla_roi_array_desc) +
				roi_index * sizeof(struct dla_roi_desc));
		if (ret)
			goto exit;
		/* Calculate ROI address: surface base plus the ROI's
		 * top/left pixel offset (bpp = bytes per pixel). */
		*address = en->task->surface_addr;
		*address += (roi_desc.top * data->line_stride) +
						(bpp * roi_desc.left);
	}
exit:
	RETURN(ret);
}
/*
 * Pick a free group (and RDMA group, where the processor has one) for
 * the next operation on @processor.
 *
 * Hardware consumer pointers are read from the per-processor pointer
 * registers (BDMA is decoded specially from its STATUS register); the
 * software group_status/rdma_status bitmasks then decide whether the
 * hardware pointer or the single idle slot is chosen.  Returns
 * ERR(PROCESSOR_BUSY) when both groups are already programmed.
 */
int
utils_get_free_group(struct dla_processor *processor,
			uint8_t *group_id,
			uint8_t *rdma_id)
{
	int32_t ret = 0;
	uint32_t pointer;
	uint32_t hw_consumer_ptr;
	uint32_t hw_rdma_ptr;

	hw_rdma_ptr = 0;
	if (processor->op_type == DLA_OP_BDMA) {
		/* BDMA has no consumer pointer register; derive it from
		 * the GRP0_BUSY bit of the STATUS register instead. */
		pointer = reg_read(map_ptr_addr[processor->op_type]);
		hw_consumer_ptr = ((pointer & MASK(BDMA_STATUS_0, GRP0_BUSY)) >>
				SHIFT(BDMA_STATUS_0, GRP0_BUSY)) ==
				FIELD_ENUM(BDMA_STATUS_0, GRP0_BUSY, YES) ?
				1 : 0;
	} else {
		/* The CONSUMER field layout is shared across processors;
		 * the CDP register definition is used for all of them. */
		pointer = reg_read(map_ptr_addr[processor->op_type]);
		hw_consumer_ptr = (pointer & MASK(CDP_S_POINTER_0, CONSUMER)) >>
						SHIFT(CDP_S_POINTER_0, CONSUMER);
		/**
		 * Read current consumer pointer for RDMA only if processor
		 * has RDMA module
		 */
		if (map_rdma_ptr_addr[processor->op_type] != 0xFFFFFFFF) {
			pointer =
				reg_read(map_rdma_ptr_addr[processor->op_type]);
			hw_rdma_ptr = (pointer &
					MASK(CDP_S_POINTER_0, CONSUMER)) >>
					SHIFT(CDP_S_POINTER_0, CONSUMER);
		}
	}

	/**
	 * If both processors are programmed then exit
	 */
	if (processor->group_status == 0x3) {
		ret = ERR(PROCESSOR_BUSY);
		goto exit;
	}

	if (!processor->group_status)
		/**
		 * If both groups are idle then use consumer pointer
		 */
		*group_id = hw_consumer_ptr;
	else
		/**
		 * Here it is assumed that only one group is idle or busy
		 * and hence right shift will work to get correct
		 * group id
		 */
		*group_id = !(processor->group_status >> 1);

	/**
	 * If both groups are idle then read group id from pointer
	 */
	if (!processor->rdma_status)
		*rdma_id = hw_rdma_ptr;
	else
		*rdma_id = !(processor->rdma_status >> 1);
exit:
	RETURN(ret);
}

303
drivers/nvdla/engine_data.c Normal file
View file

@ -0,0 +1,303 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <nvdla_interface.h>
#include <dla_interface.h>
#include "dla_engine_internal.h"
/*
 * Statically allocated descriptor storage: one operation container and
 * one surface container per processor type and per hardware group.
 */
static union dla_operation_container operation_desc[DLA_OP_NUM][DLA_NUM_GROUPS];
static union dla_surface_container surface_desc[DLA_OP_NUM][DLA_NUM_GROUPS];

/* Task state shared across the engine; bound in dla_register_driver(). */
static struct dla_task global_task;

/*
 * The single global engine instance.  Each processor entry wires up
 * the per-processor callbacks (program/enable/set_producer/...) and
 * points its two groups at the static descriptor storage above.
 * lut_index starts at -1, meaning no LUT is bound to the group yet.
 */
static struct dla_engine engine = {
	/* BDMA processor and its two groups. */
	.processors[DLA_OP_BDMA] = {
		.name = "BDMA",
		.op_type = DLA_OP_BDMA,
		.program = dla_bdma_program,
		.enable = dla_bdma_enable,
		.set_producer = dla_bdma_set_producer,
		.is_ready = dla_bdma_is_ready,
		.dump_config = dla_bdma_dump_config,
		.rdma_check = dla_bdma_rdma_check,
		.get_stat_data = dla_bdma_stat_data,
		.dump_stat = dla_bdma_dump_stat,
		.consumer_ptr = 0,
		.roi_index = 0,
		.group_status = 0,
		.rdma_status = 0,
		.last_group = 1,
		.groups[0] = {
			.id = 0,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_BDMA][0],
			.surface_desc = &surface_desc[DLA_OP_BDMA][0],
		},
		.groups[1] = {
			.id = 1,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_BDMA][1],
			.surface_desc = &surface_desc[DLA_OP_BDMA][1],
		},
	},
	/* Convolution processor and its two groups. */
	.processors[DLA_OP_CONV] = {
		.name = "Convolution",
		.op_type = DLA_OP_CONV,
		.program = dla_conv_program,
		.enable = dla_conv_enable,
		.set_producer = dla_conv_set_producer,
		.is_ready = dla_conv_is_ready,
		.dump_config = dla_conv_dump_config,
		.rdma_check = dla_conv_rdma_check,
		.get_stat_data = dla_conv_stat_data,
		.dump_stat = dla_conv_dump_stat,
		.consumer_ptr = 0,
		.roi_index = 0,
		.group_status = 0,
		.rdma_status = 0,
		.last_group = 1,
		.groups[0] = {
			.id = 0,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_CONV][0],
			.surface_desc = &surface_desc[DLA_OP_CONV][0],
		},
		.groups[1] = {
			.id = 1,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_CONV][1],
			.surface_desc = &surface_desc[DLA_OP_CONV][1],
		},
	},
	/* SDP processor and its two groups. */
	.processors[DLA_OP_SDP] = {
		.name = "SDP",
		.op_type = DLA_OP_SDP,
		.program = dla_sdp_program,
		.enable = dla_sdp_enable,
		.set_producer = dla_sdp_set_producer,
		.is_ready = dla_sdp_is_ready,
		.dump_config = dla_sdp_dump_config,
		.rdma_check = dla_sdp_rdma_check,
		.get_stat_data = dla_sdp_stat_data,
		.dump_stat = dla_sdp_dump_stat,
		.consumer_ptr = 0,
		.roi_index = 0,
		.group_status = 0,
		.rdma_status = 0,
		.last_group = 1,
		.groups[0] = {
			.id = 0,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_SDP][0],
			.surface_desc = &surface_desc[DLA_OP_SDP][0],
		},
		.groups[1] = {
			.id = 1,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_SDP][1],
			.surface_desc = &surface_desc[DLA_OP_SDP][1],
		},
	},
	/* PDP processor and its two groups. */
	.processors[DLA_OP_PDP] = {
		.name = "PDP",
		.op_type = DLA_OP_PDP,
		.program = dla_pdp_program,
		.enable = dla_pdp_enable,
		.set_producer = dla_pdp_set_producer,
		.is_ready = dla_pdp_is_ready,
		.dump_config = dla_pdp_dump_config,
		.rdma_check = dla_pdp_rdma_check,
		.get_stat_data = dla_pdp_stat_data,
		.dump_stat = dla_pdp_dump_stat,
		.consumer_ptr = 0,
		.roi_index = 0,
		.group_status = 0,
		.rdma_status = 0,
		.last_group = 1,
		.groups[0] = {
			.id = 0,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_PDP][0],
			.surface_desc = &surface_desc[DLA_OP_PDP][0],
		},
		.groups[1] = {
			.id = 1,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_PDP][1],
			.surface_desc = &surface_desc[DLA_OP_PDP][1],
		},
	},
	/* CDP processor and its two groups. */
	.processors[DLA_OP_CDP] = {
		.name = "CDP",
		.op_type = DLA_OP_CDP,
		.program = dla_cdp_program,
		.enable = dla_cdp_enable,
		.set_producer = dla_cdp_set_producer,
		.is_ready = dla_cdp_is_ready,
		.dump_config = dla_cdp_dump_config,
		.rdma_check = dla_cdp_rdma_check,
		.get_stat_data = dla_cdp_stat_data,
		.dump_stat = dla_cdp_dump_stat,
		.consumer_ptr = 0,
		.roi_index = 0,
		.group_status = 0,
		.rdma_status = 0,
		.last_group = 1,
		.groups[0] = {
			.id = 0,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_CDP][0],
			.surface_desc = &surface_desc[DLA_OP_CDP][0],
		},
		.groups[1] = {
			.id = 1,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_CDP][1],
			.surface_desc = &surface_desc[DLA_OP_CDP][1],
		},
	},
	/* RUBIK processor and its two groups. */
	.processors[DLA_OP_RUBIK] = {
		.name = "RUBIK",
		.op_type = DLA_OP_RUBIK,
		.program = dla_rubik_program,
		.enable = dla_rubik_enable,
		.set_producer = dla_rubik_set_producer,
		.is_ready = dla_rubik_is_ready,
		.dump_config = dla_rubik_dump_config,
		.rdma_check = dla_rubik_rdma_check,
		.get_stat_data = dla_rubik_stat_data,
		.dump_stat = dla_rubik_dump_stat,
		.consumer_ptr = 0,
		.roi_index = 0,
		.group_status = 0,
		.rdma_status = 0,
		.last_group = 1,
		.groups[0] = {
			.id = 0,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_RUBIK][0],
			.surface_desc = &surface_desc[DLA_OP_RUBIK][0],
		},
		.groups[1] = {
			.id = 1,
			.rdma_id = 0,
			.active = 0,
			.events = 0,
			.roi_index = 0,
			.is_rdma_needed = 0,
			.lut_index = -1,
			.operation_desc = &operation_desc[DLA_OP_RUBIK][1],
			.surface_desc = &surface_desc[DLA_OP_RUBIK][1],
		},
	},
};
/* Return the single, statically allocated engine instance. */
struct dla_engine *dla_get_engine(void)
{
	return &engine;
}
int32_t dla_register_driver(void **engine_context, void *driver_context)
{
	/*
	 * Bind the portability-layer context to the static engine,
	 * attach the global task state (with no task data yet), reset
	 * the operation-descriptor cache, and hand the engine back to
	 * the caller through @engine_context.
	 */
	engine.driver_context = driver_context;
	engine.task = &global_task;
	engine.task->task_data = NULL;
	dla_init_op_cache(&engine);

	*engine_context = &engine;

	RETURN(0);
}
/* Read a DLA register through the portability layer. */
uint32_t reg_read(uint32_t addr)
{
	return dla_reg_read(engine.driver_context, addr);
}

/* Write @reg to a DLA register through the portability layer. */
void reg_write(uint32_t addr, uint32_t reg)
{
	dla_reg_write(engine.driver_context, addr, reg);
}

View file

@ -0,0 +1,551 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <dla_debug.h>
#include <dla_interface.h>
#include <dla_sched.h>
#include "engine_debug.h"
#if DEBUG_NETWORK_DATA
/* Pretty-print a dla_network_desc to the debug log. */
void
dla_debug_network_desc(struct dla_network_desc *nd)
{
	dla_debug("*********************************************************\n");
	dla_debug("NVDLA FW dla_network_desc\n");
	dla_debug("---------------------------------------------------------\n");
	dla_debug("op desc index = %d\n", nd->operation_desc_index);
	dla_debug("surface desc index = %d\n", nd->surface_desc_index);
	dla_debug("dep graph index = %d\n", nd->dependency_graph_index);
	dla_debug("lut data index = %d\n", nd->lut_data_index);
	dla_debug("stat_list_index = %d\n", nd->stat_list_index);
	dla_debug("roi array index = %d\n", nd->roi_array_index);
	dla_debug("surface index = %d\n", nd->surface_index);
	dla_debug("num rois = %u\n", nd->num_rois);
	dla_debug("num ops = %u\n", nd->num_operations);
	dla_debug("num luts = %u\n", nd->num_luts);
	dla_debug("num addr = %u\n", nd->num_addresses);
	dla_debug("input layer = %u\n", nd->input_layer);
	dla_debug("dynamic roi = %u\n", nd->dynamic_roi);
}
/* Pretty-print one BDMA transfer descriptor; @id is its array index. */
static void
dla_debug_bdma_transfer(struct dla_bdma_transfer_desc *tr, int32_t id)
{
	dla_debug("transfer[%d] = [ dla_bdma_transfer_desc =>\n", id);
	dla_debug("    source_address = %x\n", tr->source_address);
	dla_debug("    destination_address = %x\n", tr->destination_address);
	dla_debug("    line_size = %x\n", tr->line_size);
	dla_debug("    line_repeat = %x\n", tr->line_repeat);
	dla_debug("    source_line = %x\n", tr->source_line);
	dla_debug("    destination_line = %x\n", tr->destination_line);
	dla_debug("    surface_repeat = %x\n", tr->surface_repeat);
	dla_debug("    source_surface = %x\n", tr->source_surface);
	dla_debug("    destination_surface = %x\n", tr->destination_surface);
}

/* Pretty-print a BDMA surface descriptor and all of its transfers. */
void
dla_debug_bdma_surface_desc(struct dla_bdma_surface_desc *desc, int32_t roi)
{
	int32_t i;

	dla_debug("*********************************************************\n");
	dla_debug("NVDLA FW ROI[%d]: dla_bdma_surface_desc\n", roi);
	dla_debug("---------------------------------------------------------\n");
	dla_debug("source_type = %u\n", desc->source_type);
	dla_debug("destination_type = %u\n", desc->destination_type);
	dla_debug("num_transfers = %u\n", desc->num_transfers);
	for (i = 0; i < desc->num_transfers; i++)
		dla_debug_bdma_transfer(&desc->transfers[i], i);
}

/* Pretty-print a BDMA operation descriptor. */
void
dla_debug_bdma_op_desc(struct dla_bdma_op_desc *desc, int32_t roi)
{
	dla_debug("*********************************************************\n");
	dla_debug("NVDLA FW ROI[%d]: dla_bdma_op_desc\n", roi);
	dla_debug("---------------------------------------------------------\n");
	dla_debug("num_transfers = %u\n", desc->num_transfers);
}
/* Dump the resolved addresses held in a dla_task to the debug log. */
void
dla_debug_address_info(struct dla_task *tk)
{
	dla_debug("*********************************************************\n");
	dla_debug("NVDLA FW address list\n");
	dla_debug("---------------------------------------------------------\n");
	dla_debug("task base address = %llu\n", tk->base);
	dla_debug("op desc address = %llu\n", tk->operation_desc_addr);
	dla_debug("surface desc address = %llu\n", tk->surface_desc_addr);
	dla_debug("dependency graph address = %llu\n", tk->dependency_graph_addr);
	dla_debug("LUT data address = %llu\n", tk->lut_data_addr);
	dla_debug("stat address = %llu\n", tk->stat_data_addr);
	dla_debug("ROI array address = %llu\n", tk->roi_array_addr);
	dla_debug("surface address = %llu\n", tk->surface_addr);
}
/*
 * Dump a common operation descriptor, including its consumer list and
 * fused parent.  NOTE(review): the trailing "]" lines have no newline
 * -- presumably dla_debug() tolerates that; confirm if output looks
 * merged.
 */
void
dla_debug_op_desc(struct dla_common_op_desc *desc, int32_t roi)
{
	int32_t i;

	dla_debug("*********************************************************\n");
	dla_debug("NVDLA FW ROI[%d]: dla_common_op_desc\n", roi);
	dla_debug("---------------------------------------------------------\n");
	/* Cast only changes the pointer type printed by %p. */
	dla_debug("[%p] Operation index %d ROI %d dep_count %d type %d\n",
			(unsigned int *)desc, desc->index, desc->roi_index,
			desc->dependency_count, desc->op_type);
	dla_debug("consumers = [ dla_consumer =>\n");
	for (i = 0; i < DLA_OP_NUM; i++)
		dla_debug(" [ %d %d ]", desc->consumers[i].index,
				desc->consumers[i].event);
	dla_debug("]");
	dla_debug("fused_parent = [ dla_consumer =>\n");
	dla_debug(" [ %d %d ]", desc->fused_parent.index,
			desc->fused_parent.event);
	dla_debug("]");
}
static void
dla_debug_data_cube(struct dla_data_cube *cube)
{
dla_debug(" type = %u\n", cube->type);
dla_debug(" address = %d\n", cube->address);
dla_debug(" width = %x\n", cube->width);
dla_debug(" height = %x\n", cube->height);
dla_debug(" channel = %x\n", cube->channel);
dla_debug(" size = %u\n", cube->size);
dla_debug(" line_stride = %u\n", cube->line_stride);
dla_debug(" surf_stride = %u\n", cube->surf_stride);
dla_debug(" plane_stride = %u\n", cube->plane_stride);
dla_debug("]");
}
static void
dla_debug_converter(struct dla_cvt_param *cvt)
{
dla_debug("[ scale = %d, truncate = %u, enable = %u, offset = %d ]\n",
cvt->scale, cvt->truncate, cvt->enable, cvt->offset);
}
static void
dla_debug_float_data(struct dla_float_data *float_data)
{
dla_debug("[ scale = %d, shifter = %d ]\n",
float_data->scale, float_data->shifter);
}
static void
dla_debug_dla_slope(union dla_slope *slope)
{
dla_debug(" data_i =\n");
dla_debug_float_data(&slope->data_i);
dla_debug(" data_f = %u\n", slope->data_f);
}
/* Dump a LUT offset union (exponent offset / fractional bits view). */
static void
dla_debug_lut_offset(union dla_lut_offset *offset)
{
dla_debug("    exp_offset = %d\n", offset->exp_offset);
dla_debug("    frac_bits = %d\n", offset->frac_bits);
}
/*
 * Dump a complete LUT parameter set: both tables, offsets, ranges,
 * under/overflow slopes and the priority/method selectors.
 * NOTE(review): both loops visit (1 << *_LOG2) + 1 entries — presumably
 * the tables include both endpoints; confirm array sizes in dla_interface.h.
 */
void
dla_debug_lut_params(struct dla_lut_param *lut_param)
{
int32_t i, j;
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW dla_lut_param\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("linear_exp_table = [\n");
for (i = 0; i < (1<<LUT_LINEAR_EXP_TABLE_ENTRY_LOG2)+1; i++)
dla_debug(" %u", lut_param->linear_exp_table[i]);
dla_debug("]");
dla_debug("linear_only_table = [\n");
for (j = 0; j < (1<<LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2)+1; j++)
dla_debug(" %u\n", lut_param->linear_only_table[j]);
dla_debug("]\n");
dla_debug("linear_exp_offset =\n");
dla_debug_lut_offset(&lut_param->linear_exp_offset);
dla_debug("linear_only_offset =\n");
dla_debug_lut_offset(&lut_param->linear_only_offset);
dla_debug("linear_exp_start = %llu\n",
lut_param->linear_exp_start);
dla_debug("linear_exp_end = %llu\n",
lut_param->linear_exp_end);
dla_debug("linear_only_start = %llu\n",
lut_param->linear_only_start);
dla_debug("linear_only_end = %llu\n",
lut_param->linear_only_end);
dla_debug("linear_exp_underflow_slope =\n");
dla_debug_dla_slope(&lut_param->linear_exp_underflow_slope);
dla_debug("linear_exp_overflow_slope =\n");
dla_debug_dla_slope(&lut_param->linear_exp_overflow_slope);
dla_debug("linear_only_underflow_slope =\n");
dla_debug_dla_slope(&lut_param->linear_only_underflow_slope);
dla_debug("linear_only_overflow_slope =\n");
dla_debug_dla_slope(&lut_param->linear_only_overflow_slope);
dla_debug("hybrid_priority = %u\n",
lut_param->hybrid_priority);
dla_debug("underflow_priority = %u\n",
lut_param->underflow_priority);
dla_debug("overflow_priority = %u\n",
lut_param->overflow_priority);
dla_debug("method = %u\n",
lut_param->method);
}
/* Dump BDMA profiling counters collected after an operation. */
void
dla_debug_bdma_stats(struct dla_bdma_stat_desc *stat)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW STATS: dla_bdma_stat_desc\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("read_stall = %u\n", stat->read_stall);
dla_debug("write_stall = %u\n", stat->write_stall);
dla_debug("runtime = %u\n", stat->runtime);
}
/* Dump all data cubes of a convolution surface descriptor for one ROI. */
void
dla_debug_conv_surface_desc(struct dla_conv_surface_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_conv_surface_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("weight_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->weight_data);
dla_debug("wmb_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->wmb_data);
dla_debug("wgs_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->wgs_data);
dla_debug("src_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->src_data);
dla_debug("dst_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->dst_data);
dla_debug("offset_u = %lld\n", desc->offset_u);
dla_debug("in_line_uv_stride = %u\n", desc->in_line_uv_stride);
}
/* Dump every field of a convolution operation descriptor for one ROI. */
void
dla_debug_conv_op_desc(struct dla_conv_op_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_conv_op_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("conv_mode = %u\n", desc->conv_mode);
dla_debug("data_reuse = %u\n", desc->data_reuse);
dla_debug("weight_reuse = %u\n", desc->weight_reuse);
dla_debug("skip_data_rls = %u\n", desc->skip_data_rls);
dla_debug("skip_weight_rls = %u\n", desc->skip_weight_rls);
dla_debug("entry_per_slice = %u\n", desc->entry_per_slice);
dla_debug("data_format = %u\n", desc->data_format);
dla_debug("pixel_mapping = %u\n", desc->pixel_mapping);
dla_debug("fetch_grain = %u\n", desc->fetch_grain);
dla_debug("batch = %u\n", desc->batch);
dla_debug("weight_format = %u\n", desc->weight_format);
dla_debug("data_bank = %u\n", desc->data_bank);
dla_debug("weight_bank = %u\n", desc->weight_bank);
dla_debug("batch_stride = %u\n", desc->batch_stride);
dla_debug("post_extension = %u\n", desc->post_extension);
dla_debug("pixel_override = %u\n", desc->pixel_override);
dla_debug("release = %u\n", desc->release);
dla_debug("input_width_csc = %u\n", desc->input_width_csc);
dla_debug("input_height_csc = %u\n", desc->input_height_csc);
dla_debug("input_channel_csc = %u\n", desc->input_channel_csc);
dla_debug("kernel_width_csc = %u\n", desc->kernel_width_csc);
dla_debug("kernel_height_csc = %u\n", desc->kernel_height_csc);
dla_debug("kernel_channel_csc = %u\n", desc->kernel_channel_csc);
dla_debug("input_width_cmac = %u\n", desc->input_width_cmac);
dla_debug("input_height_cmac = %u\n", desc->input_height_cmac);
dla_debug("bytes_per_kernel = %u\n", desc->bytes_per_kernel);
dla_debug("mean_ry = %d\n", desc->mean_ry);
dla_debug("mean_gu = %d\n", desc->mean_gu);
dla_debug("mean_bv = %d\n", desc->mean_bv);
dla_debug("mean_ax = %d\n", desc->mean_ax);
dla_debug("mean_format = %u\n", desc->mean_format);
dla_debug("conv_stride_x = %u\n", desc->conv_stride_x);
dla_debug("conv_stride_y = %u\n", desc->conv_stride_y);
dla_debug("pad_x_left = %u\n", desc->pad_x_left);
dla_debug("pad_x_right = %u\n", desc->pad_x_right);
dla_debug("pad_y_top = %u\n", desc->pad_y_top);
dla_debug("pad_y_bottom = %u\n", desc->pad_y_bottom);
dla_debug("dilation_x = %u\n", desc->dilation_x);
dla_debug("dilation_y = %u\n", desc->dilation_y);
dla_debug("pra_truncate = %u\n", desc->pra_truncate);
dla_debug("in_precision = %u\n", desc->in_precision);
dla_debug("out_precision = %u\n", desc->out_precision);
dla_debug("pad_val = %d\n", desc->pad_val);
dla_debug("in_cvt =\n");
dla_debug_converter(&desc->in_cvt);
dla_debug("out_cvt =\n");
dla_debug_converter(&desc->out_cvt);
}
/* Dump convolution-pipeline profiling counters. */
void
dla_debug_conv_stats(struct dla_conv_stat_desc *stat)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW STATS: dla_conv_stat_desc\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("data_read_stall = %u\n", stat->data_read_stall);
dla_debug("weight_read_stall = %u\n", stat->weight_read_stall);
dla_debug("data_read_latency = %u\n", stat->data_read_latency);
dla_debug("weight_read_latency = %u\n", stat->weight_read_latency);
dla_debug("saturation_count = %u\n", stat->saturation_count);
dla_debug("nan_data_num = %u\n", stat->nan_data_num);
dla_debug("nan_weight_num = %u\n", stat->nan_weight_num);
dla_debug("inf_data_num = %u\n", stat->inf_data_num);
dla_debug("inf_weight_num = %u\n", stat->inf_weight_num);
dla_debug("runtime = %u\n", stat->runtime);
}
/* Dump source/destination cubes of a PDP surface descriptor for one ROI. */
void
dla_debug_pdp_surface_desc(struct dla_pdp_surface_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_pdp_surface_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("src_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->src_data);
dla_debug("dst_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->dst_data);
}
/* Dump a pooling (PDP) operation descriptor for one ROI. */
void
dla_debug_pdp_op_desc(struct dla_pdp_op_desc *desc, int32_t roi)
{
int32_t i;
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_pdp_op_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("precision = %u\n", desc->precision);
dla_debug("padding_value = [\n");
for (i = 0; i < PDP_PAD_VAL_NUM; i++)
dla_debug(" %d\n", desc->padding_value[i]);
dla_debug("]\n");
dla_debug("split_num = %u\n", desc->split_num);
dla_debug("partial_in_width_first = %u\n",
desc->partial_in_width_first);
dla_debug("partial_in_width_mid = %u\n", desc->partial_in_width_mid);
dla_debug("partial_in_width_last = %u\n", desc->partial_in_width_last);
dla_debug("partial_width_first = %u\n", desc->partial_width_first);
dla_debug("partial_width_mid = %u\n", desc->partial_width_mid);
dla_debug("partial_width_last = %u\n", desc->partial_width_last);
dla_debug("pool_mode = %u\n", desc->pool_mode);
dla_debug("pool_width = %u\n", desc->pool_width);
dla_debug("pool_height = %u\n", desc->pool_height);
dla_debug("stride_x = %u\n", desc->stride_x);
dla_debug("stride_y = %u\n", desc->stride_y);
dla_debug("pad_left = %u\n", desc->pad_left);
dla_debug("pad_right = %u\n", desc->pad_right);
dla_debug("pad_top = %u\n", desc->pad_top);
dla_debug("pad_bottom = %u\n", desc->pad_bottom);
}
/* Dump PDP profiling counters. */
void
dla_debug_pdp_stats(struct dla_pdp_stat_desc *stat)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW STATS: dla_pdp_stat_desc\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("inf_input_num = %u\n", stat->inf_input_num);
dla_debug("nan_input_num = %u\n", stat->nan_input_num);
dla_debug("nan_output_num = %u\n", stat->nan_output_num);
dla_debug("write_stall = %u\n", stat->write_stall);
dla_debug("runtime = %u\n", stat->runtime);
}
/* Dump source/destination cubes of a CDP surface descriptor for one ROI. */
void
dla_debug_cdp_surface_desc(struct dla_cdp_surface_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_cdp_surface_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("src_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->src_data);
dla_debug("dst_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->dst_data);
}
/* Dump a cross-channel normalization (CDP) operation descriptor. */
void
dla_debug_cdp_op_desc(struct dla_cdp_op_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_cdp_op_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("in_precision = %u\n", desc->in_precision);
dla_debug("out_precision = %u\n", desc->out_precision);
dla_debug("lut_index = %d\n", desc->lut_index);
dla_debug("in_cvt =\n");
dla_debug_converter(&desc->in_cvt);
dla_debug("out_cvt =\n");
dla_debug_converter(&desc->out_cvt);
dla_debug("local_size = %u\n", desc->local_size);
dla_debug("bypass_sqsum = %u\n", desc->bypass_sqsum);
dla_debug("bypass_out_mul = %u\n", desc->bypass_out_mul);
}
/* Dump CDP profiling counters, including LUT hit/overflow statistics. */
void
dla_debug_cdp_stats(struct dla_cdp_stat_desc *stat)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW STATS: dla_cdp_stat_desc\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("nan_input_num = %u\n", stat->nan_input_num);
dla_debug("inf_input_num = %u\n", stat->inf_input_num);
dla_debug("nan_output_num = %u\n", stat->nan_output_num);
dla_debug("write_stall = %u\n", stat->write_stall);
dla_debug("lut_uflow = %u\n", stat->lut_uflow);
dla_debug("lut_oflow = %u\n", stat->lut_oflow);
dla_debug("lut_hybrid = %u\n", stat->lut_hybrid);
dla_debug("lut_le_hit = %u\n", stat->lut_le_hit);
dla_debug("lut_lo_hit = %u\n", stat->lut_lo_hit);
dla_debug("saturation_count = %u\n", stat->saturation_count);
dla_debug("runtime = %u\n", stat->runtime);
}
/* Dump source/destination cubes of a RUBIK surface descriptor. */
void
dla_debug_rubik_surface_desc(struct dla_rubik_surface_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_rubik_surface_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("src_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->src_data);
dla_debug("dst_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->dst_data);
}
/* Dump a RUBIK (data-reshape) operation descriptor. */
void
dla_debug_rubik_op_desc(struct dla_rubik_op_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_rubik_op_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("mode = %u\n", desc->mode);
dla_debug("precision = %u\n", desc->precision);
dla_debug("stride_x = %u\n", desc->stride_x);
dla_debug("stride_y = %u\n", desc->stride_y);
}
/* Dump RUBIK profiling counters. */
void
dla_debug_rubik_stats(struct dla_rubik_stat_desc *stat)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW STATS: dla_rubik_stat_desc\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("read_stall = %u\n", stat->read_stall);
dla_debug("write_stall = %u\n", stat->write_stall);
dla_debug("runtime = %u\n", stat->runtime);
}
/* Dump one SDP sub-operation (X1/X2/Y stage) including its converters. */
static void
dla_debug_sdp_op(struct dla_sdp_op *sdp_op)
{
dla_debug("    enable = %u\n", sdp_op->enable);
dla_debug("    alu_type = %u\n", sdp_op->alu_type);
dla_debug("    type = %u\n", sdp_op->type);
dla_debug("    mode = %u\n", sdp_op->mode);
dla_debug("    act = %u\n", sdp_op->act);
dla_debug("    shift_value = %u\n", sdp_op->shift_value);
dla_debug("    truncate = %u\n", sdp_op->truncate);
dla_debug("    precision = %u\n", sdp_op->precision);
dla_debug("    alu_operand = %d\n", sdp_op->alu_operand);
dla_debug("    mul_operand = %d\n", sdp_op->mul_operand);
dla_debug("cvt.alu_cvt =\n");
dla_debug_converter(&sdp_op->cvt.alu_cvt);
dla_debug("cvt.mul_cvt =\n");
dla_debug_converter(&sdp_op->cvt.mul_cvt);
dla_debug("]\n");
}
/* Dump all data cubes of an SDP surface descriptor for one ROI. */
void
dla_debug_sdp_surface_desc(struct dla_sdp_surface_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_sdp_surface_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("src_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->src_data);
dla_debug("x1_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->x1_data);
dla_debug("x2_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->x2_data);
dla_debug("y_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->y_data);
dla_debug("dst_data = [ dla_data_cube =>\n");
dla_debug_data_cube(&desc->dst_data);
}
/* Dump an SDP operation descriptor and its three sub-operation stages. */
void
dla_debug_sdp_op_desc(struct dla_sdp_op_desc *desc, int32_t roi)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW ROI[%d]: dla_sdp_op_desc\n", roi);
dla_debug("---------------------------------------------------------\n");
dla_debug("src_precision = %u\n", desc->src_precision);
dla_debug("dst_precision = %u\n", desc->dst_precision);
dla_debug("lut_index = %d\n", desc->lut_index);
dla_debug("out_cvt =\n");
dla_debug_converter(&desc->out_cvt);
dla_debug("conv_mode = %u\n", desc->conv_mode);
dla_debug("batch_num = %u\n", desc->batch_num);
dla_debug("batch_stride = %u\n", desc->batch_stride);
dla_debug("x1_op = [ dla_sdp_op =>\n");
dla_debug_sdp_op(&desc->x1_op);
dla_debug("x2_op = [ dla_sdp_op =>\n");
dla_debug_sdp_op(&desc->x2_op);
dla_debug("y_op = [ dla_sdp_op =>\n");
dla_debug_sdp_op(&desc->y_op);
}
/* Dump SDP profiling counters, including LUT hit/overflow statistics. */
void
dla_debug_sdp_stats(struct dla_sdp_stat_desc *stat)
{
dla_debug("*********************************************************\n");
dla_debug("NVDLA FW STATS: dla_sdp_stat_desc\n");
dla_debug("---------------------------------------------------------\n");
dla_debug("nan_input_num = %u\n", stat->nan_input_num);
dla_debug("inf_input_num = %u\n", stat->inf_input_num);
dla_debug("nan_output_num = %u\n", stat->nan_output_num);
dla_debug("wdma_write_stall = %u\n", stat->wdma_write_stall);
dla_debug("lut_underflow = %u\n", stat->lut_underflow);
dla_debug("lut_overflow = %u\n", stat->lut_overflow);
dla_debug("lut_hybrid = %u\n", stat->lut_hybrid);
dla_debug("lut_le_hit = %u\n", stat->lut_le_hit);
dla_debug("lut_lo_hit = %u\n", stat->lut_lo_hit);
dla_debug("saturation_count = %u\n", stat->saturation_count);
dla_debug("runtime = %u\n", stat->runtime);
}
#endif /* DEBUG_NETWORK_DATA */

View file

@ -0,0 +1,129 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * engine_debug.h - debug-dump entry points for the DLA firmware engine.
 *
 * When DEBUG_NETWORK_DATA (see dla_debug.h) is non-zero, the real dump
 * routines from engine_debug.c are declared; otherwise every entry point
 * compiles away to an empty static inline stub so call sites need no
 * #ifdef guards.
 */
#ifndef __FIRMWARE_ENGINE_DEBUG_H_
#define __FIRMWARE_ENGINE_DEBUG_H_
#include <dla_debug.h>
#include <dla_interface.h>
#if DEBUG_NETWORK_DATA
/* Real implementations: dump descriptors/statistics via dla_debug(). */
void
dla_debug_op_desc(struct dla_common_op_desc *desc, int32_t roi);
void
dla_debug_network_desc(struct dla_network_desc *network_desc);
void
dla_debug_address_info(struct dla_task *task);
void
dla_debug_bdma_surface_desc(struct dla_bdma_surface_desc *desc, int32_t roi);
void
dla_debug_bdma_op_desc(struct dla_bdma_op_desc *desc, int32_t roi);
void
dla_debug_bdma_stats(struct dla_bdma_stat_desc *stat);
void
dla_debug_conv_surface_desc(struct dla_conv_surface_desc *desc, int32_t roi);
void
dla_debug_conv_op_desc(struct dla_conv_op_desc *desc, int32_t roi);
void
dla_debug_conv_stats(struct dla_conv_stat_desc *stat);
void
dla_debug_sdp_op_desc(struct dla_sdp_op_desc *desc, int32_t roi);
void
dla_debug_sdp_surface_desc(struct dla_sdp_surface_desc *desc, int32_t roi);
void
dla_debug_sdp_stats(struct dla_sdp_stat_desc *stat);
void
dla_debug_pdp_surface_desc(struct dla_pdp_surface_desc *desc, int32_t roi);
void
dla_debug_pdp_op_desc(struct dla_pdp_op_desc *desc, int32_t roi);
void
dla_debug_pdp_stats(struct dla_pdp_stat_desc *stat);
void
dla_debug_cdp_surface_desc(struct dla_cdp_surface_desc *desc, int32_t roi);
void
dla_debug_cdp_op_desc(struct dla_cdp_op_desc *desc, int32_t roi);
void
dla_debug_cdp_stats(struct dla_cdp_stat_desc *stat);
void
dla_debug_rubik_op_desc(struct dla_rubik_op_desc *desc, int32_t roi);
void
dla_debug_rubik_surface_desc(struct dla_rubik_surface_desc *desc, int32_t roi);
void
dla_debug_rubik_stats(struct dla_rubik_stat_desc *stat);
void
dla_debug_lut_params(struct dla_lut_param *lut_param);
#else
/* No-op stubs when network-data debugging is compiled out. */
static inline void
dla_debug_op_desc(struct dla_common_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_network_desc(struct dla_network_desc *network_desc) {}
static inline void
dla_debug_address_info(struct dla_task *task) {}
static inline void
dla_debug_bdma_surface_desc(struct dla_bdma_surface_desc *desc, int32_t roi) {}
static inline void
dla_debug_bdma_op_desc(struct dla_bdma_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_bdma_stats(struct dla_bdma_stat_desc *stat) {}
static inline void
dla_debug_conv_surface_desc(struct dla_conv_surface_desc *desc, int32_t roi) {}
static inline void
dla_debug_conv_op_desc(struct dla_conv_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_conv_stats(struct dla_conv_stat_desc *stat) {}
static inline void
dla_debug_sdp_op_desc(struct dla_sdp_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_sdp_surface_desc(struct dla_sdp_surface_desc *desc, int32_t roi) {}
static inline void
dla_debug_sdp_stats(struct dla_sdp_stat_desc *stat) {}
static inline void
dla_debug_pdp_surface_desc(struct dla_pdp_surface_desc *desc, int32_t roi) {}
static inline void
dla_debug_pdp_op_desc(struct dla_pdp_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_pdp_stats(struct dla_pdp_stat_desc *stat) {}
static inline void
dla_debug_cdp_surface_desc(struct dla_cdp_surface_desc *desc, int32_t roi) {}
static inline void
dla_debug_cdp_op_desc(struct dla_cdp_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_cdp_stats(struct dla_cdp_stat_desc *stat) {}
static inline void
dla_debug_rubik_op_desc(struct dla_rubik_op_desc *desc, int32_t roi) {}
static inline void
dla_debug_rubik_surface_desc(struct dla_rubik_surface_desc *desc, int32_t roi) {}
static inline void
dla_debug_rubik_stats(struct dla_rubik_stat_desc *stat) {}
static inline void
dla_debug_lut_params(struct dla_lut_param *lut_param) {}
#endif /* DEBUG_NETWORK_DATA */
#endif /* __FIRMWARE_ENGINE_DEBUG_H_ */

136
drivers/nvdla/engine_isr.c Normal file
View file

@ -0,0 +1,136 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_engine.h>
#include <dla_interface.h>
#include "dla_engine_internal.h"
int32_t dla_isr_handler(void *engine_data)
{
uint32_t mask;
uint32_t reg;
struct dla_processor *processor = NULL;
struct dla_processor_group *group;
struct dla_engine *engine = (struct dla_engine *)engine_data;
mask = glb_reg_read(S_INTR_MASK);
reg = glb_reg_read(S_INTR_STATUS);
dla_trace("Enter: dla_isr_handler, reg:%x, mask:%x\n", reg, mask);
if (reg & MASK(GLB_S_INTR_STATUS_0, CACC_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_CONV];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CACC_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_CONV];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, SDP_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_SDP];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, SDP_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_SDP];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CDP_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_CDP];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CDP_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_CDP];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, RUBIK_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_RUBIK];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, RUBIK_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_RUBIK];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, PDP_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_PDP];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, PDP_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_PDP];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, BDMA_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_BDMA];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, BDMA_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_BDMA];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_OP_COMPLETED);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_DAT_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_CONV];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_CDMA_DT_DONE);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_DAT_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_CONV];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_CDMA_DT_DONE);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_WT_DONE_STATUS0)) {
processor = &engine->processors[DLA_OP_CONV];
group = &processor->groups[0];
group->events |= (1 << DLA_EVENT_CDMA_WT_DONE);
}
if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_WT_DONE_STATUS1)) {
processor = &engine->processors[DLA_OP_CONV];
group = &processor->groups[1];
group->events |= (1 << DLA_EVENT_CDMA_WT_DONE);
}
glb_reg_write(S_INTR_STATUS, reg);
mask = glb_reg_read(S_INTR_MASK);
reg = glb_reg_read(S_INTR_STATUS);
dla_trace("Exit: dla_isr_handler, reg:%x, mask:%x\n", reg, mask);
RETURN(0);
}

View file

@ -0,0 +1,94 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * dla_debug.h - logging, tracing and assertion helpers for the DLA firmware.
 *
 * Fixes:
 *  - FILENAME previously re-stringified __FILE__, which is already a string
 *    literal, embedding escaped quotes in every assertion message.
 *  - The non-DEBUG_ASSERT ASSERT_GOTO referenced the undeclared token
 *    `condition` and never assigned _ret or jumped, leaving _ret
 *    uninitialized; it now keeps the same control flow as the debug build,
 *    minus the error message.
 */
#ifndef __FIRMWARE_DLA_DEBUG_H_
#define __FIRMWARE_DLA_DEBUG_H_
/* STRINGIFY quotes its token verbatim; DEFER_STRINGIFY expands macro
 * arguments (e.g. __LINE__) first, then quotes the expansion. */
#define STRINGIFY(s) #s
#define DEFER_STRINGIFY(s) STRINGIFY(s)
/* Current line number as a string literal. */
#define FILELINE DEFER_STRINGIFY(__LINE__)
/* __FILE__ is already a string literal; use it directly. */
#define FILENAME __FILE__
/* Bit positions for packing a log event word (see LOG_EVENT below). */
#define LOG_EVENT_BDMA_SHIFT 0U
#define LOG_EVENT_CONV_SHIFT 4U
#define LOG_EVENT_SDP_SHIFT 8U
#define LOG_EVENT_PDP_SHIFT 12U
#define LOG_EVENT_CDP_SHIFT 16U
#define LOG_EVENT_RBK_SHIFT 20U
#define LOG_EVENT_GROUP_SHIFT 24U
#define LOG_EVENT_ROI_SHIFT 28U
/* Event identifiers recorded at scheduler phase boundaries. */
#define LOG_TASK_START 1
#define LOG_TASK_END 2
#define LOG_READ_OP_CONFIG_START 3
#define LOG_READ_OP_CONFIG_END 4
#define LOG_READ_SURF_CONFIG_START 5
#define LOG_READ_SURF_CONFIG_END 6
#define LOG_PROGRAM_START 7
#define LOG_PROGRAM_END 8
#define LOG_OPERATION_START 9
#define LOG_OPERATION_END 10
/* Event logging is compiled out in this build. */
#define LOG_EVENT(roi, group, processor, event)
/**
 * Used to enable/disable reading stat registers
 */
#define STAT_ENABLE 1
/**
 * Used to print debug network data
 */
#define DEBUG_NETWORK_DATA 0
/* Tracing and the firmware assert() are compiled out in this build. */
#define pr_dump_stack(format, ...)
#define dla_trace(format, ...)
#define assert(condition)
#define RETURN(err) { return (err); }
#define DEBUG_ASSERT
#ifdef DEBUG_ASSERT
/*
 * Evaluate _condition; on failure log the failing site, set _ret to
 * _err_value and jump to _goto.  On success _ret is set to 0.
 */
#define ASSERT_GOTO(_condition, _ret, _err_value, _goto) \
do { \
if (!(_condition)) { \
dla_error("Assertion Fail(" FILENAME ":" FILELINE "):" \
STRINGIFY(_condition)); \
_ret = _err_value; \
goto _goto; \
} else { \
_ret = 0; \
} \
} while (0)
#else
/* Same control flow as the debug build, without the error message. */
#define ASSERT_GOTO(_condition, _ret, _err_value, _goto) \
do { \
if (!(_condition)) { \
_ret = _err_value; \
goto _goto; \
} else { \
_ret = 0; \
} \
} while (0)
#endif /* DEBUG_ASSERT */
#endif

View file

@ -0,0 +1,94 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * dla_engine.h - core scheduler state for the DLA firmware engine:
 * per-group execution state, per-processor dispatch tables, and the
 * top-level engine container.
 */
#ifndef __DLA_ENGINE_H_
#define __DLA_ENGINE_H_
#include <dla_interface.h>
#include <dla_sched.h>
/*
 * State of one execution group within a processor.  Each processor owns
 * DLA_NUM_GROUPS of these; presumably one group is programmed while the
 * other executes (double buffering) -- confirm against the scheduler.
 */
struct dla_processor_group {
uint8_t id;
uint8_t rdma_id;
uint8_t active;
uint8_t events; /* bitmask of DLA_EVENT_* set by dla_isr_handler() */
uint8_t roi_index;
uint8_t is_rdma_needed;
uint8_t pending;
int32_t lut_index;
uint8_t programming;
uint64_t start_time;
struct dla_common_op_desc *op_desc;
struct dla_common_op_desc *consumers[DLA_OP_NUM];
struct dla_common_op_desc *fused_parent;
union dla_operation_container *operation_desc;
union dla_surface_container *surface_desc;
};
/*
 * One hardware sub-unit (BDMA/CONV/SDP/PDP/CDP/RUBIK) together with the
 * function table the scheduler uses to program, enable and profile it.
 */
struct dla_processor {
const char *name;
uint8_t op_type; /* DLA_OP_* identifier for this unit */
uint8_t consumer_ptr;
uint8_t roi_index;
uint8_t group_status;
uint8_t rdma_status;
uint8_t last_group;
struct dla_common_op_desc *tail_op;
struct dla_processor_group groups[DLA_NUM_GROUPS];
union dla_stat_container *stat_data_desc;
int32_t (*is_ready)(struct dla_processor *processor,
struct dla_processor_group *group);
int32_t (*enable)(struct dla_processor_group *group);
int32_t (*program)(struct dla_processor_group *group);
void (*set_producer)(int32_t group_id, int32_t rdma_id);
void (*dump_config)(struct dla_processor_group *group);
void (*rdma_check)(struct dla_processor_group *group);
void (*get_stat_data)(struct dla_processor *processor,
struct dla_processor_group *group);
void (*dump_stat)(struct dla_processor *processor);
};
/*
 * Top-level engine state: the task being executed, its network
 * description, and one dla_processor per operation type.
 */
struct dla_engine {
struct dla_task *task;
struct dla_config *config_data;
struct dla_network_desc *network;
struct dla_processor processors[DLA_OP_NUM];
uint16_t num_proc_hwl;
int32_t status;
uint32_t stat_enable; /* non-zero: read HW stat registers after ops */
void *driver_context; /* opaque pointer back to the OS driver */
};
/* Accessor for the single global engine instance. */
struct dla_engine *dla_get_engine(void);
#endif

View file

@ -0,0 +1,50 @@
/*
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FIRMWARE_DLA_ERR_H_
#define __FIRMWARE_DLA_ERR_H_
/*
 * Convert an error name into the negative return value used throughout the
 * firmware, e.g. ERR(NO_MEM) expands to -DLA_ERR_NO_MEM == -8.
 */
#define ERR(code) -DLA_ERR_##code
/* DLA firmware error codes (returned negated via ERR()) */
#define DLA_ERR_NONE 0
#define DLA_ERR_INVALID_METHOD 1
#define DLA_ERR_INVALID_TASK 2
#define DLA_ERR_INVALID_INPUT 3
#define DLA_ERR_INVALID_FALC_DMA 4
#define DLA_ERR_INVALID_QUEUE 5
#define DLA_ERR_INVALID_PREACTION 6
#define DLA_ERR_INVALID_POSTACTION 7
#define DLA_ERR_NO_MEM 8
#define DLA_ERR_INVALID_DESC_VER 9
#define DLA_ERR_INVALID_ENGINE_ID 10
#define DLA_ERR_INVALID_REGION 11
#define DLA_ERR_PROCESSOR_BUSY 12
#define DLA_ERR_RETRY 13
#define DLA_ERR_TASK_STATUS_MISMATCH 14
#endif

View file

@ -0,0 +1,886 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FIRMWARE_DLA_INTERFACE_H_
#define __FIRMWARE_DLA_INTERFACE_H_
#include <nvdla_interface.h>
/**
* @ingroup Processors
* @name DLA Processors
 * Processor modules in the DLA engine. Each processor has its
 * own operation, a.k.a. HW layer. A network is formed as a
 * graph of these operations.
* @{
*/
#define DLA_OP_BDMA 0
#define DLA_OP_CONV 1
#define DLA_OP_SDP 2
#define DLA_OP_PDP 3
#define DLA_OP_CDP 4
#define DLA_OP_RUBIK 5
/** @} */
/**
* @ingroup Processors
* @name Maximum number of processors
 * @brief DLA has 6 processors
* @{
*/
#define DLA_OP_NUM 6
/** @} */
/**
* @ingroup Processors
* @name Number of groups
* @brief Each processor has 2 groups of registers
* @{
*/
#define DLA_NUM_GROUPS 2
/** @} */
/**
* Network descriptor
*
* Contains all information to execute a network
*
* @op_head: Index of first operation of each type in operations list
* @num_rois: Number of ROIs
* @num_operations: Number of operations in one list
* @num_luts: Number of LUTs
*/
/**
 * Network descriptor: everything needed to execute one network.
 *
 * NOTE(review): the *_index fields appear to be indices into the task
 * address list (same convention as dla_data_cube.address) — confirm
 * against the UMD loader.
 */
struct dla_network_desc {
int16_t operation_desc_index; /* operation descriptor list */
int16_t surface_desc_index; /* surface descriptor list */
int16_t dependency_graph_index; /* dla_common_op_desc list */
int16_t lut_data_index; /* dla_lut_param list */
int16_t roi_array_index; /* ROI array; valid when dynamic_roi is set */
int16_t surface_index;
int16_t stat_list_index; /* statistics buffer */
int16_t reserved1;
int16_t op_head[DLA_OP_NUM]; /* first operation of each processor type */
uint16_t num_rois; /* number of ROIs */
uint16_t num_operations; /* number of operations in one list */
uint16_t num_luts; /* number of LUTs */
uint16_t num_addresses; /* number of entries in the task address list */
int16_t input_layer; /* index of the network input layer */
uint8_t dynamic_roi; /* non-zero if ROIs are supplied at runtime */
uint8_t reserved0;
} __packed __aligned(4);
/**
* @name Memory types
 * @brief DLA engine can read/write to/from 3 memory types
* @{
*/
#define DLA_MEM_MC 0 /* External DRAM */
#define DLA_MEM_CV 1 /* CV-SRAM */
#define DLA_MEM_HW 2 /* DLA sub-module */
/** @} */
/**
* @ingroup Events
* @name Operation events
* @brief Different events triggered by an operations
* @{
*/
#define DLA_EVENT_OP_COMPLETED 1
#define DLA_EVENT_OP_PROGRAMMED 2
#define DLA_EVENT_OP_ENABLED 3
#define DLA_EVENT_CDMA_WT_DONE 4
#define DLA_EVENT_CDMA_DT_DONE 5
/** @} */
/* One consumer edge in the operation dependency graph. */
struct dla_consumer {
int16_t index; /* the index of dla_common_op_desc in dep_graph_addr */
uint8_t event; /* DLA_EVENT_* the consumer is waiting for */
uint8_t res;
} __packed __aligned(4);
/* Scheduling metadata common to every operation type. */
struct dla_common_op_desc {
int16_t index; /* set by ucode */
int8_t roi_index; /* ROI this operation instance belongs to */
uint8_t op_type; /* DLA_OP_* processor type of this operation */
uint8_t dependency_count; /* number of outstanding dependencies */
uint8_t reserved0[3];
struct dla_consumer consumers[DLA_OP_NUM]; /* one consumer per processor type */
struct dla_consumer fused_parent; /* producer this op is fused with, if any */
} __packed __aligned(4);
/* Header preceding the list of dla_roi_desc entries. */
struct dla_roi_array_desc {
uint32_t array_length; /* number of dla_roi_desc entries that follow */
uint32_t array_reserved;
} __packed __aligned(4);
/* One region of interest; coordinate units not defined here — TODO confirm. */
struct dla_roi_desc {
uint32_t left;
uint32_t top;
uint32_t right;
uint32_t bottom;
} __packed __aligned(4);
/**
* @ingroup BDMA
* @name Maximum BDMA transfers
* @brief BDMA supports multiple transfers in operation. This indicates
* maximum number of transfers possible in one operation.
* @{
*/
#define NUM_MAX_BDMA_OPS 20
/** @} */
/*
 * One BDMA transfer: geometry of a strided line/surface copy.
 * NOTE(review): the line/surface fields mirror the BDMA register layout;
 * exact stride-vs-count semantics should be confirmed against the HW manual.
 */
struct dla_bdma_transfer_desc {
int16_t source_address; /* presumably an index into the task address list */
int16_t destination_address; /* presumably an index into the task address list */
uint32_t line_size;
uint32_t line_repeat;
uint32_t source_line;
uint32_t destination_line;
uint32_t surface_repeat;
uint32_t source_surface;
uint32_t destination_surface;
} __packed __aligned(4);
/* Surface description for a BDMA operation (up to NUM_MAX_BDMA_OPS transfers). */
struct dla_bdma_surface_desc {
uint8_t source_type; /* DLA_MEM_* memory type of the source */
uint8_t destination_type; /* DLA_MEM_* memory type of the destination */
uint16_t num_transfers; /* number of valid entries in transfers[] */
struct dla_bdma_transfer_desc transfers[NUM_MAX_BDMA_OPS];
} __packed __aligned(4);
/* Operation parameters for BDMA. */
struct dla_bdma_op_desc {
uint16_t num_transfers;
uint16_t reserved0;
} __packed __aligned(4);
/* Statistics counters reported by BDMA. */
struct dla_bdma_stat_desc {
uint32_t read_stall;
uint32_t write_stall;
uint32_t runtime;
} __packed __aligned(4);
/**
* @ingroup Convolution
* @name Convolution mode
* @brief Convolution modes support by DLA
* @{
*/
#define CONV_MODE_DIRECT 0
#define CONV_MODE_WINOGRAD 1
/** @} */
/**
* @ingroup Processors
* @name Precision BPE mapping
* @brief Precision formats and Bit Per Elements mapping
* @{
*/
#define BPE_PRECISION_INT8 1
#define BPE_PRECISION_INT16 2
#define BPE_PRECISION_FP16 2
/** @} */
/**
* @ingroup Processors
* @name Precision types
* @brief Precision formats supported by DLA engine
* @{
*/
#define PRECISION_INT8 0
#define PRECISION_INT16 1
#define PRECISION_FP16 2
/** @} */
/**
* @ingroup Processors
* @name Data formats
* @brief Data formats supported by DLA engine
* @{
*/
#define FORMAT_T_R8 0
#define FORMAT_T_R10 1
#define FORMAT_T_R12 2
#define FORMAT_T_R16 3
#define FORMAT_T_R16_I 4
#define FORMAT_T_R16_F 5
#define FORMAT_T_A16B16G16R16 6
#define FORMAT_T_X16B16G16R16 7
#define FORMAT_T_A16B16G16R16_F 8
#define FORMAT_T_A16Y16U16V16 9
#define FORMAT_T_V16U16Y16A16 10
#define FORMAT_T_A16Y16U16V16_F 11
#define FORMAT_T_A8B8G8R8 12
#define FORMAT_T_A8R8G8B8 13
#define FORMAT_T_B8G8R8A8 14
#define FORMAT_T_R8G8B8A8 15
#define FORMAT_T_X8B8G8R8 16
#define FORMAT_T_X8R8G8B8 17
#define FORMAT_T_B8G8R8X8 18
#define FORMAT_T_R8G8B8X8 19
#define FORMAT_T_A2B10G10R10 20
#define FORMAT_T_A2R10G10B10 21
#define FORMAT_T_B10G10R10A2 22
#define FORMAT_T_R10G10B10A2 23
#define FORMAT_T_A2Y10U10V10 24
#define FORMAT_T_V10U10Y10A2 25
#define FORMAT_T_A8Y8U8V8 26
#define FORMAT_T_V8U8Y8A8 27
#define FORMAT_T_Y8___U8V8_N444 28
#define FORMAT_T_Y8___V8U8_N444 29
#define FORMAT_T_Y10___U10V10_N444 30
#define FORMAT_T_Y10___V10U10_N444 31
#define FORMAT_T_Y12___U12V12_N444 32
#define FORMAT_T_Y12___V12U12_N444 33
#define FORMAT_T_Y16___U16V16_N444 34
#define FORMAT_T_Y16___V16U16_N444 35
#define FORMAT_FEATURE 36
/** @} */
/**
* @ingroup Convolution
* @name Pixel mapping
* @brief Pixel mapping formats supported for image input in Convolution
* @{
*/
#define MAP_PITCH_LINEAR 0
/** @} */
/**
* @ingroup Convolution
* @name Weight formats
* @brief Weight data formats supported in Convolution
* @{
*/
#define WEIGHT_FORMAT_UNCOMPRESSED 0
#define WEIGHT_FORMAT_COMPRESSED 1
/** @} */
/**
* @ingroup Convolution
* @name Mean data format
* @brief Mean data formats supported in Convolution
* @{
*/
#define MEAN_FORMAT_DISABLE 0
#define MEAN_FORMAT_ENABLE 1
/** @} */
/* Precision-converter parameters (scale/truncate/offset stage). */
struct dla_cvt_param {
int16_t scale; /* multiplier */
uint8_t truncate; /* right-shift amount — TODO confirm */
uint8_t enable; /* non-zero to enable this converter */
int32_t offset;
} __packed __aligned(4);
/* Describes one data cube (tensor) in memory. */
struct dla_data_cube {
uint16_t type; /* dla_mem_type */
int16_t address; /* offset to the actual IOVA in task.address_list */
uint32_t offset; /* offset within address */
uint32_t size;
/* cube dimensions */
uint16_t width;
uint16_t height;
uint16_t channel;
uint16_t reserved0;
/* stride information */
uint32_t line_stride;
uint32_t surf_stride;
/* For Rubik only */
uint32_t plane_stride;
} __packed __aligned(4);
#define PIXEL_OVERRIDE_UINT 0
#define PIXEL_OVERRIDE_INT 1
/* Surface (data cube) set consumed/produced by a convolution operation. */
struct dla_conv_surface_desc {
/* Data cube */
struct dla_data_cube weight_data; /* kernel weights */
struct dla_data_cube wmb_data; /* weight mask bits, for compressed weights */
struct dla_data_cube wgs_data; /* weight group sizes, for compressed weights — TODO confirm */
struct dla_data_cube src_data; /* input feature/image data */
struct dla_data_cube dst_data; /* output data */
/**
 * u_addr = input_data.source_addr + offset_u
 * this field should be set when YUV is not interleave format
 *
 */
int64_t offset_u;
/* line stride for 2nd plane, must be 32bytes aligned */
uint32_t in_line_uv_stride;
} __packed __aligned(4);
/* Operation parameters for the convolution (CONV) processor. */
struct dla_conv_op_desc {
/* Performance parameters */
/* dla_conv_mode */
uint8_t conv_mode;
uint8_t data_reuse; /* reuse input data already in CBUF */
uint8_t weight_reuse; /* reuse weights already in CBUF */
uint8_t skip_data_rls; /* keep input data in CBUF after this op */
uint8_t skip_weight_rls; /* keep weights in CBUF after this op */
uint8_t reserved0;
uint16_t entry_per_slice;
/* dla_data_format */
uint8_t data_format;
/* dla_pixel_mapping */
uint8_t pixel_mapping;
/* number of free slices before fetch */
uint16_t fetch_grain;
uint8_t reserved_b[8];
/* batch_num */
uint8_t batch;
/* dla_weight_format */
uint8_t weight_format;
uint8_t data_bank; /* number of CBUF banks for data — TODO confirm */
uint8_t weight_bank; /* number of CBUF banks for weights — TODO confirm */
/* the offset in bytes of each data cube in a batch */
uint32_t batch_stride;
uint8_t post_extension;
uint8_t pixel_override; /* PIXEL_OVERRIDE_UINT or PIXEL_OVERRIDE_INT */
/* number of slices need to be released */
uint16_t release;
/* The input cube dimension for CSC */
uint16_t input_width_csc;
uint16_t input_height_csc;
uint16_t input_channel_csc;
uint16_t kernel_width_csc;
uint16_t kernel_height_csc;
uint16_t kernel_channel_csc;
/* The input cube dimension for CMAC */
uint16_t input_width_cmac;
uint16_t input_height_cmac;
/* actual size in bytes */
uint32_t bytes_per_kernel;
/* Algorithm parameters */
int16_t mean_ry; /* mean value for red in RGB or Y in YUV */
int16_t mean_gu; /* mean value for green in RGB or U in YUV */
int16_t mean_bv; /* mean value for blue in RGB or V in YUV */
int16_t mean_ax;
uint8_t mean_format; /* dla_mean_format */
uint8_t conv_stride_x;
uint8_t conv_stride_y;
uint8_t pad_x_left;
uint8_t pad_x_right;
uint8_t pad_y_top;
uint8_t pad_y_bottom;
uint8_t dilation_x;
uint8_t dilation_y;
uint8_t reserved2[2];
/* Precision parameters */
uint8_t pra_truncate;
uint8_t in_precision;
/* The output precision from CONV, it's the MAC processing precision */
uint8_t out_precision;
int16_t pad_val; /* value used for padded input elements */
/* input converter parameters */
struct dla_cvt_param in_cvt;
/* output converter parameters, support truncate only */
struct dla_cvt_param out_cvt;
} __packed __aligned(4);
/* Statistics counters reported by the convolution processor. */
struct dla_conv_stat_desc {
uint32_t data_read_stall;
uint32_t weight_read_stall;
uint32_t data_read_latency;
uint32_t weight_read_latency;
uint32_t saturation_count;
uint32_t nan_data_num;
uint32_t nan_weight_num;
uint32_t inf_data_num;
uint32_t inf_weight_num;
uint32_t runtime;
} __packed __aligned(4);
/**
* @ingroup SDP
* @name Activation functions
* @brief Activation functions supported in SDP
* @{
*/
#define ACTIVATION_NONE 0
#define ACTIVATION_RELU 1
#define ACTIVATION_LUT 2
#define ACTIVATION_PRELU 3
/** @} */
/**
* @ingroup LUT
* @name LUT size
 * @brief LUT sizes for linear and exponential LUT
* @{
*/
#define LUT_LINEAR_EXP_TABLE_ENTRY_LOG2 6
#define LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2 8
/** @} */
/**
* @ingroup LUT
* @name LUT types
 * @brief DLA supports two types of LUT, linear and exponential
* @{
*/
#define LUT_LINEAR_EXP_TABLE 0
#define LUT_LINEAR_ONLY_TABLE 1
/** @} */
/**
* @ingroup LUT
* @name LUT methods
 * @brief A LUT can be indexed using exponential or linear methods
* @{
*/
#define LUT_METHOD_EXPONENTIAL 0
#define LUT_METHOD_LINEAR 1
/** @} */
/**
* @ingroup LUT
* @name LUT
 * @brief Priority selection between the two LUT tables
* @{
*/
#define LUT_PRI_LINEAR_EXP 0
#define LUT_PRI_LINEAR_ONLY 1
/** @} */
/* Input-conditioning offset applied before a LUT lookup. */
union dla_lut_offset {
/**
 * Number to be subtracted in the log domain before looking up the
 * exponential table. It has the same definition as the hardware
 * field, so input scaling must also be taken into account when
 * setting this field.
 */
int8_t exp_offset;
/**
 * Number of bits to right-shift before looking up the
 * linear table.
 */
int8_t frac_bits;
uint16_t reserved0; /* presumably pads the union to 16 bits */
};
/**
 * This struct is used to represent floating point values by INT
 * suppose we have a float point number fp_x, it will be represented
 * as:
 *
 * fp_x = scale_int_x>>(shifter_x)
 *
 * This is very useful for INT pipeline;
 */
struct dla_float_data {
int16_t scale;
int8_t shifter;
uint8_t reserved0;
} __packed __aligned(4);
/**
 * For INT pipeline, we use the struct above to represent a floating number;
 * For FP16 pipeline, we should store the FP16 encoded value into a uint16_t
 * container
 */
union dla_slope {
struct dla_float_data data_i;
uint16_t data_f;
};
/* Full LUT configuration: table contents, ranges, slopes and priorities. */
struct dla_lut_param {
/**
 * value of expression ((1<<LUT_LINEAR_EXP_TABLE_ENTRY_LOG2)+1) is 65,
 * ((1<<LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2)+1) is 257, and int16_t is of
 * 2Byte. And below two statement's combined memory size is 644 Byte.
 *
 * NOTE: below two declaration combined size should always be multiple
 * of 4.
 */
int16_t linear_exp_table[(1<<LUT_LINEAR_EXP_TABLE_ENTRY_LOG2)+1];
int16_t linear_only_table[(1<<LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2)+1];
union dla_lut_offset linear_exp_offset;
union dla_lut_offset linear_only_offset;
/**
 * The start and end point of raw table,
 * valid when raw_method=LINEAR only
 */
uint64_t linear_exp_start;
uint64_t linear_exp_end;
uint64_t linear_only_start;
uint64_t linear_only_end;
/* slopes used to extrapolate outside the table's covered range */
union dla_slope linear_exp_underflow_slope;
union dla_slope linear_exp_overflow_slope;
union dla_slope linear_only_underflow_slope;
union dla_slope linear_only_overflow_slope;
/**
 * dla_lut_priority, when both lut are hit (or one overflows and
 * the other underflows), which one should be selected as output
 */
uint8_t hybrid_priority;
uint8_t underflow_priority;
uint8_t overflow_priority;
uint8_t method; /* dla_lut_method */
} __packed __aligned(4);
/* Surface (data cube) set for an SDP operation. */
struct dla_sdp_surface_desc {
/* Data cube */
/* source input cube, available when SDP working on offline mode */
struct dla_data_cube src_data;
/* X1 input cube */
struct dla_data_cube x1_data;
/* X2 input cube */
struct dla_data_cube x2_data;
/* Y input cube */
struct dla_data_cube y_data;
/* Output cube */
struct dla_data_cube dst_data;
} __packed __aligned(4);
/* dla_sdp_op_type: which sub-units (ALU/MUL) an SDP stage uses */
#define SDP_OP_NONE 0
#define SDP_OP_MUL 1
#define SDP_OP_ADD 2
#define SDP_OP_BOTH 3
/* dla_sdp_alu_op_type: ALU operation */
#define SDP_ALU_OP_MAX 0
#define SDP_ALU_OP_MIN 1
#define SDP_ALU_OP_SUM 2
#define SDP_ALU_OP_EQL 3
/* dla_sdp_op_mode: operand granularity */
#define SDP_OP_PER_LAYER 0
#define SDP_OP_PER_KERNEL 1
#define SDP_OP_PER_POINT 2
/* Converter pair for one SDP stage (ALU path and MUL path). */
struct dla_sdp_cvt {
struct dla_cvt_param alu_cvt;
struct dla_cvt_param mul_cvt;
} __packed __aligned(4);
/* Configuration of one SDP stage (X1, X2 or Y). */
struct dla_sdp_op {
uint8_t enable;
uint8_t alu_type; /* dla_sdp_alu_op_type */
uint8_t type; /* dla_sdp_op_type */
uint8_t mode; /* dla_sdp_op_mode */
uint8_t act; /* dla_act_type */
uint8_t shift_value; /* left shift */
uint8_t truncate;
uint8_t precision;
int32_t alu_operand; /* operand used in SDP_OP_PER_LAYER mode — TODO confirm */
int32_t mul_operand;
struct dla_sdp_cvt cvt;
} __packed __aligned(4);
/* Operation parameters for the SDP processor. */
struct dla_sdp_op_desc {
/* Precision parameters */
/* dla_precision */
uint8_t src_precision;
uint8_t dst_precision;
int16_t lut_index; /* LUT used for ACTIVATION_LUT, index into LUT list */
struct dla_cvt_param out_cvt;
/* Performance parameters */
/* dla_conv_mode */
uint8_t conv_mode;
uint8_t batch_num;
uint16_t reserved0;
uint32_t batch_stride; /* will be used when batch_num > 1 */
/* Algorithm parameters */
struct dla_sdp_op x1_op;
struct dla_sdp_op x2_op;
struct dla_sdp_op y_op;
} __packed __aligned(4);
/* Statistics counters reported by the SDP processor. */
struct dla_sdp_stat_desc {
uint32_t nan_input_num;
uint32_t inf_input_num;
uint32_t nan_output_num;
uint32_t wdma_write_stall;
uint32_t lut_underflow;
uint32_t lut_overflow;
uint32_t lut_hybrid;
uint32_t lut_le_hit;
uint32_t lut_lo_hit;
uint32_t saturation_count;
uint32_t runtime;
} __packed __aligned(4);
/* dla_pool_mode */
#define POOL_MODE_AVG 0
#define POOL_MODE_MAX 1
#define POOL_MODE_MIN 2
/* dla_pool_width/dla_pool_height: window size is the define value + 1 */
#define POOL_SIZE_1 0
#define POOL_SIZE_2 1
#define POOL_SIZE_3 2
#define POOL_SIZE_4 3
#define POOL_SIZE_5 4
#define POOL_SIZE_6 5
#define POOL_SIZE_7 6
#define POOL_SIZE_8 7
#define PDP_PAD_VAL_NUM 7
/* Surface (data cube) set for a PDP (pooling) operation. */
struct dla_pdp_surface_desc {
/* Data cube */
struct dla_data_cube src_data;
struct dla_data_cube dst_data;
} __packed __aligned(4);
/* Operation parameters for the PDP (planar data / pooling) processor. */
struct dla_pdp_op_desc {
/* Performance parameters */
/* widths for split-mode processing of wide inputs — TODO confirm */
uint16_t partial_in_width_first;
uint16_t partial_in_width_mid;
uint16_t partial_in_width_last;
uint16_t partial_width_first;
uint16_t partial_width_mid;
uint16_t partial_width_last;
uint8_t split_num;
/* Algorithm parameters */
uint8_t pool_mode; /* dla_pool_mode */
uint8_t pool_width; /* dla_pool_width */
uint8_t pool_height; /* dla_pool_height */
uint8_t stride_x;
uint8_t stride_y;
/**
 * The left/right padding size,
 * pad_right might be less than pad_left
 */
uint8_t pad_left;
uint8_t pad_right;
/* The top/bottom padding size */
uint8_t pad_top;
uint8_t pad_bottom;
/* Precision parameters */
uint8_t precision; /* dla_precision */
uint8_t reserved0;
/**
 * if input has non-zero "offset", this value should be set
 * There'll be 7 different padding values, the relationship between
 * those versions are:
 * padding_value[0] = -offset*scaling;
 * padding_value[1] = 2*padding_value[0]
 * padding_value[2] = 3*padding_value[0]
 * ...
 * The purpose is to avoid ucode implement FP16
 * multiplier(for FP16 mode)
 */
int32_t padding_value[PDP_PAD_VAL_NUM];
} __packed __aligned(4);
/* Statistics counters reported by the PDP processor. */
struct dla_pdp_stat_desc {
uint32_t inf_input_num;
uint32_t nan_input_num;
uint32_t nan_output_num;
uint32_t write_stall;
uint32_t runtime;
} __packed __aligned(4);
/* Surface (data cube) set for a CDP (cross-channel data) operation. */
struct dla_cdp_surface_desc {
/* Data cube */
struct dla_data_cube src_data;
struct dla_data_cube dst_data;
} __packed __aligned(4);
/* Operation parameters for the CDP processor (cross-channel LRN). */
struct dla_cdp_op_desc {
/* Precision parameters */
/* dla_precision */
uint8_t in_precision;
uint8_t out_precision;
int16_t lut_index; /* LUT used by this operation, index into LUT list */
struct dla_cvt_param in_cvt;
struct dla_cvt_param out_cvt;
/* Performance parameters */
/* Algorithm parameters */
uint8_t local_size; /* normalization window size — TODO confirm */
uint8_t bypass_sqsum; /* bypass the square-sum stage */
uint8_t bypass_out_mul; /* bypass the output multiplier stage */
uint8_t reserved0;
} __packed __aligned(4);
/* Statistics counters reported by the CDP processor. */
struct dla_cdp_stat_desc {
uint32_t nan_input_num;
uint32_t inf_input_num;
uint32_t nan_output_num;
uint32_t write_stall;
uint32_t lut_uflow;
uint32_t lut_oflow;
uint32_t lut_hybrid;
uint32_t lut_le_hit;
uint32_t lut_lo_hit;
uint32_t saturation_count;
uint32_t runtime;
} __packed __aligned(4);
/* Surface (data cube) set for a Rubik (data-reshape) operation. */
struct dla_rubik_surface_desc {
/* Data cube */
struct dla_data_cube src_data;
struct dla_data_cube dst_data;
} __packed __aligned(4);
/* rubik mode */
#define RUBIK_MODE_CONTRACT 0
#define RUBIK_MODE_SPLIT 1
#define RUBIK_MODE_MERGE 2
/* Operation parameters for the Rubik processor. */
struct dla_rubik_op_desc {
/* Precision parameters */
uint8_t mode; /* RUBIK_MODE_* */
uint8_t precision; /* dla_precision */
uint8_t stride_x;
uint8_t stride_y;
} __packed __aligned(4);
/* Statistics counters reported by the Rubik processor. */
struct dla_rubik_stat_desc {
uint32_t read_stall;
uint32_t write_stall;
uint32_t runtime;
} __packed __aligned(4);
/* Surface descriptor of any processor type; discriminated by op_type. */
union dla_surface_container {
struct dla_bdma_surface_desc bdma_surface;
struct dla_conv_surface_desc conv_surface;
struct dla_sdp_surface_desc sdp_surface;
struct dla_pdp_surface_desc pdp_surface;
struct dla_cdp_surface_desc cdp_surface;
struct dla_rubik_surface_desc rubik_surface;
};
/* Operation descriptor of any processor type; discriminated by op_type. */
union dla_operation_container {
struct dla_bdma_op_desc bdma_op;
struct dla_conv_op_desc conv_op;
struct dla_sdp_op_desc sdp_op;
struct dla_pdp_op_desc pdp_op;
struct dla_cdp_op_desc cdp_op;
struct dla_rubik_op_desc rubik_op;
};
/* Statistics block of any processor type; discriminated by op_type. */
union dla_stat_container {
struct dla_bdma_stat_desc bdma_stat;
struct dla_conv_stat_desc conv_stat;
struct dla_sdp_stat_desc sdp_stat;
struct dla_pdp_stat_desc pdp_stat;
struct dla_cdp_stat_desc cdp_stat;
struct dla_rubik_stat_desc rubik_stat;
};
/**
* status notifier structure
*
 * @timestamp: 64-bit timestamp representing the time at which
 * the notifier was written
* @status_engine: status work captured from HW engine
* @subframe: NA
* @status_task: status word as configured from an action list
*/
/* Status notifier written back on task completion. */
struct dla_task_status {
uint64_t timestamp; /* time at which the notifier was written */
uint32_t status_engine; /* status word captured from the HW engine */
uint16_t subframe; /* NA */
uint16_t status_task; /* status word as configured from an action list */
} __packed __aligned(4);
#endif

View file

@ -0,0 +1,74 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __DLA_SCHED_H_
#define __DLA_SCHED_H_
/**
 * One DLA task, built by the portability layer and consumed by the
 * firmware scheduler. The *_addr fields are engine-visible addresses of
 * the per-task descriptor lists.
 */
struct dla_task {
/* platform specific data to communicate with portability layer */
void *task_data;
/* task state */
uint32_t state;
/* Task base address */
uint64_t base;
/* start address of a list of dla_operation_container */
uint64_t operation_desc_addr;
/* start address of a list of dla_surface_container */
uint64_t surface_desc_addr;
/* start address of a list of dla_common_op_desc */
uint64_t dependency_graph_addr;
/* start address of a list of dla_lut_param */
uint64_t lut_data_addr;
/*
 * start address of a list of dla_roi_desc,
 * the first one is dla_roi_array_desc
 * valid when network.dynamic_roi is true
 */
uint64_t roi_array_addr;
/* start address of a list of dla_surface_container */
uint64_t surface_addr;
/* start address of a list of dla_stat_container */
uint64_t stat_data_addr;
} __packed __aligned(256);
/**
* @brief Configuration parameters supported by the engine
*
* atom_size Memory smallest access size
* bdma_enable Defines whether bdma is supported
* rubik_enable Defines whether rubik is supported
* weight_compress_support Defines whether weight data compression is supported
*/
/* Engine capability configuration (see parameter list in the header above). */
struct dla_config {
uint32_t atom_size; /* memory smallest access size, in bytes */
bool bdma_enable; /* whether the BDMA processor is supported */
bool rubik_enable; /* whether the Rubik processor is supported */
bool weight_compress_support; /* whether weight data compression is supported */
};
#endif

View file

@ -0,0 +1,327 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __NVDLA_INTERFACE_H_
#define __NVDLA_INTERFACE_H_
#include <linux/types.h>
/**
* @brief Register driver to firmware
*
* Implementation in firmware, called by portability layer
*
* This function must be called once during boot to initialize DLA
* engine scheduler and register driver with firmware before submitting
* any task. Pass pointer to driver context in @param driver_context
* which is passed as param when firmware calls any function
* of portability layer. It also updates pointer to engine context
* which must be passed in any function call to firmware after this point.
*
* @param engine_context Pointer to engine specific data
* @param driver_context Pointer to driver specific data
*
* @return 0 on success and negative on error
*/
int32_t dla_register_driver(void **engine_context, void *driver_context);
/**
* @brief Interrupt handler
*
* Implementation in firmware, called by portability layer
*
* This function is called when DLA interrupt is received. Portability layer
 * should register its own handler using the mechanism supported by that platform
* and call this function from the handler. Call to this function must be
* protected by lock to prevent handling interrupt when firmware is programming
* layers in process context.
*
* @param engine_context Engine specific data received in dla_register_driver
*
* @return 0 on success and negative on error
*/
int32_t dla_isr_handler(void *engine_context);
/**
* @brief Process events recorded in interrupt handler
*
* Implementation in firmware, called by portability layer
*
* Interrupt handler just records events and does not process those events.
* Portability layer must call this function in thread/process context after
* interrupt handler is done.
*
* @param engine_context Engine specific data received in dla_register_driver
* @param task_complete Pointer to parameter to indicate task complete,
 *                      firmware writes 1 to it if all layers are processed.
*
* @return 0 on success and negative on error
*
*/
int32_t dla_process_events(void *engine_context, uint32_t *task_complete);
/**
* @brief Clear task from firmware
*
* Implementation in firmware, called by portability layer
*
* This function resets engine scheduler state including op descriptor cache,
* error values, sub-engine status, events etc and clears previous task state
* from firmware. This function can be called by portability layer after
* task completion. It is not mandatory to call it but calling it will
* ensure clean state before next task execution.
*
* @param engine_context Engine specific data received in dla_register_driver
*
 * @return None (this function returns void)
*
*/
void dla_clear_task(void *engine_context);
/**
* @brief Execute task
*
* Implementation in firmware, called by portability layer
*
* This function initializes sub-engines and starts task execution. Further
* programming and layer scheduling is triggered by events received from
* hardware.
*
* @param engine_context Engine specific data received in dla_register_driver
* @param task_data Task specific data to be passed when reading task info
* @param config_data Configuration data to be passed
*
* @return 0 on success and negative on error
*
*/
int32_t dla_execute_task(void *engine_context, void *task_data, void *config_data);
/**
* @brief Register read
*
* Implementation in portability layer, called by firmware
*
* Read DLA HW register. Portability layer is responsible to use correct
* base address and for any IO mapping if required.
*
 * @param driver_context Driver specific data received in dla_register_driver
* @param addr Register offset
*
* @return Register value
*
*/
uint32_t dla_reg_read(void *driver_context, uint32_t addr);
/**
* @brief Register write
*
* Implementation in portability layer, called by firmware
*
 * Write DLA HW register. Portability layer is responsible to use correct
* base address and for any IO mapping if required.
*
* @param driver_context Driver specific data received in dla_register_driver
* @param addr Register offset
* @param reg Value to write
*
*/
void dla_reg_write(void *driver_context, uint32_t addr, uint32_t reg);
/**
* @brief Read data from DMA mapped memory in local buffer
*
* Implementation in portability layer, called by firmware
*
* This function reads data from buffers passed by UMD in local memory.
* Addresses for buffers passed by are shared in address list and network
* descriptor contains index in address list for those buffers. Firmware
* reads this data from buffer shared by UMD into local buffer to consume
* the information.
*
* @param driver_context Driver specific data received in dla_register_driver
* @param task_data Task specific data received in dla_execute_task
* @param src Index in address list
* @param dst Pointer to local memory
* @param size Size of data to copy
* @param offset Offset from start of UMD buffer
*
* @return 0 on success and negative on error
*
*/
int32_t dla_data_read(void *driver_context, void *task_data,
uint64_t src, void *dst,
uint32_t size, uint64_t offset);
/**
* @brief Write data to DMA mapped memory from local buffer
*
* Implementation in portability layer, called by firmware
*
* This function writes data from local buffer to buffer passed by UMD.
* Addresses for buffers passed by are shared in address list and network
* descriptor contains index in address list for those buffers. Firmware
* writes this data to buffer shared by UMD from local buffer to update
* the information.
*
* @param driver_context Driver specific data received in dla_register_driver
* @param task_data Task specific data received in dla_execute_task
* @param src Pointer to local memory
* @param dst Index in address list
* @param size Size of data to copy
* @param offset Offset from start of UMD buffer
*
* @return 0 on success and negative on error
*
*/
int32_t dla_data_write(void *driver_context, void *task_data,
void *src, uint64_t dst,
uint32_t size, uint64_t offset);
/* Destination for DMA buffer */
#define DESTINATION_PROCESSOR 0
#define DESTINATION_DMA 1
/**
* @brief Read DMA address
*
* Implementation in portability layer, called by firmware
*
* Some buffers shared by UMD are accessed by processor responsible for
* programming DLA HW. It would be companion micro-controller in case of
* headed config while main CPU in case of headless config. Also, some
* buffers are accessed by DLA DMA engines inside sub-engines. This function
* should return proper address accessible by destination user depending
* on config.
*
* @param driver_context Driver specific data received in dla_register_driver
* @param task_data Task specific data received in dla_execute_task
* @param index Index in address list
* @param dst_ptr Pointer to update address
* @param destination Destination user for DMA address
*
* @return 0 on success and negative on error
*
*/
int32_t dla_get_dma_address(void *driver_context, void *task_data,
int16_t index, void *dst_ptr,
uint32_t destination);
/**
* @brief Read time value in micro-seconds
*
* Implementation in portability layer, called by firmware
*
* Read system time in micro-seconds
*
* @return Time value in micro-seconds
*
*/
int64_t dla_get_time_us(void);
/**
* @brief Print debug message
*
* Implementation in portability layer, called by firmware
*
* Print debug message to console
*
* @param str Format string and variable arguments
*
*/
void dla_debug(const char *str, ...);
/**
* @brief Print information message
*
* Implementation in portability layer, called by firmware
*
* Print information message to console
*
* @param str Format string and variable arguments
*
*/
void dla_info(const char *str, ...);
/**
* @brief Print warning message
*
* Implementation in portability layer, called by firmware
*
* Print warning message to console
*
* @param str Format string and variable arguments
*
*/
void dla_warn(const char *str, ...);
/**
* @brief Print error message
*
* Implementation in portability layer, called by firmware
*
* Print error message to console
*
* @param str Format string and variable arguments
*
*/
void dla_error(const char *str, ...);
/**
* @brief Fill memory region
*
* Implementation in portability layer, called by firmware
*
* Fills the first len bytes of the memory area pointed to by src
* with the constant byte ch.
*
* @param src Memory area address
* @param ch Byte to fill
* @param len Length of memory area to fill
*
* @return Memory area address
*
*/
void *dla_memset(void *src, int ch, uint64_t len);
/**
* @brief Copy memory
*
* Implementation in portability layer, called by firmware
*
* Copies len bytes from memory area src to memory area dest.
*
* @param dest Destination memory area address
* @param src Source memory area address
* @param len Length of memory area to copy
*
* @return Destination memory area address
*
*/
void *dla_memcpy(void *dest, const void *src, uint64_t len);
#endif

View file

@ -0,0 +1,138 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __LINUX_NVDLA_IOCTL_H
#define __LINUX_NVDLA_IOCTL_H
#include <linux/ioctl.h>
#include <linux/types.h>
#if !defined(__KERNEL__)
#define __user
#endif
/**
* struct nvdla_mem_handle structure for memory handles
*
* @handle handle to DMA buffer allocated in userspace
* @reserved Reserved for padding
* @offset offset in bytes from start address of buffer
*
*/
struct nvdla_mem_handle {
	__u32 handle;	/* GEM/PRIME handle of the DMA buffer allocated in userspace */
	__u32 reserved;	/* padding for 64-bit alignment */
	__u64 offset;	/* byte offset from the start of the buffer */
};
/**
* struct nvdla_ioctl_submit_task structure for single task information
*
* @num_addresses total number of entries in address_list
* @reserved Reserved for padding
* @address_list pointer to array of struct nvdla_mem_handle
*
*/
struct nvdla_ioctl_submit_task {
#define NVDLA_MAX_BUFFERS_PER_TASK (6144)
	__u32 num_addresses;	/* entries in address_list */
#define NVDLA_NO_TIMEOUT    (0xffffffff)
	__u32 timeout;		/* task timeout; NVDLA_NO_TIMEOUT disables it (semantics handled elsewhere) */
	__u64 address_list;	/* user pointer to array of struct nvdla_mem_handle */
};
/**
* struct nvdla_submit_args structure for task submit
*
* @tasks pointer to array of struct nvdla_ioctl_submit_task
* @num_tasks number of entries in tasks
* @flags flags for task submit, no flags defined yet
* @version version of task structure
*
*/
struct nvdla_submit_args {
	__u64 tasks;		/* user pointer to array of struct nvdla_ioctl_submit_task */
	__u16 num_tasks;	/* entries in tasks */
#define NVDLA_MAX_TASKS_PER_SUBMIT	24
#define NVDLA_SUBMIT_FLAGS_ATOMIC	(1 << 0)
	__u16 flags;		/* submit flags, see NVDLA_SUBMIT_FLAGS_* */
	__u32 version;		/* task structure version */
};
/**
* struct nvdla_gem_create_args for allocating DMA buffer through GEM
*
* @handle handle updated by kernel after allocation
* @flags implementation specific flags
* @size size of buffer to allocate
*/
struct nvdla_gem_create_args {
	__u32 handle;	/* out: handle created by the kernel */
	__u32 flags;	/* implementation-specific flags */
	__u64 size;	/* in: buffer size to allocate, bytes */
};
/**
* struct nvdla_gem_map_offset_args for mapping DMA buffer
*
* @handle handle of the buffer
* @reserved reserved for padding
* @offset offset updated by kernel after mapping
*/
struct nvdla_gem_map_offset_args {
	__u32 handle;	/* in: buffer handle */
	__u32 reserved;	/* padding for 64-bit alignment */
	__u64 offset;	/* out: fake mmap offset to pass to mmap(2) */
};
/**
* struct nvdla_gem_destroy_args for destroying DMA buffer
*
* @handle handle of the buffer
*/
struct nvdla_gem_destroy_args {
	__u32 handle;	/* in: handle of the buffer to destroy */
};
#define DRM_NVDLA_SUBMIT 0x00
#define DRM_NVDLA_GEM_CREATE 0x01
#define DRM_NVDLA_GEM_MMAP 0x02
#define DRM_NVDLA_GEM_DESTROY 0x03
#define DRM_IOCTL_NVDLA_SUBMIT DRM_IOWR(DRM_COMMAND_BASE + DRM_NVDLA_SUBMIT, struct nvdla_submit_args)
#define DRM_IOCTL_NVDLA_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_NVDLA_GEM_CREATE, struct nvdla_gem_create_args)
#define DRM_IOCTL_NVDLA_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_NVDLA_GEM_MMAP, struct nvdla_gem_map_offset_args)
#define DRM_IOCTL_NVDLA_GEM_DESTROY DRM_IOWR(DRM_COMMAND_BASE + DRM_NVDLA_GEM_DESTROY, struct nvdla_gem_destroy_args)
#endif

View file

@ -0,0 +1,153 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __LINUX_NVDLA_LINUX_H_
#define __LINUX_NVDLA_LINUX_H_
#include <linux/completion.h>
#include <linux/device.h>
#include <linux/kref.h>
#include <linux/platform_device.h>
#include <linux/spinlock.h>
/**
* @brief Task information submitted from user space
*
* ref Reference count for task
* num_addresses Number of addresses in address list
* nvdla_dev Pointer to NVDLA device
* address_list Address list
* file DRM file instance
*/
/* One task submitted from user space; owns a kernel copy of its address list. */
struct nvdla_task {
	struct kref ref;			/* task reference count */
	uint32_t num_addresses;			/* entries in address_list */
	struct nvdla_device *nvdla_dev;		/* device this task runs on */
	struct nvdla_mem_handle *address_list;	/* kernel copy of the UMD address list */
	struct drm_file *file;			/* DRM file the task was submitted on */
};
/**
* @brief Configuration parameters supported by the engine
*
* atom_size Memory smallest access size
* bdma_enable Defines whether bdma is supported
* rubik_enable Defines whether rubik is supported
* weight_compress_support Defines whether weight data compression is supported
*/
/* Hardware feature configuration, selected by DT compatible string. */
struct nvdla_config
{
	uint32_t atom_size;		/* smallest memory access size, bytes */
	bool bdma_enable;		/* BDMA engine present */
	bool rubik_enable;		/* Rubik engine present */
	bool weight_compress_support;	/* compressed weight data supported */
};
/**
* @brief NVDLA device
*
* irq Interrupt number associated with this device
* ref Reference count for device
* base IO mapped base address for device
* nvdla_lock Spinlock used for synchronization
* drm DRM device instance
* task Pointer to task in execution
* config_data Pointer to the configuration data
* pdev Pointer to NVDLA platform device
* event_notifier Completion object used to wait for events from HW
* engine_context Private data passed from engine in dla_engine_init
*/
/* Per-instance driver state shared between probe, ISR and submit paths. */
struct nvdla_device {
	int32_t irq;				/* engine interrupt line */
	struct kref ref;			/* device reference count */
	void __iomem *base;			/* mapped register window */
	spinlock_t nvdla_lock;			/* serialises ISR vs. event processing */
	struct drm_device *drm;			/* DRM device instance */
	struct nvdla_task *task;		/* task currently executing */
	struct nvdla_config *config_data;	/* hardware feature configuration */
	struct platform_device *pdev;		/* backing platform device */
	struct completion event_notifier;	/* signalled by the ISR on HW events */
	void *engine_context;			/* opaque state from dla_engine_init */
};
/**
* @brief Submit task
*
* This function submits task to NVDLA engine.
*
* @param nvdla_dev Pointer to NVDLA device
* @param task Pointer to task
* @return 0 on success and negative on error
*
*/
int32_t nvdla_task_submit(struct nvdla_device *nvdla_dev, struct nvdla_task *task);
/**
* @brief Get DMA address
*
* This function gets DMA address for given fd
*
* @param dev DRM device instance
* @param file DRM file instance
* @param fd File desriptor for DMA buffer
* @param addr Pointer to update DMA address
* @return 0 on success and negative on error
*
*/
int32_t nvdla_gem_dma_addr(struct drm_device *dev, struct drm_file *file,
uint32_t fd, dma_addr_t *addr);
/**
* @brief DRM probe
*
* Probe function for DRM device
*
* @param nvdla_dev NVDLA device pointer
* @return 0 on success and negative on error
*
*/
int32_t nvdla_drm_probe(struct nvdla_device *nvdla_dev);
/**
* @brief DRM remove
*
* Remove function for DRM device
*
* @param nvdla_dev NVDLA device pointer
*
*/
void nvdla_drm_remove(struct nvdla_device *nvdla_dev);
#endif

View file

@ -0,0 +1,40 @@
/*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __OPENDLA_H_
#define __OPENDLA_H_
#define DLA_2_CONFIG
#ifdef DLA_2_CONFIG
#include <opendla_small.h>
#else
#include <opendla_initial.h>
#endif
#endif

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,448 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdarg.h>
#include <linux/dma-buf.h>
#include <linux/dma-mapping.h>
#include <linux/fs.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/of_irq.h>
#include <linux/of_platform.h>
#include <linux/platform_device.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/time.h>
#include <linux/uaccess.h>
#include <nvdla_interface.h>
#include <nvdla_linux.h>
#include <nvdla_ioctl.h>
#include <opendla.h>
static struct nvdla_config nvdla_config_os_initial = {
.atom_size = 32,
.bdma_enable = true,
.rubik_enable = true,
.weight_compress_support = true,
};
static struct nvdla_config nvdla_config_small = {
.atom_size = 8,
.bdma_enable = false,
.rubik_enable = false,
.weight_compress_support = false,
};
static struct nvdla_config nvdla_config_large = {
.atom_size = 32,
.bdma_enable = false,
.rubik_enable = false,
.weight_compress_support = false,
};
/*
 * Portability-layer debug print: forwards the firmware's printf-style
 * format string and variadic arguments to the kernel log.
 * NOTE(review): pr_fmt() is applied to a non-literal argument; with the
 * default pr_fmt definition this is a no-op, but a file-local pr_fmt
 * redefinition would not expand as intended — confirm.
 */
void dla_debug(const char *str, ...)
{
	va_list args;

	va_start(args, str);
	vprintk(pr_fmt(str), args);
	va_end(args);
}
/* Portability-layer info print: forward firmware message to the kernel log. */
void dla_info(const char *str, ...)
{
	va_list args;

	va_start(args, str);
	vprintk(str, args);
	va_end(args);
}
/* Portability-layer warning print: forward firmware message to the kernel log. */
void dla_warn(const char *str, ...)
{
	va_list args;

	va_start(args, str);
	vprintk(str, args);
	va_end(args);
}
/* Portability-layer error print: forward firmware message to the kernel log. */
void dla_error(const char *str, ...)
{
	va_list args;

	va_start(args, str);
	vprintk(str, args);
	va_end(args);
}
/*
 * Fill the first @len bytes at @src with byte @ch.
 * Thin memset() wrapper so the firmware core stays platform-agnostic.
 * Returns @src.
 */
void *dla_memset(void *src, int ch, uint64_t len)
{
	return memset(src, ch, len);
}
/*
 * Copy @len bytes from @src to @dest (non-overlapping regions).
 * Thin memcpy() wrapper for the firmware portability layer.
 * Returns @dest.
 */
void *dla_memcpy(void *dest, const void *src, uint64_t len)
{
	memcpy(dest, src, len);
	return dest;
}
/* Return monotonic system time in microseconds for firmware timing. */
int64_t dla_get_time_us(void)
{
	return ktime_get_ns() / NSEC_PER_USEC;
}
/*
 * Write @reg to the engine register at byte offset @addr.
 * Silently ignored when no device context is bound.
 */
void dla_reg_write(void *driver_context, uint32_t addr, uint32_t reg)
{
	struct nvdla_device *dev = driver_context;

	if (dev)
		writel(reg, dev->base + addr);
}
/*
 * Read the engine register at byte offset @addr.
 * Returns 0 when no device context is bound.
 */
uint32_t dla_reg_read(void *driver_context, uint32_t addr)
{
	struct nvdla_device *dev = driver_context;

	return dev ? readl(dev->base + addr) : 0;
}
/*
 * Engine hardware interrupt handler.
 *
 * Runs the firmware ISR under nvdla_lock so it cannot race
 * dla_process_events() on the submission path, then wakes the
 * task-submission thread blocked on event_notifier.
 */
static irqreturn_t nvdla_engine_isr(int32_t irq, void *data)
{
	unsigned long flags;
	struct nvdla_device *nvdla_dev = (struct nvdla_device *)data;

	if (!nvdla_dev)
		return IRQ_NONE;

	spin_lock_irqsave(&nvdla_dev->nvdla_lock, flags);
	dla_isr_handler(nvdla_dev->engine_context);
	complete(&nvdla_dev->event_notifier);
	spin_unlock_irqrestore(&nvdla_dev->nvdla_lock, flags);

	return IRQ_HANDLED;
}
/*
 * Resolve address-list entry @index to a DMA (IOVA) address.
 *
 * Looks up the dma-buf behind address_list[index] and writes its DMA
 * address plus the entry's byte offset to @dst.
 *
 * Returns 0 on success, -EINVAL for an out-of-range index, or the
 * error from nvdla_gem_dma_addr().
 */
static int32_t dla_read_dma_address(void *driver_context, void *task_data,
						int16_t index, void *dst)
{
	int32_t ret;
	struct nvdla_mem_handle *handles;
	dma_addr_t *phys_addr = (dma_addr_t *)(dst);
	struct nvdla_device *nvdla_dev =
			(struct nvdla_device *)driver_context;
	struct nvdla_task *task = (struct nvdla_task *)task_data;

	/*
	 * Reject all negative indices, and index == num_addresses:
	 * the original '>' comparison allowed reading one entry past
	 * the end of the address list.
	 */
	if (index < 0 || (uint32_t)index >= task->num_addresses)
		return -EINVAL;

	handles = (struct nvdla_mem_handle *)task->address_list;
	ret = nvdla_gem_dma_addr(nvdla_dev->drm, task->file,
					handles[index].handle,
					phys_addr);
	if (ret)
		return ret;

	/* Add offset to IOVA address */
	*phys_addr = *phys_addr + handles[index].offset;

	return 0;
}
/*
 * Resolve address-list entry @index for CPU-side access.
 *
 * The processor path does not hand out a raw pointer: it returns the
 * address-list index itself, which dla_data_read()/dla_data_write()
 * later use to locate the dma-buf.
 *
 * Returns 0 on success, -EINVAL for an out-of-range index.
 */
static int32_t dla_read_cpu_address(void *driver_context, void *task_data,
						int16_t index, void *dst)
{
	uint64_t *temp = (uint64_t *)dst;
	struct nvdla_task *task = (struct nvdla_task *)task_data;

	/* was 'index > num_addresses': off-by-one allowed index == count */
	if (index < 0 || (uint32_t)index >= task->num_addresses)
		return -EINVAL;

	*temp = (uint64_t)index;

	return 0;
}
/*
 * Dispatch an address-list lookup to the CPU or DMA resolver depending
 * on which agent (processor or DLA DMA engine) will consume the address.
 *
 * Returns 0 on success, -EINVAL for an unknown destination or a
 * resolver error code.
 */
int32_t dla_get_dma_address(void *driver_context, void *task_data,
				int16_t index, void *dst_ptr,
				uint32_t destination)
{
	switch (destination) {
	case DESTINATION_PROCESSOR:
		return dla_read_cpu_address(driver_context, task_data,
						index, dst_ptr);
	case DESTINATION_DMA:
		return dla_read_dma_address(driver_context, task_data,
						index, dst_ptr);
	default:
		return -EINVAL;
	}
}
int32_t dla_data_write(void *driver_context, void *task_data,
void *src, uint64_t dst,
uint32_t size, uint64_t offset)
{
int32_t ret;
void *ptr = NULL;
struct dma_buf *buf;
struct dma_buf_map map;
struct nvdla_mem_handle *handles;
struct nvdla_task *task = (struct nvdla_task *)task_data;
uint64_t dma_addr = 0;
dla_get_dma_address(driver_context, task_data,dst, (void *)&dma_addr, DESTINATION_DMA);
handles = task->address_list;
buf = dma_buf_get(handles[dst].handle);
if (IS_ERR(buf)) {
pr_err("%s: Failed get dma_buf for handle=%d\n", __func__,
handles[dst].handle);
return -EFAULT;
}
ret = dma_buf_begin_cpu_access(buf, DMA_BIDIRECTIONAL);
if (ret)
goto put_dma_buf;
ret = dma_buf_vmap(buf, &map);
ptr = ret ? NULL : map.vaddr;
if (!ptr) {
pr_err("%s: Failed to vmap dma_buf for handle=%d\n", __func__,
handles[dst].handle);
ret = -ENOMEM;
goto end_cpu_access;
}
memcpy((void *)((uint8_t *)ptr + offset), src, size);
dma_buf_vunmap(buf, &map);
end_cpu_access:
dma_buf_end_cpu_access(buf, DMA_BIDIRECTIONAL);
put_dma_buf:
dma_buf_put(buf);
return ret;
}
int32_t dla_data_read(void *driver_context, void *task_data,
uint64_t src, void *dst,
uint32_t size, uint64_t offset)
{
int32_t ret;
void *ptr = NULL;
struct dma_buf *buf;
struct dma_buf_map map;
struct nvdla_mem_handle *handles;
struct nvdla_task *task = (struct nvdla_task *)task_data;
uint64_t dma_addr = 0;
dla_get_dma_address(driver_context, task_data, src, (void *)&dma_addr, DESTINATION_DMA);
handles = task->address_list;
buf = dma_buf_get(handles[src].handle);
if (IS_ERR(buf)) {
pr_err("%s: Failed get dma_buf for handle=%d\n", __func__,
handles[src].handle);
return -EFAULT;
}
ret = dma_buf_begin_cpu_access(buf, DMA_BIDIRECTIONAL);
if (ret)
goto put_dma_buf;
ret = dma_buf_vmap(buf, &map);
ptr = ret ? NULL : map.vaddr;
if (!ptr) {
pr_err("%s: Failed to vmap dma_buf for handle=%d\n", __func__,
handles[src].handle);
ret = -ENOMEM;
goto end_cpu_access;
}
memcpy(dst, (void *)(((uint8_t *)ptr) + offset), size);
dma_buf_vunmap(buf, &map);
end_cpu_access:
dma_buf_end_cpu_access(buf, DMA_BIDIRECTIONAL);
put_dma_buf:
dma_buf_put(buf);
return ret;
}
/*
 * Execute one task on the engine and block until it completes.
 *
 * dla_execute_task() programs the hardware; completion events are
 * signalled by the ISR through event_notifier. Each wakeup re-runs
 * dla_process_events() under nvdla_lock (the same lock held by the
 * ISR) until the firmware reports task completion or an error, then
 * clears the engine task state.
 *
 * Returns 0 on success or the firmware error code.
 */
int32_t nvdla_task_submit(struct nvdla_device *nvdla_dev, struct nvdla_task *task)
{
	int32_t err = 0;
	uint32_t task_complete = 0;

	nvdla_dev->task = task;

	err = dla_execute_task(nvdla_dev->engine_context, (void *)task, nvdla_dev->config_data);
	if (err) {
		pr_err("Task execution failed\n");
		return err;
	}

	pr_debug("Wait for task complete\n");

	while (1) {
		unsigned long flags;

		wait_for_completion(&nvdla_dev->event_notifier);

		spin_lock_irqsave(&nvdla_dev->nvdla_lock, flags);
		err = dla_process_events(nvdla_dev->engine_context, &task_complete);
		spin_unlock_irqrestore(&nvdla_dev->nvdla_lock, flags);

		if (err || task_complete)
			break;
	}

	pr_debug("Task complete\n");

	dla_clear_task(nvdla_dev->engine_context);

	return err;
}
/* driver probe and init */
/*
 * Device-tree match table: each supported compatible string maps to a
 * static hardware configuration (atom size, optional BDMA/Rubik
 * engines, weight-compression support).
 */
static const struct of_device_id nvdla_of_match[] = {
	{
		.compatible = "nvidia,nvdla_os_initial",
		.data = &nvdla_config_os_initial,
	},
	{
		.compatible = "nvidia,nv_small",
		.data = &nvdla_config_small,
	},
	{
		.compatible = "nvidia,nv_large",
		.data = &nvdla_config_large,
	},
	{ },
};
/*
 * Platform probe: map the register window, hook the engine interrupt,
 * register the firmware engine context and expose the DRM device.
 *
 * Returns 0 on success and negative errno on failure; all resources
 * are device-managed (devm_*) and released automatically on error.
 */
static int32_t nvdla_probe(struct platform_device *pdev)
{
	int32_t err = 0;
	struct resource *res;
	struct nvdla_device *nvdla_dev;
	struct device *dev = &pdev->dev;
	const struct of_device_id *match;

	if (!pdev->dev.of_node)
		return -EINVAL;

	match = of_match_device(nvdla_of_match, &pdev->dev);
	if (!match) {
		dev_err(dev, "Missing DT entry!\n");
		return -EINVAL;
	}

	/* informational message: was incorrectly logged with pr_err() */
	dev_info(dev, "Probe NVDLA config %s\n", match->compatible);

	nvdla_dev = devm_kzalloc(dev, sizeof(*nvdla_dev), GFP_KERNEL);
	if (!nvdla_dev)
		return -ENOMEM;

	platform_set_drvdata(pdev, nvdla_dev);
	nvdla_dev->pdev = pdev;
	nvdla_dev->config_data = (struct nvdla_config *)match->data;

	init_completion(&nvdla_dev->event_notifier);

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	nvdla_dev->base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(nvdla_dev->base))
		return PTR_ERR(nvdla_dev->base);

	/*
	 * Use platform_get_irq(): interrupts described in the device
	 * tree are not guaranteed to be exposed as IORESOURCE_IRQ
	 * resources.
	 */
	nvdla_dev->irq = platform_get_irq(pdev, 0);
	if (nvdla_dev->irq < 0)
		return nvdla_dev->irq;

	err = devm_request_irq(&pdev->dev, nvdla_dev->irq,
				nvdla_engine_isr, 0,
				dev_name(&pdev->dev), nvdla_dev);
	if (err)
		return err;

	dla_register_driver(&nvdla_dev->engine_context, (void *)nvdla_dev);
	dla_clear_task(nvdla_dev->engine_context);

	err = nvdla_drm_probe(nvdla_dev);
	if (err)
		dev_err(&pdev->dev, "failed to register drm device\n");

	return err;
}
/*
 * Platform remove: tear down the DRM device. The IRQ, MMIO mapping and
 * device structure are devm-managed and released by the driver core.
 */
static int32_t __exit nvdla_remove(struct platform_device *pdev)
{
	struct nvdla_device *nvdla_dev = dev_get_drvdata(&pdev->dev);

	nvdla_drm_remove(nvdla_dev);

	return 0;
}
/* Platform-driver glue: probe/remove bound through the DT match table. */
static struct platform_driver nvdla_driver = {
	.probe = nvdla_probe,
	.remove = __exit_p(nvdla_remove),
	.driver = {
		.owner = THIS_MODULE,
		.name = "NVDLA",
		.of_match_table = of_match_ptr(nvdla_of_match),
	},
};
module_platform_driver(nvdla_driver);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("NVIDIA");
MODULE_DESCRIPTION("Nvidia Deep Learning Accelerator driver");

475
drivers/nvdla/nvdla_gem.c Normal file
View file

@ -0,0 +1,475 @@
/*
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <drm/drm_device.h>
#include <drm/drm_drv.h>
#include <drm/drm_gem.h>
#include <drm/drm_gem_cma_helper.h>
#include <linux/dma-buf.h>
#include <linux/dma-mapping.h>
#include <linux/dma-map-ops.h>
#include <linux/of.h>
#include <linux/of_address.h>
#include <nvdla_linux.h>
#include <nvdla_ioctl.h>
#include <opendla.h>
/* Upcast from the embedded drm_gem_object to the NVDLA wrapper. */
#define to_nvdla_obj(x) container_of(x, struct nvdla_gem_object, object)

/* NVDLA GEM object: a drm_gem_object backed by DMA-coherent memory. */
struct nvdla_gem_object {
	struct drm_gem_object object;	/* embedded base GEM object */
	void *kvaddr;			/* kernel virtual address of the backing memory */
	dma_addr_t dma_addr;		/* device (DMA/IOVA) address of the backing memory */
	unsigned long dma_attrs;	/* attrs used for dma_alloc_attrs()/dma_free_attrs() */
};
static int32_t nvdla_fill_task_desc(struct nvdla_ioctl_submit_task *local_task,
struct nvdla_task *task)
{
struct nvdla_mem_handle *handles;
/* update task desc fields */
task->num_addresses = local_task->num_addresses;
handles = kzalloc(local_task->num_addresses *
sizeof(struct nvdla_mem_handle), GFP_KERNEL);
if (handles == NULL)
return -EFAULT;
/* get user addresses list */
if (copy_from_user(handles,
(void __user *)local_task->address_list,
(task->num_addresses *
sizeof(struct nvdla_mem_handle)))) {
pr_err("failed to copy address list from user ptr\n");
kfree(handles);
return -EFAULT;
}
task->address_list = handles;
return 0;
}
static int32_t nvdla_submit(struct drm_device *drm, void *arg,
struct drm_file *file)
{
int32_t err = 0;
struct nvdla_task *task;
struct nvdla_ioctl_submit_task local_task;
struct nvdla_ioctl_submit_task __user *user_task;
struct nvdla_device *nvdla_dev = dev_get_drvdata(drm->dev);
struct nvdla_submit_args *args =
(struct nvdla_submit_args *)arg;
user_task = (struct nvdla_ioctl_submit_task __user *)
(uintptr_t)args->tasks;
if (!user_task)
return -EINVAL;
/* IOCTL copy descriptors */
if (copy_from_user(&local_task, (void __user *)user_task,
(sizeof(*user_task))))
return -EFAULT;
task = kzalloc(sizeof(*task), GFP_KERNEL);
if (task == NULL)
return -EFAULT;
nvdla_dev->task = task;
kref_init(&task->ref);
task->nvdla_dev = nvdla_dev;
task->file = file;
/* update task desc fields */
err = nvdla_fill_task_desc(&local_task, task);
if (err)
goto free_task_desc;
err = nvdla_task_submit(nvdla_dev, task);
kfree(task->address_list);
free_task_desc:
kfree(task);
return err;
}
static int32_t nvdla_gem_alloc(struct nvdla_gem_object *nobj)
{
struct drm_gem_object *dobj = &nobj->object;
struct drm_device *drm = dobj->dev;
nobj->dma_attrs = DMA_ATTR_WRITE_COMBINE;
nobj->kvaddr = dma_alloc_attrs(drm->dev, dobj->size, &nobj->dma_addr,
GFP_KERNEL, nobj->dma_attrs);
if (!nobj->kvaddr)
return -ENOMEM;
return 0;
}
/* Release the DMA memory backing a GEM object. */
static void nvdla_gem_free(struct nvdla_gem_object *nobj)
{
	struct drm_device *drm = nobj->object.dev;

	dma_free_attrs(drm->dev, nobj->object.size, nobj->kvaddr,
			nobj->dma_addr, nobj->dma_attrs);
}
/* GEM .free callback: drop the mmap offset, backing memory and wrapper. */
static void nvdla_gem_free_object(struct drm_gem_object *dobj)
{
	struct nvdla_gem_object *nobj = to_nvdla_obj(dobj);

	drm_gem_free_mmap_offset(dobj);
	nvdla_gem_free(nobj);
	kfree(nobj);
}
/*
 * PRIME .get_sg_table callback: build a scatter-gather table describing
 * the object's DMA memory for export to other drivers.
 */
static struct sg_table
*nvdla_drm_gem_prime_get_sg_table(struct drm_gem_object *dobj)
{
	struct nvdla_gem_object *nobj = to_nvdla_obj(dobj);
	struct drm_device *drm = dobj->dev;
	struct sg_table *sgt;
	int32_t err;

	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
	if (!sgt)
		return ERR_PTR(-ENOMEM);

	err = dma_get_sgtable_attrs(drm->dev, sgt, nobj->kvaddr,
					nobj->dma_addr, dobj->size,
					nobj->dma_attrs);
	if (!err)
		return sgt;

	DRM_ERROR("failed to allocate sgt, %d\n", err);
	kfree(sgt);
	return ERR_PTR(err);
}
/*
 * PRIME .vmap callback: expose the kernel mapping created at allocation
 * time. Fails when the buffer was allocated without a kernel mapping.
 */
static int nvdla_drm_gem_prime_vmap(struct drm_gem_object *obj, struct dma_buf_map *map)
{
	struct nvdla_gem_object *nobj = to_nvdla_obj(obj);

	if (nobj->dma_attrs & DMA_ATTR_NO_KERNEL_MAPPING)
		return -ENOMEM;

	dma_buf_map_set_vaddr(map, nobj->kvaddr);

	return 0;
}
/*
 * PRIME .vunmap callback: the kernel mapping lives for the whole object
 * lifetime (created in nvdla_gem_alloc), so there is nothing to undo.
 */
static void nvdla_drm_gem_prime_vunmap(struct drm_gem_object *obj, struct dma_buf_map *map)
{
	/* Nothing to do */
}
/* GEM object callbacks; mmap faulting reuses the CMA helper vm_ops. */
static const struct drm_gem_object_funcs nvdla_gem_funcs = {
	.free = nvdla_gem_free_object,
	.export = drm_gem_prime_export,
	.vmap = nvdla_drm_gem_prime_vmap,
	.vunmap = nvdla_drm_gem_prime_vunmap,
	.get_sg_table = nvdla_drm_gem_prime_get_sg_table,
	.vm_ops = &drm_gem_cma_vm_ops,
};
/*
 * Allocate an NVDLA GEM object of at least @size bytes (rounded up to
 * whole pages) and back it with DMA memory.
 *
 * Returns the new object or an ERR_PTR on failure.
 */
static struct nvdla_gem_object *
nvdla_gem_create_object(struct drm_device *drm, uint32_t size)
{
	struct nvdla_gem_object *nobj;
	int32_t err;

	nobj = kzalloc(sizeof(*nobj), GFP_KERNEL);
	if (!nobj)
		return ERR_PTR(-ENOMEM);

	nobj->object.funcs = &nvdla_gem_funcs;
	drm_gem_private_object_init(drm, &nobj->object,
					round_up(size, PAGE_SIZE));

	err = nvdla_gem_alloc(nobj);
	if (err) {
		kfree(nobj);
		return ERR_PTR(err);
	}

	return nobj;
}
/*
 * Create a GEM object and a userspace handle for it.
 *
 * On success the handle owns the only reference: drm_gem_object_put()
 * drops the creation reference, so the object lives exactly as long as
 * the handle. The returned pointer is therefore only valid while the
 * handle exists.
 */
static struct nvdla_gem_object *
nvdla_gem_create_with_handle(struct drm_file *file_priv,
				struct drm_device *drm, uint32_t size,
				uint32_t *handle)
{
	int32_t ret;
	struct drm_gem_object *dobj;
	struct nvdla_gem_object *nobj;

	nobj = nvdla_gem_create_object(drm, size);
	if (IS_ERR(nobj))
		return ERR_CAST(nobj);

	dobj = &nobj->object;

	ret = drm_gem_handle_create(file_priv, dobj, handle);
	if (ret)
		goto free_drm_object;

	/* drop the creation ref; the handle keeps the object alive */
	drm_gem_object_put(dobj);

	return nobj;

free_drm_object:
	nvdla_gem_free_object(dobj);

	return ERR_PTR(ret);
}
/* DRM_IOCTL_NVDLA_GEM_CREATE handler: allocate a buffer, return its handle. */
static int32_t nvdla_gem_create(struct drm_device *drm, void *data,
				struct drm_file *file)
{
	struct nvdla_gem_create_args *args = data;
	struct nvdla_gem_object *nobj =
		nvdla_gem_create_with_handle(file, drm, args->size,
						&args->handle);

	return IS_ERR(nobj) ? PTR_ERR(nobj) : 0;
}
/*
 * Map a GEM object's DMA memory into a prepared VMA.
 *
 * Clears VM_PFNMAP and resets vm_pgoff because dma_mmap_attrs() maps
 * from the start of the allocation, while the incoming vm_pgoff still
 * encodes the fake GEM mmap offset. On failure the VMA's GEM setup is
 * unwound with drm_gem_vm_close().
 */
static int32_t nvdla_drm_gem_object_mmap(struct drm_gem_object *dobj,
					struct vm_area_struct *vma)
{
	int32_t ret;
	struct nvdla_gem_object *nobj = to_nvdla_obj(dobj);
	struct drm_device *drm = dobj->dev;

	vma->vm_flags &= ~VM_PFNMAP;
	vma->vm_pgoff = 0;

	ret = dma_mmap_attrs(drm->dev, vma, nobj->kvaddr, nobj->dma_addr,
				dobj->size, nobj->dma_attrs);
	if (ret)
		drm_gem_vm_close(vma);

	return ret;
}
/*
 * PRIME mmap entry point: perform generic GEM mmap bookkeeping, then
 * install the DMA-backed mapping.
 */
static int32_t nvdla_drm_gem_mmap_buf(struct drm_gem_object *obj,
					struct vm_area_struct *vma)
{
	int32_t err = drm_gem_mmap_obj(obj, obj->size, vma);

	if (err)
		return err;

	return nvdla_drm_gem_object_mmap(obj, vma);
}
/*
 * File-ops mmap entry point: let the DRM core resolve the fake offset
 * to a GEM object, then install the DMA-backed mapping.
 */
static int32_t nvdla_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
{
	int32_t err = drm_gem_mmap(filp, vma);

	if (err)
		return err;

	/* drm_gem_mmap() stashed the GEM object in vm_private_data. */
	return nvdla_drm_gem_object_mmap(vma->vm_private_data, vma);
}
/*
 * Resolve a PRIME file descriptor to the buffer's DMA address.
 * Returns 0 and stores the address in *addr, or a negative errno.
 */
int32_t nvdla_gem_dma_addr(struct drm_device *dev, struct drm_file *file,
			uint32_t fd, dma_addr_t *addr)
{
	struct drm_gem_object *dobj;
	uint32_t handle;
	int32_t err;

	err = drm_gem_prime_fd_to_handle(dev, file, fd, &handle);
	if (err)
		return err;

	dobj = drm_gem_object_lookup(file, handle);
	if (dobj == NULL)
		return -EINVAL;

	*addr = to_nvdla_obj(dobj)->dma_addr;

	/* Drop the reference taken by the lookup. */
	drm_gem_object_put(dobj);

	return 0;
}
/*
 * DRM_IOCTL_NVDLA_GEM_MMAP handler: create (or reuse) the fake mmap
 * offset for a GEM handle and return it in args->offset so userspace
 * can mmap() the buffer through the DRM fd.
 *
 * Returns 0 on success or a negative errno.
 */
static int32_t nvdla_gem_map_offset(struct drm_device *drm, void *data,
				struct drm_file *file)
{
	int32_t ret;
	struct drm_gem_object *dobj;
	struct nvdla_gem_map_offset_args *args = data;

	dobj = drm_gem_object_lookup(file, args->handle);
	if (!dobj)
		return -EINVAL;

	ret = drm_gem_create_mmap_offset(dobj);
	if (ret)
		goto out;

	args->offset = drm_vma_node_offset_addr(&dobj->vma_node);

out:
	drm_gem_object_put(dobj);
	/*
	 * Bug fix: the original returned 0 unconditionally, silently
	 * swallowing failures from drm_gem_create_mmap_offset() and
	 * leaving args->offset uninitialized for the caller.
	 */
	return ret;
}
/*
 * DRM_IOCTL_NVDLA_GEM_DESTROY handler: drop the userspace handle,
 * which releases its reference on the underlying GEM object.
 */
static int32_t nvdla_gem_destroy(struct drm_device *drm, void *data,
				struct drm_file *file)
{
	struct nvdla_gem_destroy_args *args = data;

	return drm_gem_handle_delete(file, args->handle);
}
/*
 * File operations for the DRM device node.  Everything delegates to the
 * DRM core helpers except mmap, which routes through the DMA-backed GEM
 * mapping path above.
 */
static const struct file_operations nvdla_drm_fops = {
	.owner = THIS_MODULE,
	.open = drm_open,
	.release = drm_release,
	.unlocked_ioctl = drm_ioctl,
	.mmap = nvdla_drm_gem_mmap,
	.poll = drm_poll,
	.read = drm_read,
#ifdef CONFIG_COMPAT
	.compat_ioctl = drm_compat_ioctl,
#endif
	.llseek = noop_llseek,
};
/*
 * Driver-private ioctls, all allowed on render nodes: task submission
 * plus GEM buffer create / mmap-offset query / destroy.
 */
static const struct drm_ioctl_desc nvdla_drm_ioctls[] = {
	DRM_IOCTL_DEF_DRV(NVDLA_SUBMIT, nvdla_submit, DRM_RENDER_ALLOW),
	DRM_IOCTL_DEF_DRV(NVDLA_GEM_CREATE, nvdla_gem_create, DRM_RENDER_ALLOW),
	DRM_IOCTL_DEF_DRV(NVDLA_GEM_MMAP, nvdla_gem_map_offset, DRM_RENDER_ALLOW),
	DRM_IOCTL_DEF_DRV(NVDLA_GEM_DESTROY, nvdla_gem_destroy, DRM_RENDER_ALLOW),
};
/*
 * DRM driver description: a GEM-capable render node.  PRIME
 * import/export uses the core helpers; gem_prime_mmap is overridden so
 * imported buffers also map through the DMA-attrs path.
 */
static struct drm_driver nvdla_drm_driver = {
	.driver_features = DRIVER_GEM | DRIVER_RENDER,
	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
	.gem_prime_import = drm_gem_prime_import,
	.gem_prime_mmap = nvdla_drm_gem_mmap_buf,
	.ioctls = nvdla_drm_ioctls,
	.num_ioctls = ARRAY_SIZE(nvdla_drm_ioctls),
	.fops = &nvdla_drm_fops,
	.name = "nvdla",
	.desc = "NVDLA driver",
	.date = "20171017",
	.major = 0,
	.minor = 0,
	.patchlevel = 0,
};
/*
 * Create and register the DRM device for an NVDLA instance, then
 * declare the coherent DMA pool the accelerator allocates from: either
 * the "memory-region" DT node, or a hard-coded 1 GiB fallback window at
 * 0xC0000000 (board-specific; see TODO below).
 *
 * Returns 0 on success or a negative errno.
 *
 * Fixes over the original:
 *  - of_parse_phandle() takes a reference on the node; it is now
 *    released with of_node_put() instead of leaking.
 *  - a failure after drm_dev_register() now unregisters the device
 *    before dropping the reference; previously it stayed registered.
 */
int32_t nvdla_drm_probe(struct nvdla_device *nvdla_dev)
{
	int32_t err;
	struct drm_device *drm;
	struct drm_driver *driver = &nvdla_drm_driver;
	struct resource res_cma;
	struct device_node *node;

	drm = drm_dev_alloc(driver, &nvdla_dev->pdev->dev);
	if (IS_ERR(drm))
		return PTR_ERR(drm);

	nvdla_dev->drm = drm;

	err = drm_dev_register(drm, 0);
	if (err < 0)
		goto unref;

	/**
	 * TODO Register separate driver for memory and use DT node to
	 * read memory range
	 */
	node = of_parse_phandle(drm->dev->of_node, "memory-region", 0);
	if (node) {
		dev_info(drm->dev, "Get mem from memory-region\n");
		of_address_to_resource(node, 0, &res_cma);
		of_node_put(node);
		err = dma_declare_coherent_memory(drm->dev, res_cma.start,
						res_cma.start,
						resource_size(&res_cma));
	} else {
		dev_info(drm->dev, "NVDLA using the default mem.\n");
		err = dma_declare_coherent_memory(drm->dev, 0xC0000000,
						0xC0000000, 0x40000000);
	}
	if (err < 0)
		goto unregister;

	return 0;

unregister:
	drm_dev_unregister(drm);
unref:
	drm_dev_put(drm);
	return err;
}
/*
 * Tear down the DRM device created by nvdla_drm_probe(): unregister it
 * from userspace first, then drop the final reference.
 *
 * NOTE(review): the coherent memory declared in probe is not released
 * here — confirm whether dma_release_coherent_memory() is handled
 * elsewhere or is needed.
 */
void nvdla_drm_remove(struct nvdla_device *nvdla_dev)
{
	drm_dev_unregister(nvdla_dev->drm);
	drm_dev_put(nvdla_dev->drm);
}

528
drivers/nvdla/pdp.c Normal file
View file

@ -0,0 +1,528 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "common.h"
#include "dla_engine_internal.h"
#include "engine_debug.h"
/* Upper bound on the width-wise split count a PDP op may request
 * (checked in vaildate_pdp_configs()). */
#define MAX_SPLIT_NUM 64
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a[0])))
#endif
/* Descriptor memory type index -> PDP-RDMA source RAM selector (MC/CV). */
static const uint8_t map_ram[] = {
	FIELD_ENUM(PDP_RDMA_D_SRC_RAM_CFG_0, SRC_RAM_TYPE, MC),
	FIELD_ENUM(PDP_RDMA_D_SRC_RAM_CFG_0, SRC_RAM_TYPE, CV),
};
/* pdp_op->pool_mode index -> hardware pooling-method field
 * (average, max, min — in that order). */
static const uint8_t map_pool[] = {
	FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0,
			POOLING_METHOD, POOLING_METHOD_AVERAGE),
	FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0,
			POOLING_METHOD, POOLING_METHOD_MAX),
	FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0,
			POOLING_METHOD, POOLING_METHOD_MIN),
};
/* pdp_op->precision index -> input data format field (INT8/INT16/FP16). */
static const uint8_t map_precision[] = {
	FIELD_ENUM(PDP_D_DATA_FORMAT_0, INPUT_DATA, INT8),
	FIELD_ENUM(PDP_D_DATA_FORMAT_0, INPUT_DATA, INT16),
	FIELD_ENUM(PDP_D_DATA_FORMAT_0, INPUT_DATA, FP16),
};
/* Zero-based kernel size index -> KERNEL_WIDTH field value (1..8).
 * The same table is reused for kernel height. */
static const uint8_t map_pool_kernel[] = {
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_1),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_2),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_3),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_4),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_5),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_6),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_7),
	FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_8),
};
/* The reciprocal of kernel width: 1/1, 1/2, 1/3, ... */
/* Row 0 is used for INT8/INT16, row 1 for FP16 (selected in
 * processor_pdp_program()).  Row 1 values appear to be an FP16-style
 * hardware encoding of the same reciprocals — confirm against HW spec. */
static const uint32_t recip_kernel_size[2][8] = {
	/*
	 * INT8/16
	 * 1      1/2     1/3     1/4     1/5     1/6     1/7     1/8
	 */
	{0x10000, 0x8000, 0x5555, 0x4000, 0x3333, 0x2aaa, 0x2492, 0x2000},
	{0x7c00, 0x7800, 0x7555, 0x7400, 0x7266, 0x7155, 0x7092, 0x7000},
};
#if STAT_ENABLE
/*
 * Capture end-of-operation performance data for PDP: write-stall cycle
 * count from the perf register and wall-clock runtime measured from
 * group->start_time (set when the op was enabled).
 */
void
dla_pdp_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group)
{
	uint64_t end_time = 0;
	struct dla_pdp_stat_desc *pdp_stat;
	pdp_stat = &processor->stat_data_desc->pdp_stat;
	end_time = dla_get_time_us();
	pdp_stat->write_stall = pdp_reg_read(D_PERF_WRITE_STALL);
	pdp_stat->runtime = (uint32_t)(end_time - group->start_time);
}
/* Emit the statistics gathered by dla_pdp_stat_data() to the debug log. */
void
dla_pdp_dump_stat(struct dla_processor *processor)
{
	struct dla_pdp_stat_desc *pdp_stat;
	pdp_stat = &processor->stat_data_desc->pdp_stat;
	dla_debug_pdp_stats(pdp_stat);
}
#endif /* STAT_ENABLE */
/*
 * Translate a memory type into the PDP flying-mode field: hardware
 * (on-the-fly from SDP) input uses ON_FLYING, memory input OFF_FLYING.
 */
static uint32_t
get_fly_mode(uint8_t type)
{
	if (type == DLA_MEM_HW)
		return FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0,
					FLYING_MODE, ON_FLYING);

	return FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0,
				FLYING_MODE, OFF_FLYING);
}
/*
 * Select which ping-pong register group subsequent PDP (and PDP-RDMA)
 * programming writes target.
 */
void
dla_pdp_set_producer(int32_t group_id, int32_t rdma_group_id)
{
	dla_trace("Enter: %s", __func__);
	dla_debug("group id %d rdma id %d\n", group_id, rdma_group_id);

	pdp_reg_write(S_POINTER,
			group_id << SHIFT(PDP_S_POINTER_0, PRODUCER));
	pdp_rdma_reg_write(S_POINTER,
			rdma_group_id << SHIFT(PDP_RDMA_S_POINTER_0, PRODUCER));

	dla_trace("Exit: %s", __func__);
}
int
dla_pdp_enable(struct dla_processor_group *group)
{
int32_t ret = 0;
uint32_t reg;
struct dla_engine *engine = dla_get_engine();
dla_trace("Enter: %s", __func__);
if (!group) {
ret = ERR(INVALID_INPUT);
goto exit;
}
if (engine->stat_enable == (uint32_t)1) {
reg = FIELD_ENUM(PDP_D_PERF_ENABLE_0, DMA_EN, ENABLE);
pdp_reg_write(D_PERF_ENABLE, reg);
group->start_time = dla_get_time_us();
}
dla_debug("rdma needed %u\n", group->is_rdma_needed);
/**
* enable all sub-modules
*/
if (group->is_rdma_needed) {
reg = FIELD_ENUM(PDP_RDMA_D_OP_ENABLE_0, OP_EN, ENABLE);
pdp_rdma_reg_write(D_OP_ENABLE, reg);
}
reg = FIELD_ENUM(PDP_D_OP_ENABLE_0, OP_EN, ENABLE);
pdp_reg_write(D_OP_ENABLE, reg);
exit:
dla_trace("Exit: %s", __func__);
RETURN(ret);
}
/*
 * Decide whether this PDP op needs the RDMA sub-unit: required exactly
 * when the input does not arrive on-the-fly from hardware.
 */
void
dla_pdp_rdma_check(struct dla_processor_group *group)
{
	struct dla_pdp_surface_desc *surf =
			&group->surface_desc->pdp_surface;

	group->is_rdma_needed =
			(surf->src_data.type != DLA_MEM_HW) ? 1 : 0;
}
/*
 * Check that both pooling strides fall in the hardware-supported
 * range 1..8; returns 0 or INVALID_INPUT.
 */
static int
validate_strides(uint8_t stride_x, uint8_t stride_y)
{
	if (stride_x >= 1 && stride_x <= 8 &&
			stride_y >= 1 && stride_y <= 8)
		RETURN(0);

	dla_error("Invalid Stride (x[%d], y[%d])\n", stride_x, stride_y);
	RETURN(ERR(INVALID_INPUT));
}
/*
 * Validate a PDP operation descriptor before any register is written:
 * destination type, data cubes, precision, strides, split count, and
 * the pool-kernel/pool-mode table indices.
 *
 * (The historical "vaildate" misspelling is kept: the function is
 * static and referenced by processor_pdp_program() below.)
 */
static int
vaildate_pdp_configs(struct dla_processor_group *group)
{
	int32_t ret = 0;
	struct dla_pdp_op_desc *pdp_op;
	struct dla_pdp_surface_desc *pdp_surface;
	dla_trace("Enter: %s", __func__);
	pdp_op = &group->operation_desc->pdp_op;
	pdp_surface = &group->surface_desc->pdp_surface;
	/* PDP cannot write on-the-fly; output must land in MC or CV memory. */
	if (pdp_surface->dst_data.type == DLA_MEM_HW) {
		dla_error("Destination buffer for PDP has to be either MC or CV");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	ret = validate_data_cube(pdp_surface->src_data, pdp_surface->dst_data,
				DLA_MEM_HW);
	if (ret)
		goto exit;
	ret = validate_precision(pdp_op->precision, ARRAY_SIZE(map_precision));
	if (ret)
		goto exit;
	ret = validate_strides(pdp_op->stride_x, pdp_op->stride_y);
	if (ret)
		goto exit;
	if (pdp_op->split_num > MAX_SPLIT_NUM) {
		dla_error("Invalid split_num: %u\n", pdp_op->split_num);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	/* pool_width/height/mode index the map_* tables; bound-check them
	 * so the programming step cannot read out of bounds. */
	if (pdp_op->pool_width >= ARRAY_SIZE(map_pool_kernel)) {
		dla_error("Invalid pool_width: %u\n", pdp_op->pool_width);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	if (pdp_op->pool_height >= ARRAY_SIZE(map_pool_kernel)) {
		dla_error("Invalid pool_height: %u\n", pdp_op->pool_height);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
	if (pdp_op->pool_mode >= ARRAY_SIZE(map_pool)) {
		dla_error("Invalid pool_mode: %u\n", pdp_op->pool_mode);
		ret = ERR(INVALID_INPUT);
		goto exit;
	}
exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Program every PDP (and, for memory-sourced input, PDP-RDMA) register
 * for one pooling operation: validate the descriptors, resolve source
 * and destination DMA addresses, then write cube geometry, operation
 * mode, partial widths, kernel/stride/padding configuration and
 * input/output addressing.  The enable step is separate
 * (dla_pdp_enable()).
 */
static int
processor_pdp_program(struct dla_processor_group *group)
{
	int32_t ret = 0;
	uint32_t reg, high, low;
	uint64_t input_address = 0;
	uint64_t output_address = 0;
	struct dla_engine *engine = dla_get_engine();
	struct dla_pdp_op_desc *pdp_op;
	struct dla_pdp_surface_desc *pdp_surface;
	dla_trace("Enter: %s", __func__);
	pdp_op = &group->operation_desc->pdp_op;
	pdp_surface = &group->surface_desc->pdp_surface;
	/* Reject bad descriptors before touching any registers. */
	ret = vaildate_pdp_configs(group);
	if (ret)
		goto exit;
	ret = dla_read_input_address(&pdp_surface->src_data,
					&input_address,
					group->op_desc->index,
					group->roi_index,
					1);
	if (ret)
		goto exit;
	/* address == -1 marks "no buffer"; otherwise resolve the
	 * handle+offset pair to a DMA address. */
	if (pdp_surface->dst_data.address != -1)
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					pdp_surface->dst_data.address,
					pdp_surface->dst_data.offset,
					(void *)&output_address,
					DESTINATION_DMA);
	if (pdp_surface->src_data.type != DLA_MEM_HW) {
		/* PDP RDMA: input is fetched from MC/CV memory, so the
		 * read-DMA unit needs the full input geometry too. */
		pdp_rdma_reg_write(D_DATA_CUBE_IN_WIDTH,
				pdp_surface->src_data.width - 1);
		pdp_rdma_reg_write(D_DATA_CUBE_IN_HEIGHT,
				pdp_surface->src_data.height - 1);
		pdp_rdma_reg_write(D_DATA_CUBE_IN_CHANNEL,
				pdp_surface->src_data.channel - 1);
		high = HIGH32BITS(input_address);
		low = LOW32BITS(input_address);
		pdp_rdma_reg_write(D_SRC_BASE_ADDR_HIGH, high);
		pdp_rdma_reg_write(D_SRC_BASE_ADDR_LOW, low);
		pdp_rdma_reg_write(D_SRC_LINE_STRIDE,
				pdp_surface->src_data.line_stride);
		pdp_rdma_reg_write(D_SRC_SURFACE_STRIDE,
				pdp_surface->src_data.surf_stride);
		reg = (map_precision[pdp_op->precision]
			<< SHIFT(PDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA));
		pdp_rdma_reg_write(D_DATA_FORMAT, reg);
		reg = map_ram[pdp_surface->src_data.type]
			<< SHIFT(PDP_RDMA_D_SRC_RAM_CFG_0, SRC_RAM_TYPE);
		pdp_rdma_reg_write(D_SRC_RAM_CFG, reg);
		reg = ((pdp_op->split_num - 1)
			<< SHIFT(PDP_RDMA_D_OPERATION_MODE_CFG_0, SPLIT_NUM));
		pdp_rdma_reg_write(D_OPERATION_MODE_CFG, reg);
		reg = (map_pool_kernel[pdp_op->pool_width]
			<< SHIFT(PDP_RDMA_D_POOLING_KERNEL_CFG_0,
						KERNEL_WIDTH)) |
			((pdp_op->stride_x - 1)
			<< SHIFT(PDP_RDMA_D_POOLING_KERNEL_CFG_0,
						KERNEL_STRIDE_WIDTH));
		pdp_rdma_reg_write(D_POOLING_KERNEL_CFG, reg);
		reg = (pdp_op->pad_left
			<< SHIFT(PDP_RDMA_D_POOLING_PADDING_CFG_0, PAD_WIDTH));
		pdp_rdma_reg_write(D_POOLING_PADDING_CFG, reg);
		/* Partial widths are programmed as value-1; 0 stays 0 to
		 * avoid underflow when a segment is absent. */
		reg = ((pdp_op->partial_in_width_first == 0 ? 0 :
				pdp_op->partial_in_width_first - 1)
			<< SHIFT(PDP_RDMA_D_PARTIAL_WIDTH_IN_0,
					PARTIAL_WIDTH_IN_FIRST)) |
			((pdp_op->partial_in_width_mid == 0 ? 0 :
				pdp_op->partial_in_width_mid - 1)
			<< SHIFT(PDP_RDMA_D_PARTIAL_WIDTH_IN_0,
					PARTIAL_WIDTH_IN_MID)) |
			((pdp_op->partial_in_width_last == 0 ? 0 :
				pdp_op->partial_in_width_last - 1)
			<< SHIFT(PDP_RDMA_D_PARTIAL_WIDTH_IN_0,
					PARTIAL_WIDTH_IN_LAST));
		pdp_rdma_reg_write(D_PARTIAL_WIDTH_IN, reg);
	} else {
		/* On-the-fly input cannot be split. */
		ASSERT_GOTO(pdp_op->split_num == 1, ret,
				ERR(INVALID_INPUT), exit);
	}
	/* Input / output cube geometry (all fields are value-1). */
	reg = ((pdp_surface->src_data.width - 1)
		<< SHIFT(PDP_D_DATA_CUBE_IN_WIDTH_0, CUBE_IN_WIDTH));
	pdp_reg_write(D_DATA_CUBE_IN_WIDTH, reg);
	reg = ((pdp_surface->src_data.height - 1)
		<< SHIFT(PDP_D_DATA_CUBE_IN_HEIGHT_0, CUBE_IN_HEIGHT));
	pdp_reg_write(D_DATA_CUBE_IN_HEIGHT, reg);
	reg = ((pdp_surface->src_data.channel - 1)
		<< SHIFT(PDP_D_DATA_CUBE_IN_CHANNEL_0, CUBE_IN_CHANNEL));
	pdp_reg_write(D_DATA_CUBE_IN_CHANNEL, reg);
	reg = ((pdp_surface->dst_data.width - 1)
		<< SHIFT(PDP_D_DATA_CUBE_OUT_WIDTH_0, CUBE_OUT_WIDTH));
	pdp_reg_write(D_DATA_CUBE_OUT_WIDTH, reg);
	reg = ((pdp_surface->dst_data.height - 1)
		<< SHIFT(PDP_D_DATA_CUBE_OUT_HEIGHT_0, CUBE_OUT_HEIGHT));
	pdp_reg_write(D_DATA_CUBE_OUT_HEIGHT, reg);
	reg = ((pdp_surface->dst_data.channel - 1)
		<< SHIFT(PDP_D_DATA_CUBE_OUT_CHANNEL_0, CUBE_OUT_CHANNEL));
	pdp_reg_write(D_DATA_CUBE_OUT_CHANNEL, reg);
	/* Pooling method, flying mode and split count. */
	reg = (map_pool[pdp_op->pool_mode]
		<< SHIFT(PDP_D_OPERATION_MODE_CFG_0, POOLING_METHOD)) |
		(get_fly_mode(pdp_surface->src_data.type)
		<< SHIFT(PDP_D_OPERATION_MODE_CFG_0, FLYING_MODE)) |
		((pdp_op->split_num - 1)
		<< SHIFT(PDP_D_OPERATION_MODE_CFG_0, SPLIT_NUM));
	pdp_reg_write(D_OPERATION_MODE_CFG, reg);
	reg = ((pdp_op->partial_in_width_first == 0 ? 0 :
			pdp_op->partial_in_width_first-1)
		<< SHIFT(PDP_D_PARTIAL_WIDTH_IN_0, PARTIAL_WIDTH_IN_FIRST)) |
		((pdp_op->partial_in_width_mid == 0 ? 0 :
			pdp_op->partial_in_width_mid-1)
		<< SHIFT(PDP_D_PARTIAL_WIDTH_IN_0, PARTIAL_WIDTH_IN_MID)) |
		((pdp_op->partial_in_width_last == 0 ? 0 :
			pdp_op->partial_in_width_last-1)
		<< SHIFT(PDP_D_PARTIAL_WIDTH_IN_0, PARTIAL_WIDTH_IN_LAST));
	pdp_reg_write(D_PARTIAL_WIDTH_IN, reg);
	reg = ((pdp_op->partial_width_first == 0 ? 0 :
			pdp_op->partial_width_first-1)
		<< SHIFT(PDP_D_PARTIAL_WIDTH_OUT_0, PARTIAL_WIDTH_OUT_FIRST)) |
		((pdp_op->partial_width_mid == 0 ? 0 :
			pdp_op->partial_width_mid-1)
		<< SHIFT(PDP_D_PARTIAL_WIDTH_OUT_0, PARTIAL_WIDTH_OUT_MID)) |
		((pdp_op->partial_width_last == 0 ? 0 :
			pdp_op->partial_width_last-1)
		<< SHIFT(PDP_D_PARTIAL_WIDTH_OUT_0, PARTIAL_WIDTH_OUT_LAST));
	pdp_reg_write(D_PARTIAL_WIDTH_OUT, reg);
	reg = (map_pool_kernel[pdp_op->pool_width]
		<< SHIFT(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH)) |
		(map_pool_kernel[pdp_op->pool_height]
		<< SHIFT(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_HEIGHT))|
		((pdp_op->stride_x - 1)
		<< SHIFT(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_STRIDE_WIDTH)) |
		((pdp_op->stride_y - 1)
		<< SHIFT(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_STRIDE_HEIGHT));
	pdp_reg_write(D_POOLING_KERNEL_CFG, reg);
	/* Reciprocals feed average pooling; row selected by precision. */
	pdp_reg_write(D_RECIP_KERNEL_WIDTH,
			recip_kernel_size[pdp_op->precision ==
					PRECISION_FP16][pdp_op->pool_width]);
	pdp_reg_write(D_RECIP_KERNEL_HEIGHT,
			recip_kernel_size[pdp_op->precision ==
					PRECISION_FP16][pdp_op->pool_height]);
	reg = (pdp_op->pad_left
		<< SHIFT(PDP_D_POOLING_PADDING_CFG_0, PAD_LEFT)) |
		(pdp_op->pad_right
		<< SHIFT(PDP_D_POOLING_PADDING_CFG_0, PAD_RIGHT)) |
		(pdp_op->pad_top
		<< SHIFT(PDP_D_POOLING_PADDING_CFG_0, PAD_TOP)) |
		(pdp_op->pad_bottom
		<< SHIFT(PDP_D_POOLING_PADDING_CFG_0, PAD_BOTTOM));
	/* FP16 requires all padding values to be zero. */
	if (pdp_op->precision == PRECISION_FP16) {
		int32_t i;
		for (i = 0; i < 7; i++)
			ASSERT_GOTO(pdp_op->padding_value[i] == 0, ret,
					ERR(INVALID_INPUT), exit);
	}
	pdp_reg_write(D_POOLING_PADDING_CFG, reg);
	pdp_reg_write(D_POOLING_PADDING_VALUE_1_CFG, pdp_op->padding_value[0]);
	pdp_reg_write(D_POOLING_PADDING_VALUE_2_CFG, pdp_op->padding_value[1]);
	pdp_reg_write(D_POOLING_PADDING_VALUE_3_CFG, pdp_op->padding_value[2]);
	pdp_reg_write(D_POOLING_PADDING_VALUE_4_CFG, pdp_op->padding_value[3]);
	pdp_reg_write(D_POOLING_PADDING_VALUE_5_CFG, pdp_op->padding_value[4]);
	pdp_reg_write(D_POOLING_PADDING_VALUE_6_CFG, pdp_op->padding_value[5]);
	pdp_reg_write(D_POOLING_PADDING_VALUE_7_CFG, pdp_op->padding_value[6]);
	if (pdp_surface->src_data.type != DLA_MEM_HW) {
		pdp_reg_write(D_SRC_LINE_STRIDE,
				pdp_surface->src_data.line_stride);
		pdp_reg_write(D_SRC_SURFACE_STRIDE,
				pdp_surface->src_data.surf_stride);
	}
	/* Destination addressing and output data format. */
	high = HIGH32BITS(output_address);
	low = LOW32BITS(output_address);
	pdp_reg_write(D_DST_BASE_ADDR_LOW, low);
	pdp_reg_write(D_DST_BASE_ADDR_HIGH, high);
	pdp_reg_write(D_DST_LINE_STRIDE, pdp_surface->dst_data.line_stride);
	pdp_reg_write(D_DST_SURFACE_STRIDE, pdp_surface->dst_data.surf_stride);
	reg = (map_ram[pdp_surface->dst_data.type]
		<< SHIFT(PDP_D_DST_RAM_CFG_0, DST_RAM_TYPE));
	pdp_reg_write(D_DST_RAM_CFG, reg);
	reg = (map_precision[pdp_op->precision]
		<< SHIFT(PDP_D_DATA_FORMAT_0, INPUT_DATA));
	pdp_reg_write(D_DATA_FORMAT, reg);
exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Readiness hook for the scheduler: PDP imposes no additional
 * programming preconditions, so always report ready.
 */
int
dla_pdp_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group)
{
	return 1;
}
/*
 * Log the surface and operation descriptors of a PDP group for
 * debugging.
 */
void
dla_pdp_dump_config(struct dla_processor_group *group)
{
	struct dla_pdp_op_desc *op = &group->operation_desc->pdp_op;
	struct dla_pdp_surface_desc *surf =
			&group->surface_desc->pdp_surface;

	dla_debug_pdp_surface_desc(surf, group->roi_index);
	dla_debug_pdp_op_desc(op, group->roi_index);
}
int
dla_pdp_program(struct dla_processor_group *group)
{
int32_t ret;
dla_trace("Enter: %s", __func__);
if (!group) {
ret = ERR(INVALID_INPUT);
goto exit;
}
dla_enable_intr(MASK(GLB_S_INTR_MASK_0, PDP_DONE_MASK1) |
MASK(GLB_S_INTR_MASK_0, PDP_DONE_MASK0));
ret = processor_pdp_program(group);
if (ret)
goto exit;
exit:
dla_trace("Exit: %s", __func__);
RETURN(ret);
}

292
drivers/nvdla/rubik.c Normal file
View file

@ -0,0 +1,292 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_err.h>
#include <dla_interface.h>
#include "common.h"
#include "dla_engine_internal.h"
#include "engine_debug.h"
/* rubik_op->mode index -> RUBIK_MODE field (contract/split/merge).
 * NOTE(review): these tables could be const, as in pdp.c/sdp.c. */
static uint8_t map_rubik_mode[] = {
	FIELD_ENUM(RBK_D_MISC_CFG_0, RUBIK_MODE, CONTRACT),
	FIELD_ENUM(RBK_D_MISC_CFG_0, RUBIK_MODE, SPLIT),
	FIELD_ENUM(RBK_D_MISC_CFG_0, RUBIK_MODE, MERGE),
};
/* Descriptor memory type index -> RUBIK RAM interface selector. */
static uint8_t map_ram_type[] = {
	FIELD_ENUM(RBK_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE, MCIF),
	FIELD_ENUM(RBK_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE, CVIF),
};
/* rubik_op->precision index -> input precision field (INT8/INT16/FP16). */
static uint8_t map_precision[] = {
	FIELD_ENUM(RBK_D_MISC_CFG_0, IN_PRECISION, INT8),
	FIELD_ENUM(RBK_D_MISC_CFG_0, IN_PRECISION, INT16),
	FIELD_ENUM(RBK_D_MISC_CFG_0, IN_PRECISION, FP16),
};
/* rubik_op->precision index -> bytes-per-element, used for the
 * contract-stride computation in processor_rubik_program(). */
static uint8_t map_bpe[] = {
	BPE_PRECISION_INT8,
	BPE_PRECISION_INT16,
	BPE_PRECISION_FP16,
};
#if STAT_ENABLE
/*
 * Capture end-of-operation performance data for RUBIK: read/write
 * stall cycle counters and wall-clock runtime measured from
 * group->start_time (set in dla_rubik_enable()).
 */
void
dla_rubik_stat_data(struct dla_processor *processor,
			struct dla_processor_group *group)
{
	uint64_t end_time = 0;
	struct dla_rubik_stat_desc *rubik_stat;
	rubik_stat = &processor->stat_data_desc->rubik_stat;
	end_time = dla_get_time_us();
	rubik_stat->read_stall = rubik_reg_read(D_PERF_READ_STALL);
	rubik_stat->write_stall = rubik_reg_read(D_PERF_WRITE_STALL);
	rubik_stat->runtime = (uint32_t)(end_time - group->start_time);
}
/* Emit the statistics gathered by dla_rubik_stat_data() to the debug log. */
void
dla_rubik_dump_stat(struct dla_processor *processor)
{
	struct dla_rubik_stat_desc *rubik_stat;
	rubik_stat = &processor->stat_data_desc->rubik_stat;
	dla_debug_rubik_stats(rubik_stat);
}
#endif /* STAT_ENABLE */
/*
 * Select which ping-pong register group subsequent RUBIK programming
 * writes target.  RUBIK has no RDMA sub-module, so the second argument
 * is unused; it exists only to match the common set_producer signature.
 *
 * Fix: the unused parameter was named "__unused" — identifiers with a
 * leading double underscore are reserved for the implementation in C,
 * so it is renamed (callers are unaffected by a parameter rename).
 */
void
dla_rubik_set_producer(int32_t group_id, int32_t rdma_group_id_unused)
{
	uint32_t reg;

	/**
	 * set producer pointer for all sub-modules
	 */
	reg = group_id << SHIFT(RBK_S_POINTER_0, PRODUCER);
	rubik_reg_write(S_POINTER, reg);
}
/*
 * Kick off a programmed RUBIK operation: optionally arm the
 * performance counters, then enable the RUBIK core.  Always returns 0.
 */
int
dla_rubik_enable(struct dla_processor_group *group)
{
	struct dla_engine *engine = dla_get_engine();

	dla_trace("Enter: %s", __func__);

	if (engine->stat_enable == (uint32_t)1) {
		rubik_reg_write(D_PERF_ENABLE, 1);
		group->start_time = dla_get_time_us();
	}

	/* RUBIK has a single sub-module; enable it. */
	rubik_reg_write(D_OP_ENABLE,
			FIELD_ENUM(RBK_D_OP_ENABLE_0, OP_EN, ENABLE));

	dla_trace("Exit: %s", __func__);
	RETURN(0);
}
/*
 * RDMA-needed hook for the scheduler: RUBIK never uses the shared
 * RDMA path, so the flag is always cleared.
 */
void
dla_rubik_rdma_check(struct dla_processor_group *group)
{
	group->is_rdma_needed = 0;
}
/*
 * Program every RUBIK register for one operation: validate the memory
 * types, resolve source/destination DMA addresses, then write mode,
 * precision, input/output geometry, strides and addressing.  MERGE
 * mode uses a planar input stride, CONTRACT mode additionally needs
 * the contract/deconvolution stride registers, and SPLIT mode uses a
 * planar output stride.
 */
static int32_t
processor_rubik_program(struct dla_processor_group *group)
{
	int32_t ret = 0;
	uint32_t reg, high, low;
	uint64_t input_address = 0;
	uint64_t output_address = 0;
	struct dla_engine *engine = dla_get_engine();
	struct dla_rubik_op_desc *rubik_op;
	struct dla_rubik_surface_desc *rubik_surface;
	dla_trace("Enter: %s", __func__);
	rubik_op = &group->operation_desc->rubik_op;
	rubik_surface = &group->surface_desc->rubik_surface;
	/* Argument check */
	/* RUBIK cannot read or write on-the-fly; both ends must be memory. */
	ASSERT_GOTO((rubik_surface->src_data.type != DLA_MEM_HW),
			ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO((rubik_surface->dst_data.type != DLA_MEM_HW),
			ret, ERR(INVALID_INPUT), exit);
	/* get the addresses from task descriptor */
	ret = dla_read_input_address(&rubik_surface->src_data,
					&input_address,
					group->op_desc->index,
					group->roi_index,
					1);
	if (ret)
		goto exit;
	/* NOTE(review): return value ignored here, and dst_data.address is
	 * not checked against -1 as pdp.c does — confirm intent. */
	dla_get_dma_cube_address(engine->driver_context,
				engine->task->task_data,
				rubik_surface->dst_data.address,
				rubik_surface->dst_data.offset,
				(void *)&output_address,
				DESTINATION_DMA);
	/* config rubik */
	reg = (((uint32_t)map_rubik_mode[rubik_op->mode]) <<
			SHIFT(RBK_D_MISC_CFG_0, RUBIK_MODE)) |
		(((uint32_t)map_precision[rubik_op->precision]) <<
			SHIFT(RBK_D_MISC_CFG_0, IN_PRECISION));
	rubik_reg_write(D_MISC_CFG, reg);
	reg = (((uint32_t)map_ram_type[rubik_surface->src_data.type]) <<
			SHIFT(RBK_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE));
	rubik_reg_write(D_DAIN_RAM_TYPE, reg);
	/* Input geometry (fields are value-1). */
	reg = ((rubik_surface->src_data.width-1) <<
			SHIFT(RBK_D_DATAIN_SIZE_0_0, DATAIN_WIDTH)) |
		((rubik_surface->src_data.height-1) <<
			SHIFT(RBK_D_DATAIN_SIZE_0_0, DATAIN_HEIGHT));
	rubik_reg_write(D_DATAIN_SIZE_0, reg);
	reg = ((rubik_surface->src_data.channel-1) <<
			SHIFT(RBK_D_DATAIN_SIZE_1_0, DATAIN_CHANNEL));
	rubik_reg_write(D_DATAIN_SIZE_1, reg);
	high = HIGH32BITS(input_address);
	low = LOW32BITS(input_address);
	rubik_reg_write(D_DAIN_ADDR_LOW, low);
	rubik_reg_write(D_DAIN_ADDR_HIGH, high);
	if (rubik_op->mode == RUBIK_MODE_MERGE) {
		/* MERGE reads planar input; the plane stride must be a
		 * non-zero multiple of 32 bytes. */
		ASSERT_GOTO((rubik_surface->src_data.plane_stride != 0),
				ret, ERR(INVALID_INPUT), exit);
		ASSERT_GOTO(((rubik_surface->src_data.plane_stride&0x1F) == 0),
				ret, ERR(INVALID_INPUT), exit);
		rubik_reg_write(D_DAIN_PLANAR_STRIDE,
				rubik_surface->src_data.plane_stride);
	} else {
		rubik_reg_write(D_DAIN_SURF_STRIDE,
				rubik_surface->src_data.surf_stride);
	}
	rubik_reg_write(D_DAIN_LINE_STRIDE,
			rubik_surface->src_data.line_stride);
	reg = (((uint32_t)map_ram_type[rubik_surface->dst_data.type]) <<
			SHIFT(RBK_D_DAOUT_RAM_TYPE_0, DATAOUT_RAM_TYPE));
	rubik_reg_write(D_DAOUT_RAM_TYPE, reg);
	reg = ((rubik_surface->dst_data.channel-1) <<
			SHIFT(RBK_D_DATAOUT_SIZE_1_0, DATAOUT_CHANNEL));
	rubik_reg_write(D_DATAOUT_SIZE_1, reg);
	high = HIGH32BITS(output_address);
	low = LOW32BITS(output_address);
	rubik_reg_write(D_DAOUT_ADDR_LOW, low);
	rubik_reg_write(D_DAOUT_ADDR_HIGH, high);
	rubik_reg_write(D_DAOUT_LINE_STRIDE,
			rubik_surface->dst_data.line_stride);
	if (rubik_op->mode != RUBIK_MODE_SPLIT) {
		rubik_reg_write(D_DAOUT_SURF_STRIDE,
				rubik_surface->dst_data.surf_stride);
		if (rubik_op->mode == RUBIK_MODE_CONTRACT) {
			/* Surfaces per output atom: ceil(channels*bpe/32),
			 * times the input surface stride. */
			reg = ((rubik_surface->dst_data.channel *
				map_bpe[rubik_op->precision] + 31) >> 5) *
				rubik_surface->src_data.surf_stride;
			rubik_reg_write(D_CONTRACT_STRIDE_0, reg);
			reg = rubik_op->stride_y *
				rubik_surface->dst_data.line_stride;
			rubik_reg_write(D_CONTRACT_STRIDE_1, reg);
			reg = (((uint32_t)(rubik_op->stride_x-1)) <<
				SHIFT(RBK_D_DECONV_STRIDE_0, DECONV_X_STRIDE)) |
				(((uint32_t)(rubik_op->stride_y-1)) <<
				SHIFT(RBK_D_DECONV_STRIDE_0, DECONV_Y_STRIDE));
			rubik_reg_write(D_DECONV_STRIDE, reg);
		}
	} else {
		/* SPLIT writes planar output. */
		rubik_reg_write(D_DAOUT_PLANAR_STRIDE,
				rubik_surface->dst_data.plane_stride);
	}
exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Readiness hook for the scheduler: RUBIK imposes no additional
 * programming preconditions, so always report ready.
 */
int
dla_rubik_is_ready(struct dla_processor *processor,
			struct dla_processor_group *group)
{
	return 1;
}
/*
 * Log the surface and operation descriptors of a RUBIK group for
 * debugging.
 */
void
dla_rubik_dump_config(struct dla_processor_group *group)
{
	struct dla_rubik_op_desc *op = &group->operation_desc->rubik_op;
	struct dla_rubik_surface_desc *surf =
			&group->surface_desc->rubik_surface;

	dla_debug_rubik_surface_desc(surf, group->roi_index);
	dla_debug_rubik_op_desc(op, group->roi_index);
}
/*
 * Top-level RUBIK programming entry: reject configurations without a
 * RUBIK unit, unmask the RUBIK completion interrupts for both register
 * groups, then program the operation.
 */
int
dla_rubik_program(struct dla_processor_group *group)
{
	struct dla_engine *engine = dla_get_engine();
	int32_t ret = 0;

	dla_trace("Enter: %s", __func__);

	if (!engine->config_data->rubik_enable) {
		dla_error("RUBIK is not supported for this configuration\n");
		ret = ERR(INVALID_INPUT);
		goto exit;
	}

	dla_enable_intr(MASK(GLB_S_INTR_MASK_0, RUBIK_DONE_MASK1) |
			MASK(GLB_S_INTR_MASK_0, RUBIK_DONE_MASK0));

	ret = processor_rubik_program(group);

exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}

1160
drivers/nvdla/scheduler.c Normal file

File diff suppressed because it is too large Load diff

817
drivers/nvdla/sdp.c Normal file
View file

@ -0,0 +1,817 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <opendla.h>
#include <dla_debug.h>
#include <dla_interface.h>
#include "common.h"
#include "dla_engine_internal.h"
#include "engine_debug.h"
/* enable flag (0/1) -> BRDMA_DISABLE field; index 0 disables the unit.
 * The same encoding is reused for the N/E RDMA channels. */
static const uint8_t map_ena[] = {
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DISABLE, YES),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DISABLE, NO),
};
/* prelu flag (0/1) -> BS_MUL_PRELU field. */
static const uint8_t map_prelu[] = {
	FIELD_ENUM(SDP_D_DP_BS_CFG_0, BS_MUL_PRELU, NO),
	FIELD_ENUM(SDP_D_DP_BS_CFG_0, BS_MUL_PRELU, YES),
};
/* enable flag (0/1) -> BS_BYPASS field; enabling means "do not bypass". */
static const uint8_t map_bypass[] = {
	FIELD_ENUM(SDP_D_DP_BS_CFG_0, BS_BYPASS, YES),
	FIELD_ENUM(SDP_D_DP_BS_CFG_0, BS_BYPASS, NO),
};
/* ALU operation index -> EW_ALU_ALGO field (max/min/sum/eql). */
static const uint8_t map_alu_op[] = {
	FIELD_ENUM(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO, MAX),
	FIELD_ENUM(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO, MIN),
	FIELD_ENUM(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO, SUM),
	FIELD_ENUM(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO, EQL),
};
/* ALU operand source index -> BS_ALU_SRC field (memory or register). */
static const uint8_t map_alu_src[] = {
	FIELD_ENUM(SDP_D_DP_BS_ALU_CFG_0, BS_ALU_SRC, MEM),
	FIELD_ENUM(SDP_D_DP_BS_ALU_CFG_0, BS_ALU_SRC, REG),
};
/* flying flag (0/1) -> FLYING_MODE field (input from memory vs. CACC). */
static const uint8_t map_fly[] = {
	FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, FLYING_MODE, OFF),
	FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, FLYING_MODE, ON),
};
/* output destination index -> OUTPUT_DST field (memory or PDP). */
static const uint8_t map_dst[] = {
	FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, OUTPUT_DST, MEM),
	FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, OUTPUT_DST, PDP),
};
/* winograd flag (0/1) -> WINOGRAD field. */
static const uint8_t map_wg[] = {
	FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, WINOGRAD, OFF),
	FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, WINOGRAD, ON),
};
/* precision index -> IN_PRECISION field (INT8/INT16/FP16). */
static const uint8_t map_precision[] = {
	FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8),
	FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT16),
	FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, FP16),
};
/* [input precision][output precision] -> processing precision.
 * FP16 never mixes with integer pipelines; INT16 in with INT8 out
 * processes as INT8 (row 0, col 1). */
static const uint32_t map_proc_precision[3][3] = {
	{
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8),
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8),
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, FP16),
	},
	{
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8),
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT16),
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, FP16),
	},
	{
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8),
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT16),
		FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, FP16),
	},
};
/* op type index -> BRDMA_DATA_USE field; indices 0 and 1 both map to
 * MUL, then ALU, then BOTH. */
static const uint8_t map_op_type[] = {
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE, MUL),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE, MUL),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE, ALU),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE, BOTH),
};
/* precision index -> RDMA element size (INT8: 1 byte, else 2 bytes). */
static const uint8_t map_element_size[] = {
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_SIZE, ONE_BYTE),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_SIZE, TWO_BYTE),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_SIZE, TWO_BYTE),
};
/* operand mode index -> BRDMA_DATA_MODE field (per-element/per-kernel). */
static const uint8_t map_op_mode[] = {
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_MODE, PER_ELEMENT),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_MODE, PER_KERNEL),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_MODE, PER_ELEMENT),
};
/* memory type index -> BRDMA_RAM_TYPE field (MC/CV). */
static const uint8_t map_ram_type[] = {
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_RAM_TYPE, MC),
	FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_RAM_TYPE, CV),
};
/* enable flags (0/1) -> perf-counter enable fields. */
static const uint8_t map_perf_dma[] = {
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_DMA_EN, NO),
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_DMA_EN, YES),
};
static const uint8_t map_perf_lut[] = {
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_LUT_EN, NO),
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_LUT_EN, YES),
};
static const uint8_t map_perf_sat[] = {
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_SAT_EN, NO),
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_SAT_EN, YES),
};
static const uint8_t map_perf_nan_inf[] = {
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_NAN_INF_COUNT_EN, NO),
	FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_NAN_INF_COUNT_EN, YES),
};
#if STAT_ENABLE
/*
 * Capture SDP statistics for a completed operation: NaN/Inf counters,
 * WDMA stall cycles, and the wall-clock runtime since the group was
 * enabled (see dla_sdp_enable()).
 */
void
dla_sdp_stat_data(struct dla_processor *processor,
		struct dla_processor_group *group)
{
	struct dla_sdp_stat_desc *stat = &processor->stat_data_desc->sdp_stat;
	uint64_t now = dla_get_time_us();

	stat->nan_input_num = sdp_reg_read(D_STATUS_NAN_INPUT_NUM);
	stat->inf_input_num = sdp_reg_read(D_STATUS_INF_INPUT_NUM);
	stat->nan_output_num = sdp_reg_read(D_STATUS_NAN_OUTPUT_NUM);
	stat->wdma_write_stall = sdp_reg_read(D_PERF_WDMA_WRITE_STALL);
	stat->runtime = (uint32_t)(now - group->start_time);
}
/* Dump the previously captured SDP statistics via the debug channel. */
void
dla_sdp_dump_stat(struct dla_processor *processor)
{
	dla_debug_sdp_stats(&processor->stat_data_desc->sdp_stat);
}
#endif /* STAT_ENABLE */
/*
 * Select the register group (ping-pong bank) that subsequent CPU writes
 * will target, for both the SDP core and its RDMA companion.
 */
void
dla_sdp_set_producer(int32_t group_id, int32_t rdma_group_id)
{
	sdp_reg_write(S_POINTER,
		      group_id << SHIFT(SDP_S_POINTER_0, PRODUCER));
	sdp_rdma_reg_write(S_POINTER,
			   rdma_group_id << SHIFT(SDP_RDMA_S_POINTER_0, PRODUCER));
}
/*
 * Kick off a fully programmed SDP operation.
 *
 * When statistics collection is requested, all four performance
 * counters are enabled first and the group start time is recorded
 * (consumed later by dla_sdp_stat_data()).  The RDMA side is enabled
 * before the SDP core itself.
 *
 * Returns 0 (via the RETURN trace macro).
 */
int
dla_sdp_enable(struct dla_processor_group *group)
{
	uint32_t val;
	struct dla_engine *engine = dla_get_engine();

	dla_trace("Enter: %s", __func__);

	if (engine->stat_enable == (uint32_t)1) {
		uint8_t perf = (map_perf_dma[1] <<
				SHIFT(SDP_D_PERF_ENABLE_0, PERF_DMA_EN)) |
			       (map_perf_lut[1] <<
				SHIFT(SDP_D_PERF_ENABLE_0, PERF_LUT_EN)) |
			       (map_perf_sat[1] <<
				SHIFT(SDP_D_PERF_ENABLE_0, PERF_SAT_EN)) |
			       (map_perf_nan_inf[1] <<
				SHIFT(SDP_D_PERF_ENABLE_0, PERF_NAN_INF_COUNT_EN));

		sdp_reg_write(D_PERF_ENABLE, perf);
		group->start_time = dla_get_time_us();
	}

	/* Enable RDMA first (when this op reads operands/input from
	 * memory), then the SDP core. */
	if (group->is_rdma_needed) {
		val = FIELD_ENUM(SDP_RDMA_D_OP_ENABLE_0, OP_EN, ENABLE);
		sdp_rdma_reg_write(D_OP_ENABLE, val);
	}
	val = FIELD_ENUM(SDP_D_OP_ENABLE_0, OP_EN, ENABLE);
	sdp_reg_write(D_OP_ENABLE, val);

	dla_trace("Exit: %s", __func__);
	RETURN(0);
}
/*
 * Decide whether this SDP operation needs its RDMA companion.
 *
 * RDMA is required when the input does not fly in from the convolution
 * core (src is a memory surface), or when any enabled x1/x2/y operand
 * must be fetched from memory (i.e. is not supplied per-layer through
 * registers).  The result is cached in group->is_rdma_needed.
 */
void
dla_sdp_rdma_check(struct dla_processor_group *group)
{
	struct dla_sdp_op_desc *op = &group->operation_desc->sdp_op;
	struct dla_sdp_surface_desc *surf = &group->surface_desc->sdp_surface;
	uint8_t fly = surf->src_data.type == DLA_MEM_HW;
	uint8_t x1_fetch = op->x1_op.enable & (op->x1_op.mode != SDP_OP_PER_LAYER);
	uint8_t x2_fetch = op->x2_op.enable & (op->x2_op.mode != SDP_OP_PER_LAYER);
	uint8_t y_fetch = op->y_op.enable & (op->y_op.mode != SDP_OP_PER_LAYER);

	group->is_rdma_needed = (!fly) || (x1_fetch || x2_fetch || y_fetch);
}
/**
 * processor_sdp_program() - program all SDP and SDP-RDMA registers for one op
 * @group: processor group carrying the SDP operation and surface descriptors.
 *
 * Resolves the DMA addresses for the source and destination cubes and for
 * the x1 (BS stage), x2 (BN stage) and y (EW stage) operand cubes, then:
 *   1. programs the RDMA side (feature mode, per-channel configs, strides),
 *   2. programs the optional LUT,
 *   3. programs the three datapath stages (BS, BN, EW),
 *   4. programs the output destination and final converter.
 *
 * Register write order follows the original NVDLA firmware sequence and
 * should not be rearranged.
 *
 * Return: 0 on success, or the error from dla_read_input_address().
 */
static int32_t
processor_sdp_program(struct dla_processor_group *group)
{
	int32_t ret = 0;
	/* Addresses default to all-ones so an unprogrammed channel is
	 * recognizable; only channels enabled below are resolved. */
	uint64_t src_addr = -1, x1_addr = -1, x2_addr = -1;
	uint64_t y_addr = -1, dst_addr = -1;
	uint32_t reg, high, low;
	uint8_t fly;
	uint32_t atom_size;
	struct dla_sdp_op *x1_op;
	struct dla_sdp_op *x2_op;
	struct dla_sdp_op *y_op;
	uint8_t x1_rdma_ena;
	uint8_t x2_rdma_ena;
	uint8_t y_rdma_ena;
	uint8_t out_dma_ena;
	struct dla_lut_param lut;
	struct dla_engine *engine = dla_get_engine();
	struct dla_sdp_op_desc *sdp_op;
	struct dla_sdp_surface_desc *sdp_surface;

	dla_trace("Enter: %s", __func__);

	atom_size = engine->config_data->atom_size;

	sdp_op = &group->operation_desc->sdp_op;
	sdp_surface = &group->surface_desc->sdp_surface;

	/* "Flying" mode: input streams directly from the conv core, so no
	 * source DMA read is performed. */
	fly = sdp_surface->src_data.type == DLA_MEM_HW;
	/* Output DMA is used unless the result is forwarded on-chip (PDP). */
	out_dma_ena = sdp_surface->dst_data.type != DLA_MEM_HW;

	x1_op = &sdp_op->x1_op;
	x2_op = &sdp_op->x2_op;
	y_op = &sdp_op->y_op;
	/* A channel fetches data only if its op is enabled and consumes
	 * operand data at all. */
	x1_rdma_ena = x1_op->enable && x1_op->type != SDP_OP_NONE;
	x2_rdma_ena = x2_op->enable && x2_op->type != SDP_OP_NONE;
	y_rdma_ena = y_op->enable && y_op->type != SDP_OP_NONE;

	/* load address */
	if (!fly) {
		ret = dla_read_input_address(&sdp_surface->src_data,
						&src_addr,
						group->op_desc->index,
						group->roi_index,
						1);
		if (ret)
			goto exit;
		CHECK_ALIGN(src_addr, atom_size);
	}

	if (out_dma_ena) {
		/* NOTE(review): the return value of dla_get_dma_cube_address()
		 * is ignored here and for the x1/x2/y lookups below — confirm
		 * the call cannot fail for descriptor-supplied addresses. */
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					sdp_surface->dst_data.address,
					sdp_surface->dst_data.offset,
					(void *)&dst_addr,
					DESTINATION_DMA);
		CHECK_ALIGN(dst_addr, atom_size);
	}

	/* Load the LUT descriptor now; it is written to hardware after the
	 * RDMA setup (see update_lut() below). */
	if (sdp_op->lut_index >= 0) {
		group->lut_index = sdp_op->lut_index;
		dla_read_lut(engine, sdp_op->lut_index, (void *)&lut);
		dla_debug_lut_params(&lut);
	}

	/* Per-layer operands come from registers, not memory, so they never
	 * need an RDMA fetch. */
	x1_rdma_ena &= (x1_op->mode != SDP_OP_PER_LAYER);
	x2_rdma_ena &= (x2_op->mode != SDP_OP_PER_LAYER);
	y_rdma_ena &= (y_op->mode != SDP_OP_PER_LAYER);

	if (x1_rdma_ena) {
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					sdp_surface->x1_data.address,
					sdp_surface->x1_data.offset,
					(void *)&x1_addr,
					DESTINATION_DMA);
		CHECK_ALIGN(x1_addr, atom_size);
	}

	if (x2_rdma_ena) {
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					sdp_surface->x2_data.address,
					sdp_surface->x2_data.offset,
					(void *)&x2_addr,
					DESTINATION_DMA);
		CHECK_ALIGN(x2_addr, atom_size);
	}

	if (y_rdma_ena) {
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					sdp_surface->y_data.address,
					sdp_surface->y_data.offset,
					(void *)&y_addr,
					DESTINATION_DMA);
		CHECK_ALIGN(y_addr, atom_size);
	}

	/* Pre-set the RDMA feature mode and the three channel configs before
	 * the full programming below.  NOTE(review): map_ena is defined
	 * earlier in this file; index 1 appears to encode the *_DISABLE
	 * field value used for an active channel — confirm against the
	 * map_ena definition. */
	reg = (map_fly[0] << SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, FLYING_MODE));
	sdp_rdma_reg_write(D_FEATURE_MODE_CFG, reg);
	reg = (map_ena[1] << SHIFT(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DISABLE));
	sdp_rdma_reg_write(D_BRDMA_CFG, reg);
	reg = (map_ena[1] << SHIFT(SDP_RDMA_D_NRDMA_CFG_0, NRDMA_DISABLE));
	sdp_rdma_reg_write(D_NRDMA_CFG, reg);
	reg = (map_ena[1] << SHIFT(SDP_RDMA_D_ERDMA_CFG_0, ERDMA_DISABLE));
	sdp_rdma_reg_write(D_ERDMA_CFG, reg);

	/* RDMA feature mode: flying mode, winograd, in/out/processing
	 * precisions and the batch count (register holds batch_num - 1). */
	reg = (map_fly[fly] <<
			SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, FLYING_MODE)) |
		(map_wg[sdp_op->conv_mode == CONV_MODE_WINOGRAD] <<
			SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, WINOGRAD)) |
		(map_precision[sdp_op->src_precision] <<
			SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION)) |
		(map_precision[sdp_op->dst_precision] <<
			SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, OUT_PRECISION)) |
		(map_proc_precision[sdp_op->dst_precision][sdp_op->src_precision] <<
			SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, PROC_PRECISION)) |
		((sdp_op->batch_num-1) <<
			SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, BATCH_NUMBER));
	sdp_rdma_reg_write(D_FEATURE_MODE_CFG, reg);

	if (group->is_rdma_needed) {
		/* Cube dimensions are programmed as value - 1 per the
		 * register definitions. */
		sdp_rdma_reg_write(D_DATA_CUBE_WIDTH,
					sdp_surface->src_data.width - 1);
		sdp_rdma_reg_write(D_DATA_CUBE_HEIGHT,
					sdp_surface->src_data.height - 1);
		sdp_rdma_reg_write(D_DATA_CUBE_CHANNEL,
					sdp_surface->src_data.channel - 1);

		/* config SDP source info */
		if (!fly) {
			/**
			 * if not on-the-fly, we have to config
			 * the source cube info
			 */
			high = HIGH32BITS(src_addr);
			low = LOW32BITS(src_addr);
			sdp_rdma_reg_write(D_SRC_BASE_ADDR_LOW, low);
			sdp_rdma_reg_write(D_SRC_BASE_ADDR_HIGH, high);
			sdp_rdma_reg_write(D_SRC_LINE_STRIDE,
					sdp_surface->src_data.line_stride);
			sdp_rdma_reg_write(D_SRC_SURFACE_STRIDE,
					sdp_surface->src_data.surf_stride);
			sdp_rdma_reg_write(D_SRC_DMA_CFG,
					map_ram_type[sdp_surface->src_data.type]);
		}

		/* config x1 source info (BRDMA channel, feeds BS stage) */
		reg = (map_ena[x1_rdma_ena] <<
				SHIFT(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DISABLE)) |
			(map_op_type[x1_op->type] <<
				SHIFT(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE)) |
			(map_element_size[x1_op->precision] <<
				SHIFT(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_SIZE)) |
			(map_op_mode[x1_op->mode] <<
				SHIFT(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_MODE)) |
			(map_ram_type[sdp_surface->x1_data.type] <<
				SHIFT(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_RAM_TYPE));
		sdp_rdma_reg_write(D_BRDMA_CFG, reg);

		if (x1_rdma_ena) {
			high = HIGH32BITS(x1_addr);
			low = LOW32BITS(x1_addr);
			sdp_rdma_reg_write(D_BS_BASE_ADDR_LOW,
					low);
			sdp_rdma_reg_write(D_BS_BASE_ADDR_HIGH,
					high);
			sdp_rdma_reg_write(D_BS_LINE_STRIDE,
					sdp_surface->x1_data.line_stride);
			sdp_rdma_reg_write(D_BS_SURFACE_STRIDE,
					sdp_surface->x1_data.surf_stride);
		}

		/* config x2 source info (NRDMA channel, feeds BN stage) */
		reg = (map_ena[x2_rdma_ena] <<
				SHIFT(SDP_RDMA_D_NRDMA_CFG_0, NRDMA_DISABLE)) |
			(map_op_type[x2_op->type] <<
				SHIFT(SDP_RDMA_D_NRDMA_CFG_0, NRDMA_DATA_USE)) |
			(map_element_size[x2_op->precision] <<
				SHIFT(SDP_RDMA_D_NRDMA_CFG_0, NRDMA_DATA_SIZE)) |
			(map_op_mode[x2_op->mode] <<
				SHIFT(SDP_RDMA_D_NRDMA_CFG_0, NRDMA_DATA_MODE)) |
			(map_ram_type[sdp_surface->x2_data.type] <<
				SHIFT(SDP_RDMA_D_NRDMA_CFG_0, NRDMA_RAM_TYPE));
		sdp_rdma_reg_write(D_NRDMA_CFG, reg);

		if (x2_rdma_ena) {
			high = HIGH32BITS(x2_addr);
			low = LOW32BITS(x2_addr);
			sdp_rdma_reg_write(D_BN_BASE_ADDR_LOW,
					low);
			sdp_rdma_reg_write(D_BN_BASE_ADDR_HIGH,
					high);
			sdp_rdma_reg_write(D_BN_LINE_STRIDE,
					sdp_surface->x2_data.line_stride);
			sdp_rdma_reg_write(D_BN_SURFACE_STRIDE,
					sdp_surface->x2_data.surf_stride);
		}

		/* config y source info (ERDMA channel, feeds EW stage) */
		reg = (map_ena[y_rdma_ena] <<
				SHIFT(SDP_RDMA_D_ERDMA_CFG_0, ERDMA_DISABLE)) |
			(map_op_type[y_op->type] <<
				SHIFT(SDP_RDMA_D_ERDMA_CFG_0, ERDMA_DATA_USE)) |
			(map_element_size[y_op->precision] <<
				SHIFT(SDP_RDMA_D_ERDMA_CFG_0, ERDMA_DATA_SIZE)) |
			(map_op_mode[y_op->mode] <<
				SHIFT(SDP_RDMA_D_ERDMA_CFG_0, ERDMA_DATA_MODE)) |
			(map_ram_type[sdp_surface->y_data.type] <<
				SHIFT(SDP_RDMA_D_ERDMA_CFG_0, ERDMA_RAM_TYPE));
		sdp_rdma_reg_write(D_ERDMA_CFG, reg);

		if (y_rdma_ena) {
			high = HIGH32BITS(y_addr);
			low = LOW32BITS(y_addr);
			sdp_rdma_reg_write(D_EW_BASE_ADDR_LOW,
					low);
			sdp_rdma_reg_write(D_EW_BASE_ADDR_HIGH,
					high);
			sdp_rdma_reg_write(D_EW_LINE_STRIDE,
					sdp_surface->y_data.line_stride);
			sdp_rdma_reg_write(D_EW_SURFACE_STRIDE,
					sdp_surface->y_data.surf_stride);
		}
	}

	/* Program the LUT after RDMA setup; lut was filled above when
	 * lut_index >= 0. */
	if (sdp_op->lut_index >= 0)
		update_lut(SDP_S_LUT_ACCESS_CFG_0, &lut,
				sdp_op->src_precision);

	sdp_reg_write(D_DATA_CUBE_WIDTH, sdp_surface->src_data.width - 1);
	sdp_reg_write(D_DATA_CUBE_HEIGHT, sdp_surface->src_data.height - 1);
	sdp_reg_write(D_DATA_CUBE_CHANNEL, sdp_surface->src_data.channel - 1);

	if (out_dma_ena) {
		high = HIGH32BITS(dst_addr);
		low = LOW32BITS(dst_addr);
		sdp_reg_write(D_DST_BASE_ADDR_HIGH,
				high);
		sdp_reg_write(D_DST_BASE_ADDR_LOW,
				low);
		sdp_reg_write(D_DST_LINE_STRIDE,
				sdp_surface->dst_data.line_stride);
		sdp_reg_write(D_DST_SURFACE_STRIDE,
				sdp_surface->dst_data.surf_stride);
	}

	/* Config BS module */
	/* ALU is used (not bypassed) for ADD/BOTH ops, MUL for MUL/BOTH;
	 * map_bypass is defined earlier in this file. */
	reg = (map_bypass[x1_op->enable] <<
			SHIFT(SDP_D_DP_BS_CFG_0, BS_BYPASS)) |
		(map_bypass[x1_op->type != SDP_OP_MUL &&
				x1_op->type != SDP_OP_NONE] <<
			SHIFT(SDP_D_DP_BS_CFG_0, BS_ALU_BYPASS)) |
		(map_alu_op[x1_op->alu_type] <<
			SHIFT(SDP_D_DP_BS_CFG_0, BS_ALU_ALGO)) |
		(map_bypass[x1_op->type != SDP_OP_ADD &&
				x1_op->type != SDP_OP_NONE] <<
			SHIFT(SDP_D_DP_BS_CFG_0, BS_MUL_BYPASS)) |
		(map_prelu[x1_op->act == ACTIVATION_PRELU]
			<< SHIFT(SDP_D_DP_BS_CFG_0, BS_MUL_PRELU)) |
		(map_bypass[x1_op->act == ACTIVATION_RELU] <<
			SHIFT(SDP_D_DP_BS_CFG_0, BS_RELU_BYPASS));
	sdp_reg_write(D_DP_BS_CFG, reg);

	if (x1_op->enable) {
		if (x1_op->type == SDP_OP_ADD ||
				x1_op->type == SDP_OP_BOTH) {
			/* Per-layer operands come from the *_SRC_VALUE
			 * registers (REG source), otherwise from memory. */
			reg = (map_alu_src[x1_op->mode == SDP_OP_PER_LAYER] <<
					SHIFT(SDP_D_DP_BS_ALU_CFG_0,
							BS_ALU_SRC)) |
				(x1_op->shift_value <<
					SHIFT(SDP_D_DP_BS_ALU_CFG_0,
							BS_ALU_SHIFT_VALUE));
			sdp_reg_write(D_DP_BS_ALU_CFG, reg);
		}

		if (x1_op->mode == SDP_OP_PER_LAYER) {
			sdp_reg_write(D_DP_BS_ALU_SRC_VALUE,
					x1_op->alu_operand);
			sdp_reg_write(D_DP_BS_MUL_SRC_VALUE,
					x1_op->mul_operand);
		}

		/**
		 * MUL truncate will take effect no matter
		 * MUL is bypassed or not
		 */
		reg = (map_alu_src[x1_op->mode == SDP_OP_PER_LAYER] <<
				SHIFT(SDP_D_DP_BS_MUL_CFG_0,
						BS_MUL_SRC)) |
			(x1_op->truncate <<
				SHIFT(SDP_D_DP_BS_MUL_CFG_0,
						BS_MUL_SHIFT_VALUE));
		sdp_reg_write(D_DP_BS_MUL_CFG, reg);
	}

	/* Config BN module (same layout as BS, driven by x2_op) */
	reg = (map_bypass[x2_op->enable] <<
			SHIFT(SDP_D_DP_BN_CFG_0, BN_BYPASS)) |
		(map_bypass[x2_op->type != SDP_OP_MUL &&
				x2_op->type != SDP_OP_NONE] <<
			SHIFT(SDP_D_DP_BN_CFG_0, BN_ALU_BYPASS)) |
		(map_alu_op[x2_op->alu_type] <<
			SHIFT(SDP_D_DP_BN_CFG_0, BN_ALU_ALGO)) |
		(map_bypass[x2_op->type != SDP_OP_ADD &&
				x2_op->type != SDP_OP_NONE] <<
			SHIFT(SDP_D_DP_BN_CFG_0, BN_MUL_BYPASS)) |
		(map_prelu[x2_op->act == ACTIVATION_PRELU]
			<< SHIFT(SDP_D_DP_BN_CFG_0, BN_MUL_PRELU)) |
		(map_bypass[x2_op->act == ACTIVATION_RELU]
			<< SHIFT(SDP_D_DP_BN_CFG_0, BN_RELU_BYPASS));
	sdp_reg_write(D_DP_BN_CFG, reg);

	if (x2_op->enable) {
		if (x2_op->type == SDP_OP_ADD ||
				x2_op->type == SDP_OP_BOTH) {
			reg = (map_alu_src[x2_op->mode == SDP_OP_PER_LAYER] <<
					SHIFT(SDP_D_DP_BN_ALU_CFG_0,
							BN_ALU_SRC)) |
				(x2_op->shift_value <<
					SHIFT(SDP_D_DP_BN_ALU_CFG_0,
							BN_ALU_SHIFT_VALUE));
			sdp_reg_write(D_DP_BN_ALU_CFG, reg);
		}

		if (x2_op->mode == SDP_OP_PER_LAYER) {
			sdp_reg_write(D_DP_BN_ALU_SRC_VALUE,
					x2_op->alu_operand);
			sdp_reg_write(D_DP_BN_MUL_SRC_VALUE,
					x2_op->mul_operand);
		}

		/* MUL truncate applies regardless of MUL bypass (as in BS). */
		reg = (map_alu_src[x2_op->mode == SDP_OP_PER_LAYER] <<
				SHIFT(SDP_D_DP_BN_MUL_CFG_0,
						BN_MUL_SRC)) |
			(x2_op->truncate <<
				SHIFT(SDP_D_DP_BN_MUL_CFG_0,
						BN_MUL_SHIFT_VALUE));
		sdp_reg_write(D_DP_BN_MUL_CFG, reg);
	}

	/* Config EW module (y_op); unlike BS/BN the EW stage has
	 * per-operand converters and an optional LUT activation. */
	reg = (map_bypass[y_op->enable] <<
			SHIFT(SDP_D_DP_EW_CFG_0, EW_BYPASS)) |
		(map_bypass[y_op->type != SDP_OP_MUL &&
				y_op->type != SDP_OP_NONE] <<
			SHIFT(SDP_D_DP_EW_CFG_0, EW_ALU_BYPASS)) |
		(map_alu_op[y_op->alu_type] <<
			SHIFT(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO)) |
		(map_bypass[y_op->type != SDP_OP_ADD &&
				y_op->type != SDP_OP_NONE] <<
			SHIFT(SDP_D_DP_EW_CFG_0, EW_MUL_BYPASS)) |
		((map_prelu[y_op->act == ACTIVATION_PRELU]) <<
			SHIFT(SDP_D_DP_EW_CFG_0, EW_MUL_PRELU)) |
		(map_bypass[y_op->act == ACTIVATION_LUT] <<
			SHIFT(SDP_D_DP_EW_CFG_0, EW_LUT_BYPASS));
	sdp_reg_write(D_DP_EW_CFG, reg);

	if (y_op->enable) {
		if (y_op->type == SDP_OP_ADD || y_op->type == SDP_OP_BOTH) {
			reg = (map_alu_src[y_op->mode == SDP_OP_PER_LAYER] <<
					SHIFT(SDP_D_DP_EW_ALU_CFG_0,
							EW_ALU_SRC)) |
				(map_bypass[y_op->cvt.alu_cvt.enable] <<
					SHIFT(SDP_D_DP_EW_ALU_CFG_0,
							EW_ALU_CVT_BYPASS));
			sdp_reg_write(D_DP_EW_ALU_CFG, reg);

			/* Per-layer: operand via register; otherwise program
			 * the ALU input converter (offset/scale/truncate). */
			if (y_op->mode == SDP_OP_PER_LAYER) {
				sdp_reg_write(D_DP_EW_ALU_SRC_VALUE,
						y_op->alu_operand);
			} else {
				sdp_reg_write(D_DP_EW_ALU_CVT_OFFSET_VALUE,
						y_op->cvt.alu_cvt.offset);
				sdp_reg_write(D_DP_EW_ALU_CVT_SCALE_VALUE,
						y_op->cvt.alu_cvt.scale);
				sdp_reg_write(D_DP_EW_ALU_CVT_TRUNCATE_VALUE,
						y_op->cvt.alu_cvt.truncate);
			}
		}

		if (y_op->type == SDP_OP_MUL || y_op->type == SDP_OP_BOTH) {
			reg = (map_alu_src[y_op->mode == SDP_OP_PER_LAYER] <<
					SHIFT(SDP_D_DP_EW_MUL_CFG_0,
							EW_MUL_SRC)) |
				(map_bypass[y_op->cvt.mul_cvt.enable] <<
					SHIFT(SDP_D_DP_EW_MUL_CFG_0,
							EW_MUL_CVT_BYPASS));
			sdp_reg_write(D_DP_EW_MUL_CFG, reg);

			if (y_op->mode == SDP_OP_PER_LAYER) {
				sdp_reg_write(D_DP_EW_MUL_SRC_VALUE,
						y_op->mul_operand);
			} else {
				sdp_reg_write(D_DP_EW_MUL_CVT_OFFSET_VALUE,
						y_op->cvt.mul_cvt.offset);
				sdp_reg_write(D_DP_EW_MUL_CVT_SCALE_VALUE,
						y_op->cvt.mul_cvt.scale);
				sdp_reg_write(D_DP_EW_MUL_CVT_TRUNCATE_VALUE,
						y_op->cvt.mul_cvt.truncate);
			}
		}

		sdp_reg_write(D_DP_EW_TRUNCATE_VALUE, y_op->truncate);
	}

	/* SDP core feature mode: flying input, output destination (MEM or
	 * PDP), winograd, and batch count (batch_num - 1). */
	reg = (map_fly[sdp_surface->src_data.type == DLA_MEM_HW] <<
			SHIFT(SDP_D_FEATURE_MODE_CFG_0, FLYING_MODE)) |
		(map_dst[sdp_surface->dst_data.type == DLA_MEM_HW] <<
			SHIFT(SDP_D_FEATURE_MODE_CFG_0, OUTPUT_DST)) |
		(map_wg[sdp_op->conv_mode == CONV_MODE_WINOGRAD] <<
			SHIFT(SDP_D_FEATURE_MODE_CFG_0, WINOGRAD)) |
		((sdp_op->batch_num - 1) <<
			SHIFT(SDP_D_FEATURE_MODE_CFG_0, BATCH_NUMBER));
	sdp_reg_write(D_FEATURE_MODE_CFG, reg);

	sdp_reg_write(D_DST_DMA_CFG,
			map_ram_type[sdp_surface->dst_data.type]);
	if (sdp_op->batch_num > 1)
		sdp_reg_write(D_DST_BATCH_STRIDE, sdp_op->batch_stride);

	reg =
	(map_proc_precision[sdp_op->dst_precision][sdp_op->src_precision] <<
			SHIFT(SDP_D_DATA_FORMAT_0, PROC_PRECISION)) |
		(map_precision[sdp_op->dst_precision] <<
			SHIFT(SDP_D_DATA_FORMAT_0, OUT_PRECISION));
	sdp_reg_write(D_DATA_FORMAT, reg);

	/* Final output converter (offset/scale/shift). */
	sdp_reg_write(D_CVT_OFFSET, sdp_op->out_cvt.offset);
	sdp_reg_write(D_CVT_SCALE, sdp_op->out_cvt.scale);
	sdp_reg_write(D_CVT_SHIFT, sdp_op->out_cvt.truncate);

exit:
	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}
/*
 * Decide whether this SDP group can be programmed right now.
 *
 * A single LUT is shared between the two SDP groups, and LUT write
 * access is locked while the SDP sub-engine is active.  Programming is
 * therefore deferred when the other group holds (or will hold) a
 * different LUT and is still busy.
 *
 * Returns 1 when programming may proceed, 0 when it must wait.
 */
int
dla_sdp_is_ready(struct dla_processor *processor,
		struct dla_processor_group *group)
{
	struct dla_sdp_op_desc *sdp_op = &group->operation_desc->sdp_op;
	struct dla_processor_group *peer = &processor->groups[!group->id];

	/* No LUT required: nothing to arbitrate. */
	if (sdp_op->lut_index == -1)
		return 1;

	/* Both groups use the same LUT: whichever group programmed it,
	 * the table is (or will be) correct. */
	if (peer->lut_index == sdp_op->lut_index)
		return 1;

	/* The other group holds a different LUT: do not overwrite it. */
	if (peer->lut_index != -1)
		return 0;

	/* Peer has no LUT programmed: safe only while the peer is idle,
	 * since LUT writes are locked during execution. */
	return !peer->active;
}
/* Dump the SDP surface and operation descriptors for this group. */
void
dla_sdp_dump_config(struct dla_processor_group *group)
{
	struct dla_sdp_surface_desc *surf = &group->surface_desc->sdp_surface;
	struct dla_sdp_op_desc *op = &group->operation_desc->sdp_op;

	dla_debug_sdp_surface_desc(surf, group->roi_index);
	dla_debug_sdp_op_desc(op, group->roi_index);
}
/*
 * Top-level SDP programming entry point.
 *
 * Unmasks the SDP-done interrupts for both groups, then programs all
 * SDP/SDP-RDMA registers via processor_sdp_program().
 *
 * Fix: the original body had `if (ret) goto exit;` jumping to a label
 * placed on the very next statement — dead control flow with no effect.
 * The redundant goto and label are removed; behavior is unchanged.
 *
 * Return: 0 on success, error code from processor_sdp_program().
 */
int
dla_sdp_program(struct dla_processor_group *group)
{
	int32_t ret;

	dla_trace("Enter: %s", __func__);
	dla_enable_intr(MASK(GLB_S_INTR_MASK_0, SDP_DONE_MASK1) |
			MASK(GLB_S_INTR_MASK_0, SDP_DONE_MASK0));

	ret = processor_sdp_program(group);

	dla_trace("Exit: %s", __func__);
	RETURN(ret);
}