From f25d0a68beb868147571e395de52ced0c55f6cd4 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Mon, 10 Dec 2018 17:34:54 +0100
Subject: [PATCH 01/26] drm/tegra: Refactor CEC support

Most of the CEC support code already lives in the "output" library code.
Move registration and unregistration to the library code as well to make
use of the same code with HDMI on Tegra210 and later via the SOR.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/drm.h    |  2 +-
 drivers/gpu/drm/tegra/hdmi.c   |  9 ---------
 drivers/gpu/drm/tegra/output.c | 11 +++++++++--
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h
index 1012335bb489..f1763b4d5b5f 100644
--- a/drivers/gpu/drm/tegra/drm.h
+++ b/drivers/gpu/drm/tegra/drm.h
@@ -124,7 +124,7 @@ struct tegra_output {
 	struct drm_panel *panel;
 	struct i2c_adapter *ddc;
 	const struct edid *edid;
-	struct cec_notifier *notifier;
+	struct cec_notifier *cec;
 	unsigned int hpd_irq;
 	int hpd_gpio;
 	enum of_gpio_flags hpd_gpio_flags;
diff --git a/drivers/gpu/drm/tegra/hdmi.c b/drivers/gpu/drm/tegra/hdmi.c
index 0082468f703c..d19973945614 100644
--- a/drivers/gpu/drm/tegra/hdmi.c
+++ b/drivers/gpu/drm/tegra/hdmi.c
@@ -22,8 +22,6 @@
 
 #include <sound/hda_verbs.h>
 
-#include <media/cec-notifier.h>
-
 #include "hdmi.h"
 #include "drm.h"
 #include "dc.h"
@@ -1709,10 +1707,6 @@ static int tegra_hdmi_probe(struct platform_device *pdev)
 		return PTR_ERR(hdmi->vdd);
 	}
 
-	hdmi->output.notifier = cec_notifier_get(&pdev->dev);
-	if (hdmi->output.notifier == NULL)
-		return -ENOMEM;
-
 	hdmi->output.dev = &pdev->dev;
 
 	err = tegra_output_probe(&hdmi->output);
@@ -1771,9 +1765,6 @@ static int tegra_hdmi_remove(struct platform_device *pdev)
 
 	tegra_output_remove(&hdmi->output);
 
-	if (hdmi->output.notifier)
-		cec_notifier_put(hdmi->output.notifier);
-
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/tegra/output.c b/drivers/gpu/drm/tegra/output.c
index c662efc7e413..9c2b9dad55c3 100644
--- a/drivers/gpu/drm/tegra/output.c
+++ b/drivers/gpu/drm/tegra/output.c
@@ -36,7 +36,7 @@ int tegra_output_connector_get_modes(struct drm_connector *connector)
 	else if (output->ddc)
 		edid = drm_get_edid(connector, output->ddc);
 
-	cec_notifier_set_phys_addr_from_edid(output->notifier, edid);
+	cec_notifier_set_phys_addr_from_edid(output->cec, edid);
 	drm_connector_update_edid_property(connector, edid);
 
 	if (edid) {
@@ -73,7 +73,7 @@ tegra_output_connector_detect(struct drm_connector *connector, bool force)
 	}
 
 	if (status != connector_status_connected)
-		cec_notifier_phys_addr_invalidate(output->notifier);
+		cec_notifier_phys_addr_invalidate(output->cec);
 
 	return status;
 }
@@ -174,11 +174,18 @@ int tegra_output_probe(struct tegra_output *output)
 		disable_irq(output->hpd_irq);
 	}
 
+	output->cec = cec_notifier_get(output->dev);
+	if (!output->cec)
+		return -ENOMEM;
+
 	return 0;
 }
 
 void tegra_output_remove(struct tegra_output *output)
 {
+	if (output->cec)
+		cec_notifier_put(output->cec);
+
 	if (gpio_is_valid(output->hpd_gpio)) {
 		free_irq(output->hpd_irq, output);
 		gpio_free(output->hpd_gpio);

From cd54fb96e568cf95bad6deb8b070e05643e80935 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 3 Jan 2019 15:23:15 +0100
Subject: [PATCH 02/26] drm/tegra: sor: Parse more data from HDA format

The HDA format data passed to the SOR from the HDA codec contains more
information than just the rate and number of channels. Parse all the
fields and store them in an internal structure for subsequent use.

While at it, also fix an off-by-one error in the number of channels.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/sor.c | 69 ++++++++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/tegra/sor.c b/drivers/gpu/drm/tegra/sor.c
index ef8692b7075a..7839223aa040 100644
--- a/drivers/gpu/drm/tegra/sor.c
+++ b/drivers/gpu/drm/tegra/sor.c
@@ -393,6 +393,13 @@ struct tegra_sor_ops {
 	int (*remove)(struct tegra_sor *sor);
 };
 
+struct tegra_sor_audio {
+	unsigned int sample_rate;
+	unsigned int channels;
+	unsigned int bits;
+	bool pcm;
+};
+
 struct tegra_sor {
 	struct host1x_client client;
 	struct tegra_output output;
@@ -429,10 +436,7 @@ struct tegra_sor {
 	struct delayed_work scdc;
 	bool scdc_enabled;
 
-	struct {
-		unsigned int sample_rate;
-		unsigned int channels;
-	} audio;
+	struct tegra_sor_audio audio;
 };
 
 struct tegra_sor_state {
@@ -3195,22 +3199,58 @@ static int tegra_sor_parse_dt(struct tegra_sor *sor)
 	return 0;
 }
 
-static void tegra_hda_parse_format(unsigned int format, unsigned int *rate,
-				   unsigned int *channels)
+static void tegra_hda_parse_format(unsigned int format,
+				   struct tegra_sor_audio *audio)
 {
-	unsigned int mul, div;
+	unsigned int mul, div, bits, channels;
+
+	if (format & AC_FMT_TYPE_NON_PCM)
+		audio->pcm = false;
+	else
+		audio->pcm = true;
 
 	if (format & AC_FMT_BASE_44K)
-		*rate = 44100;
+		audio->sample_rate = 44100;
 	else
-		*rate = 48000;
+		audio->sample_rate = 48000;
 
 	mul = (format & AC_FMT_MULT_MASK) >> AC_FMT_MULT_SHIFT;
 	div = (format & AC_FMT_DIV_MASK) >> AC_FMT_DIV_SHIFT;
 
-	*rate = *rate * (mul + 1) / (div + 1);
+	audio->sample_rate = audio->sample_rate * (mul + 1) / (div + 1);
 
-	*channels = (format & AC_FMT_CHAN_MASK) >> AC_FMT_CHAN_SHIFT;
+	switch (format & AC_FMT_BITS_MASK) {
+	case AC_FMT_BITS_8:
+		audio->bits = 8;
+		break;
+
+	case AC_FMT_BITS_16:
+		audio->bits = 16;
+		break;
+
+	case AC_FMT_BITS_20:
+		audio->bits = 20;
+		break;
+
+	case AC_FMT_BITS_24:
+		audio->bits = 24;
+		break;
+
+	case AC_FMT_BITS_32:
+		audio->bits = 32;
+		break;
+
+	default:
+		bits = (format & AC_FMT_BITS_MASK) >> AC_FMT_BITS_SHIFT;
+		WARN(1, "invalid number of bits: %#x\n", bits);
+		audio->bits = 8;
+		break;
+	}
+
+	channels = (format & AC_FMT_CHAN_MASK) >> AC_FMT_CHAN_SHIFT;
+
+	/* channels are encoded as n - 1 */
+	audio->channels = channels + 1;
 }
 
 static irqreturn_t tegra_sor_irq(int irq, void *data)
@@ -3225,14 +3265,11 @@ static irqreturn_t tegra_sor_irq(int irq, void *data)
 		value = tegra_sor_readl(sor, SOR_AUDIO_HDA_CODEC_SCRATCH0);
 
 		if (value & SOR_AUDIO_HDA_CODEC_SCRATCH0_VALID) {
-			unsigned int format, sample_rate, channels;
+			unsigned int format;
 
 			format = value & SOR_AUDIO_HDA_CODEC_SCRATCH0_FMT_MASK;
 
-			tegra_hda_parse_format(format, &sample_rate, &channels);
-
-			sor->audio.sample_rate = sample_rate;
-			sor->audio.channels = channels;
+			tegra_hda_parse_format(format, &sor->audio);
 
 			tegra_sor_hdmi_audio_enable(sor);
 		} else {

From fad7b80643100d16b157b4367a6d2c7f8b19f812 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 3 Jan 2019 15:23:16 +0100
Subject: [PATCH 03/26] drm/tegra: hda: Extract HDA format parsing code

This code can be reused for HDMI, so extract it into a reusable
function.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/Makefile |  1 +
 drivers/gpu/drm/tegra/hda.c    | 63 ++++++++++++++++++++++++++
 drivers/gpu/drm/tegra/hda.h    | 20 +++++++++
 drivers/gpu/drm/tegra/sor.c    | 80 ++++------------------------------
 4 files changed, 93 insertions(+), 71 deletions(-)
 create mode 100644 drivers/gpu/drm/tegra/hda.c
 create mode 100644 drivers/gpu/drm/tegra/hda.h

diff --git a/drivers/gpu/drm/tegra/Makefile b/drivers/gpu/drm/tegra/Makefile
index 2e0d6213f6bc..33c463e8d49f 100644
--- a/drivers/gpu/drm/tegra/Makefile
+++ b/drivers/gpu/drm/tegra/Makefile
@@ -10,6 +10,7 @@ tegra-drm-y := \
 	dc.o \
 	output.o \
 	rgb.o \
+	hda.o \
 	hdmi.o \
 	mipi-phy.o \
 	dsi.o \
diff --git a/drivers/gpu/drm/tegra/hda.c b/drivers/gpu/drm/tegra/hda.c
new file mode 100644
index 000000000000..94245a18a043
--- /dev/null
+++ b/drivers/gpu/drm/tegra/hda.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright (C) 2019 NVIDIA Corporation
+ */
+
+#include <linux/bug.h>
+
+#include <sound/hda_verbs.h>
+
+#include "hda.h"
+
+void tegra_hda_parse_format(unsigned int format, struct tegra_hda_format *fmt)
+{
+	unsigned int mul, div, bits, channels;
+
+	if (format & AC_FMT_TYPE_NON_PCM)
+		fmt->pcm = false;
+	else
+		fmt->pcm = true;
+
+	if (format & AC_FMT_BASE_44K)
+		fmt->sample_rate = 44100;
+	else
+		fmt->sample_rate = 48000;
+
+	mul = (format & AC_FMT_MULT_MASK) >> AC_FMT_MULT_SHIFT;
+	div = (format & AC_FMT_DIV_MASK) >> AC_FMT_DIV_SHIFT;
+
+	fmt->sample_rate *= (mul + 1) / (div + 1);
+
+	switch (format & AC_FMT_BITS_MASK) {
+	case AC_FMT_BITS_8:
+		fmt->bits = 8;
+		break;
+
+	case AC_FMT_BITS_16:
+		fmt->bits = 16;
+		break;
+
+	case AC_FMT_BITS_20:
+		fmt->bits = 20;
+		break;
+
+	case AC_FMT_BITS_24:
+		fmt->bits = 24;
+		break;
+
+	case AC_FMT_BITS_32:
+		fmt->bits = 32;
+		break;
+
+	default:
+		bits = (format & AC_FMT_BITS_MASK) >> AC_FMT_BITS_SHIFT;
+		WARN(1, "invalid number of bits: %#x\n", bits);
+		fmt->bits = 8;
+		break;
+	}
+
+	channels = (format & AC_FMT_CHAN_MASK) >> AC_FMT_CHAN_SHIFT;
+
+	/* channels are encoded as n - 1 */
+	fmt->channels = channels + 1;
+}
diff --git a/drivers/gpu/drm/tegra/hda.h b/drivers/gpu/drm/tegra/hda.h
new file mode 100644
index 000000000000..77269955a4f2
--- /dev/null
+++ b/drivers/gpu/drm/tegra/hda.h
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright (C) 2019 NVIDIA Corporation
+ */
+
+#ifndef DRM_TEGRA_HDA_H
+#define DRM_TEGRA_HDA_H 1
+
+#include <linux/types.h>
+
+struct tegra_hda_format {
+	unsigned int sample_rate;
+	unsigned int channels;
+	unsigned int bits;
+	bool pcm;
+};
+
+void tegra_hda_parse_format(unsigned int format, struct tegra_hda_format *fmt);
+
+#endif
diff --git a/drivers/gpu/drm/tegra/sor.c b/drivers/gpu/drm/tegra/sor.c
index 7839223aa040..79602debf231 100644
--- a/drivers/gpu/drm/tegra/sor.c
+++ b/drivers/gpu/drm/tegra/sor.c
@@ -19,8 +19,6 @@
 
 #include <soc/tegra/pmc.h>
 
-#include <sound/hda_verbs.h>
-
 #include <drm/drm_atomic_helper.h>
 #include <drm/drm_dp_helper.h>
 #include <drm/drm_panel.h>
@@ -28,6 +26,7 @@
 
 #include "dc.h"
 #include "drm.h"
+#include "hda.h"
 #include "sor.h"
 #include "trace.h"
 
@@ -393,13 +392,6 @@ struct tegra_sor_ops {
 	int (*remove)(struct tegra_sor *sor);
 };
 
-struct tegra_sor_audio {
-	unsigned int sample_rate;
-	unsigned int channels;
-	unsigned int bits;
-	bool pcm;
-};
-
 struct tegra_sor {
 	struct host1x_client client;
 	struct tegra_output output;
@@ -436,7 +428,7 @@ struct tegra_sor {
 	struct delayed_work scdc;
 	bool scdc_enabled;
 
-	struct tegra_sor_audio audio;
+	struct tegra_hda_format format;
 };
 
 struct tegra_sor_state {
@@ -2189,7 +2181,7 @@ static int tegra_sor_hdmi_enable_audio_infoframe(struct tegra_sor *sor)
 		return err;
 	}
 
-	frame.channels = sor->audio.channels;
+	frame.channels = sor->format.channels;
 
 	err = hdmi_audio_infoframe_pack(&frame, buffer, sizeof(buffer));
 	if (err < 0) {
@@ -2218,7 +2210,7 @@ static void tegra_sor_hdmi_audio_enable(struct tegra_sor *sor)
 	value |= SOR_AUDIO_CNTRL_SOURCE_SELECT(SOURCE_SELECT_HDA);
 
 	/* inject null samples */
-	if (sor->audio.channels != 2)
+	if (sor->format.channels != 2)
 		value &= ~SOR_AUDIO_CNTRL_INJECT_NULLSMPL;
 	else
 		value |= SOR_AUDIO_CNTRL_INJECT_NULLSMPL;
@@ -2249,7 +2241,7 @@ static void tegra_sor_hdmi_audio_enable(struct tegra_sor *sor)
 	value = SOR_HDMI_AUDIO_N_RESET | SOR_HDMI_AUDIO_N_LOOKUP;
 	tegra_sor_writel(sor, value, SOR_HDMI_AUDIO_N);
 
-	value = (24000 * 4096) / (128 * sor->audio.sample_rate / 1000);
+	value = (24000 * 4096) / (128 * sor->format.sample_rate / 1000);
 	tegra_sor_writel(sor, value, SOR_AUDIO_AVAL_0320);
 	tegra_sor_writel(sor, 4096, SOR_AUDIO_NVAL_0320);
 
@@ -2262,15 +2254,15 @@ static void tegra_sor_hdmi_audio_enable(struct tegra_sor *sor)
 	tegra_sor_writel(sor, 20000, SOR_AUDIO_AVAL_1764);
 	tegra_sor_writel(sor, 18816, SOR_AUDIO_NVAL_1764);
 
-	value = (24000 * 6144) / (128 * sor->audio.sample_rate / 1000);
+	value = (24000 * 6144) / (128 * sor->format.sample_rate / 1000);
 	tegra_sor_writel(sor, value, SOR_AUDIO_AVAL_0480);
 	tegra_sor_writel(sor, 6144, SOR_AUDIO_NVAL_0480);
 
-	value = (24000 * 12288) / (128 * sor->audio.sample_rate / 1000);
+	value = (24000 * 12288) / (128 * sor->format.sample_rate / 1000);
 	tegra_sor_writel(sor, value, SOR_AUDIO_AVAL_0960);
 	tegra_sor_writel(sor, 12288, SOR_AUDIO_NVAL_0960);
 
-	value = (24000 * 24576) / (128 * sor->audio.sample_rate / 1000);
+	value = (24000 * 24576) / (128 * sor->format.sample_rate / 1000);
 	tegra_sor_writel(sor, value, SOR_AUDIO_AVAL_1920);
 	tegra_sor_writel(sor, 24576, SOR_AUDIO_NVAL_1920);
 
@@ -3199,60 +3191,6 @@ static int tegra_sor_parse_dt(struct tegra_sor *sor)
 	return 0;
 }
 
-static void tegra_hda_parse_format(unsigned int format,
-				   struct tegra_sor_audio *audio)
-{
-	unsigned int mul, div, bits, channels;
-
-	if (format & AC_FMT_TYPE_NON_PCM)
-		audio->pcm = false;
-	else
-		audio->pcm = true;
-
-	if (format & AC_FMT_BASE_44K)
-		audio->sample_rate = 44100;
-	else
-		audio->sample_rate = 48000;
-
-	mul = (format & AC_FMT_MULT_MASK) >> AC_FMT_MULT_SHIFT;
-	div = (format & AC_FMT_DIV_MASK) >> AC_FMT_DIV_SHIFT;
-
-	audio->sample_rate = audio->sample_rate * (mul + 1) / (div + 1);
-
-	switch (format & AC_FMT_BITS_MASK) {
-	case AC_FMT_BITS_8:
-		audio->bits = 8;
-		break;
-
-	case AC_FMT_BITS_16:
-		audio->bits = 16;
-		break;
-
-	case AC_FMT_BITS_20:
-		audio->bits = 20;
-		break;
-
-	case AC_FMT_BITS_24:
-		audio->bits = 24;
-		break;
-
-	case AC_FMT_BITS_32:
-		audio->bits = 32;
-		break;
-
-	default:
-		bits = (format & AC_FMT_BITS_MASK) >> AC_FMT_BITS_SHIFT;
-		WARN(1, "invalid number of bits: %#x\n", bits);
-		audio->bits = 8;
-		break;
-	}
-
-	channels = (format & AC_FMT_CHAN_MASK) >> AC_FMT_CHAN_SHIFT;
-
-	/* channels are encoded as n - 1 */
-	audio->channels = channels + 1;
-}
-
 static irqreturn_t tegra_sor_irq(int irq, void *data)
 {
 	struct tegra_sor *sor = data;
@@ -3269,7 +3207,7 @@ static irqreturn_t tegra_sor_irq(int irq, void *data)
 
 			format = value & SOR_AUDIO_HDA_CODEC_SCRATCH0_FMT_MASK;
 
-			tegra_hda_parse_format(format, &sor->audio);
+			tegra_hda_parse_format(format, &sor->format);
 
 			tegra_sor_hdmi_audio_enable(sor);
 		} else {

From e3c702dcc7b047a5187a5ad132bfdf3850cc33e1 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 3 Jan 2019 15:23:17 +0100
Subject: [PATCH 04/26] drm/tegra: hdmi: Reuse common HDA format parser

Eliminate some duplicate code by reusing the HDA format parser already
used by the SOR.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/hdmi.c | 43 +++++++-----------------------------
 1 file changed, 8 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/tegra/hdmi.c b/drivers/gpu/drm/tegra/hdmi.c
index d19973945614..8fdff801c6f2 100644
--- a/drivers/gpu/drm/tegra/hdmi.c
+++ b/drivers/gpu/drm/tegra/hdmi.c
@@ -20,8 +20,7 @@
 #include <drm/drm_crtc.h>
 #include <drm/drm_crtc_helper.h>
 
-#include <sound/hda_verbs.h>
-
+#include "hda.h"
 #include "hdmi.h"
 #include "drm.h"
 #include "dc.h"
@@ -69,8 +68,7 @@ struct tegra_hdmi {
 	const struct tegra_hdmi_config *config;
 
 	unsigned int audio_source;
-	unsigned int audio_sample_rate;
-	unsigned int audio_channels;
+	struct tegra_hda_format format;
 
 	unsigned int pixel_clock;
 	bool stereo;
@@ -508,7 +506,7 @@ static void tegra_hdmi_write_aval(struct tegra_hdmi *hdmi, u32 value)
 	unsigned int i;
 
 	for (i = 0; i < ARRAY_SIZE(regs); i++) {
-		if (regs[i].sample_rate == hdmi->audio_sample_rate) {
+		if (regs[i].sample_rate == hdmi->format.sample_rate) {
 			tegra_hdmi_writel(hdmi, value, regs[i].offset);
 			break;
 		}
@@ -562,7 +560,7 @@ static int tegra_hdmi_setup_audio(struct tegra_hdmi *hdmi)
 		 * play back system startup sounds early. It is possibly not
 		 * needed on Linux at all.
 		 */
-		if (hdmi->audio_channels == 2)
+		if (hdmi->format.channels == 2)
 			value = SOR_AUDIO_CNTRL0_INJECT_NULLSMPL;
 		else
 			value = 0;
@@ -593,12 +591,12 @@ static int tegra_hdmi_setup_audio(struct tegra_hdmi *hdmi)
 		tegra_hdmi_writel(hdmi, value, HDMI_NV_PDISP_SOR_AUDIO_SPARE0);
 	}
 
-	config = tegra_hdmi_get_audio_config(hdmi->audio_sample_rate,
+	config = tegra_hdmi_get_audio_config(hdmi->format.sample_rate,
 					     hdmi->pixel_clock);
 	if (!config) {
 		dev_err(hdmi->dev,
 			"cannot set audio to %u Hz at %u Hz pixel clock\n",
-			hdmi->audio_sample_rate, hdmi->pixel_clock);
+			hdmi->format.sample_rate, hdmi->pixel_clock);
 		return -EINVAL;
 	}
 
@@ -785,7 +783,7 @@ static void tegra_hdmi_setup_audio_infoframe(struct tegra_hdmi *hdmi)
 		return;
 	}
 
-	frame.channels = hdmi->audio_channels;
+	frame.channels = hdmi->format.channels;
 
 	err = hdmi_audio_infoframe_pack(&frame, buffer, sizeof(buffer));
 	if (err < 0) {
@@ -1587,24 +1585,6 @@ static const struct of_device_id tegra_hdmi_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, tegra_hdmi_of_match);
 
-static void hda_format_parse(unsigned int format, unsigned int *rate,
-			     unsigned int *channels)
-{
-	unsigned int mul, div;
-
-	if (format & AC_FMT_BASE_44K)
-		*rate = 44100;
-	else
-		*rate = 48000;
-
-	mul = (format & AC_FMT_MULT_MASK) >> AC_FMT_MULT_SHIFT;
-	div = (format & AC_FMT_DIV_MASK) >> AC_FMT_DIV_SHIFT;
-
-	*rate = *rate * (mul + 1) / (div + 1);
-
-	*channels = (format & AC_FMT_CHAN_MASK) >> AC_FMT_CHAN_SHIFT;
-}
-
 static irqreturn_t tegra_hdmi_irq(int irq, void *data)
 {
 	struct tegra_hdmi *hdmi = data;
@@ -1621,14 +1601,9 @@ static irqreturn_t tegra_hdmi_irq(int irq, void *data)
 		value = tegra_hdmi_readl(hdmi, HDMI_NV_PDISP_SOR_AUDIO_HDA_CODEC_SCRATCH0);
 
 		if (value & SOR_AUDIO_HDA_CODEC_SCRATCH0_VALID) {
-			unsigned int sample_rate, channels;
-
 			format = value & SOR_AUDIO_HDA_CODEC_SCRATCH0_FMT_MASK;
 
-			hda_format_parse(format, &sample_rate, &channels);
-
-			hdmi->audio_sample_rate = sample_rate;
-			hdmi->audio_channels = channels;
+			tegra_hda_parse_format(format, &hdmi->format);
 
 			err = tegra_hdmi_setup_audio(hdmi);
 			if (err < 0) {
@@ -1662,8 +1637,6 @@ static int tegra_hdmi_probe(struct platform_device *pdev)
 	hdmi->dev = &pdev->dev;
 
 	hdmi->audio_source = AUTO;
-	hdmi->audio_sample_rate = 48000;
-	hdmi->audio_channels = 2;
 	hdmi->stereo = false;
 	hdmi->dvi = false;
 

From db5adf4d6dced4e3326ce369fe0c213c968095f4 Mon Sep 17 00:00:00 2001
From: Alban Bedel <alban.bedel@avionic-design.de>
Date: Wed, 14 Dec 2016 18:20:39 +0100
Subject: [PATCH 05/26] drm/tegra: hdmi: Fix audio to work with any pixel clock
 rate

The audio setting implementation was limited to a few specific pixel
clocks. This prevented HDMI audio from working on several test devices
as they need a pixel clock that is not supported by this implementation.

Fix this by implementing the algorithm provided in the TRM using fixed
point arithmetic. This allows the driver to cope with any sane pixel
clock rate.

Signed-off-by: Alban Bedel <alban.bedel@avionic-design.de>
[treding@nvidia.com: fix uninitialized variable warning]
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/hdmi.c | 155 ++++++++++++-----------------------
 1 file changed, 52 insertions(+), 103 deletions(-)

diff --git a/drivers/gpu/drm/tegra/hdmi.c b/drivers/gpu/drm/tegra/hdmi.c
index 8fdff801c6f2..3ce551911548 100644
--- a/drivers/gpu/drm/tegra/hdmi.c
+++ b/drivers/gpu/drm/tegra/hdmi.c
@@ -11,6 +11,7 @@
 #include <linux/debugfs.h>
 #include <linux/gpio.h>
 #include <linux/hdmi.h>
+#include <linux/math64.h>
 #include <linux/of_device.h>
 #include <linux/pm_runtime.h>
 #include <linux/regulator/consumer.h>
@@ -115,68 +116,11 @@ static inline void tegra_hdmi_writel(struct tegra_hdmi *hdmi, u32 value,
 }
 
 struct tegra_hdmi_audio_config {
-	unsigned int pclk;
 	unsigned int n;
 	unsigned int cts;
 	unsigned int aval;
 };
 
-static const struct tegra_hdmi_audio_config tegra_hdmi_audio_32k[] = {
-	{  25200000, 4096,  25200, 24000 },
-	{  27000000, 4096,  27000, 24000 },
-	{  74250000, 4096,  74250, 24000 },
-	{ 148500000, 4096, 148500, 24000 },
-	{         0,    0,      0,     0 },
-};
-
-static const struct tegra_hdmi_audio_config tegra_hdmi_audio_44_1k[] = {
-	{  25200000, 5880,  26250, 25000 },
-	{  27000000, 5880,  28125, 25000 },
-	{  74250000, 4704,  61875, 20000 },
-	{ 148500000, 4704, 123750, 20000 },
-	{         0,    0,      0,     0 },
-};
-
-static const struct tegra_hdmi_audio_config tegra_hdmi_audio_48k[] = {
-	{  25200000, 6144,  25200, 24000 },
-	{  27000000, 6144,  27000, 24000 },
-	{  74250000, 6144,  74250, 24000 },
-	{ 148500000, 6144, 148500, 24000 },
-	{         0,    0,      0,     0 },
-};
-
-static const struct tegra_hdmi_audio_config tegra_hdmi_audio_88_2k[] = {
-	{  25200000, 11760,  26250, 25000 },
-	{  27000000, 11760,  28125, 25000 },
-	{  74250000,  9408,  61875, 20000 },
-	{ 148500000,  9408, 123750, 20000 },
-	{         0,     0,      0,     0 },
-};
-
-static const struct tegra_hdmi_audio_config tegra_hdmi_audio_96k[] = {
-	{  25200000, 12288,  25200, 24000 },
-	{  27000000, 12288,  27000, 24000 },
-	{  74250000, 12288,  74250, 24000 },
-	{ 148500000, 12288, 148500, 24000 },
-	{         0,     0,      0,     0 },
-};
-
-static const struct tegra_hdmi_audio_config tegra_hdmi_audio_176_4k[] = {
-	{  25200000, 23520,  26250, 25000 },
-	{  27000000, 23520,  28125, 25000 },
-	{  74250000, 18816,  61875, 20000 },
-	{ 148500000, 18816, 123750, 20000 },
-	{         0,     0,      0,     0 },
-};
-
-static const struct tegra_hdmi_audio_config tegra_hdmi_audio_192k[] = {
-	{  25200000, 24576,  25200, 24000 },
-	{  27000000, 24576,  27000, 24000 },
-	{  74250000, 24576,  74250, 24000 },
-	{ 148500000, 24576, 148500, 24000 },
-	{         0,     0,      0,     0 },
-};
-
 static const struct tmds_config tegra20_tmds_config[] = {
 	{ /* slow pixel clock modes */
 		.pclk = 27000000,
@@ -414,52 +358,53 @@ static const struct tmds_config tegra124_tmds_config[] = {
 	},
 };
 
-static const struct tegra_hdmi_audio_config *
-tegra_hdmi_get_audio_config(unsigned int sample_rate, unsigned int pclk)
+static int
+tegra_hdmi_get_audio_config(unsigned int audio_freq, unsigned int pix_clock,
+			    struct tegra_hdmi_audio_config *config)
 {
-	const struct tegra_hdmi_audio_config *table;
+	const unsigned int afreq = 128 * audio_freq;
+	const unsigned int min_n = afreq / 1500;
+	const unsigned int max_n = afreq / 300;
+	const unsigned int ideal_n = afreq / 1000;
+	int64_t min_err = (uint64_t)-1 >> 1;
+	unsigned int min_delta = -1;
+	int n;
 
-	switch (sample_rate) {
-	case 32000:
-		table = tegra_hdmi_audio_32k;
-		break;
+	memset(config, 0, sizeof(*config));
+	config->n = -1;
 
-	case 44100:
-		table = tegra_hdmi_audio_44_1k;
-		break;
+	for (n = min_n; n <= max_n; n++) {
+		uint64_t cts_f, aval_f;
+		unsigned int delta;
+		int64_t cts, err;
 
-	case 48000:
-		table = tegra_hdmi_audio_48k;
-		break;
+		/* compute aval in 48.16 fixed point */
+		aval_f = ((int64_t)24000000 << 16) * n;
+		do_div(aval_f, afreq);
+		/* It should round without any rest */
+		if (aval_f & 0xFFFF)
+			continue;
 
-	case 88200:
-		table = tegra_hdmi_audio_88_2k;
-		break;
+		/* Compute cts in 48.16 fixed point */
+		cts_f = ((int64_t)pix_clock << 16) * n;
+		do_div(cts_f, afreq);
+		/* Round it to the nearest integer */
+		cts = (cts_f & ~0xFFFF) + ((cts_f & BIT(15)) << 1);
 
-	case 96000:
-		table = tegra_hdmi_audio_96k;
-		break;
+		delta = abs(n - ideal_n);
 
-	case 176400:
-		table = tegra_hdmi_audio_176_4k;
-		break;
-
-	case 192000:
-		table = tegra_hdmi_audio_192k;
-		break;
-
-	default:
-		return NULL;
+		/* Compute the absolute error */
+		err = abs((int64_t)cts_f - cts);
+		if (err < min_err || (err == min_err && delta < min_delta)) {
+			config->n = n;
+			config->cts = cts >> 16;
+			config->aval = aval_f >> 16;
+			min_delta = delta;
+			min_err = err;
+		}
 	}
 
-	while (table->pclk) {
-		if (table->pclk == pclk)
-			return table;
-
-		table++;
-	}
-
-	return NULL;
+	return config->n != -1 ? 0 : -EINVAL;
 }
 
 static void tegra_hdmi_setup_audio_fs_tables(struct tegra_hdmi *hdmi)
@@ -515,8 +460,9 @@ static void tegra_hdmi_write_aval(struct tegra_hdmi *hdmi, u32 value)
 
 static int tegra_hdmi_setup_audio(struct tegra_hdmi *hdmi)
 {
-	const struct tegra_hdmi_audio_config *config;
+	struct tegra_hdmi_audio_config config;
 	u32 source, value;
+	int err;
 
 	switch (hdmi->audio_source) {
 	case HDA:
@@ -591,25 +537,28 @@ static int tegra_hdmi_setup_audio(struct tegra_hdmi *hdmi)
 		tegra_hdmi_writel(hdmi, value, HDMI_NV_PDISP_SOR_AUDIO_SPARE0);
 	}
 
-	config = tegra_hdmi_get_audio_config(hdmi->format.sample_rate,
-					     hdmi->pixel_clock);
-	if (!config) {
+	err = tegra_hdmi_get_audio_config(hdmi->format.sample_rate,
+					  hdmi->pixel_clock, &config);
+	if (err < 0) {
 		dev_err(hdmi->dev,
 			"cannot set audio to %u Hz at %u Hz pixel clock\n",
 			hdmi->format.sample_rate, hdmi->pixel_clock);
-		return -EINVAL;
+		return err;
 	}
 
+	dev_dbg(hdmi->dev, "audio: pixclk=%u, n=%u, cts=%u, aval=%u\n",
+		hdmi->pixel_clock, config.n, config.cts, config.aval);
+
 	tegra_hdmi_writel(hdmi, 0, HDMI_NV_PDISP_HDMI_ACR_CTRL);
 
 	value = AUDIO_N_RESETF | AUDIO_N_GENERATE_ALTERNATE |
-		AUDIO_N_VALUE(config->n - 1);
+		AUDIO_N_VALUE(config.n - 1);
 	tegra_hdmi_writel(hdmi, value, HDMI_NV_PDISP_AUDIO_N);
 
-	tegra_hdmi_writel(hdmi, ACR_SUBPACK_N(config->n) | ACR_ENABLE,
+	tegra_hdmi_writel(hdmi, ACR_SUBPACK_N(config.n) | ACR_ENABLE,
 			  HDMI_NV_PDISP_HDMI_ACR_0441_SUBPACK_HIGH);
 
-	tegra_hdmi_writel(hdmi, ACR_SUBPACK_CTS(config->cts),
+	tegra_hdmi_writel(hdmi, ACR_SUBPACK_CTS(config.cts),
 			  HDMI_NV_PDISP_HDMI_ACR_0441_SUBPACK_LOW);
 
 	value = SPARE_HW_CTS | SPARE_FORCE_SW_CTS | SPARE_CTS_RESET_VAL(1);
@@ -620,7 +569,7 @@ static int tegra_hdmi_setup_audio(struct tegra_hdmi *hdmi)
 	tegra_hdmi_writel(hdmi, value, HDMI_NV_PDISP_AUDIO_N);
 
 	if (hdmi->config->has_hda)
-		tegra_hdmi_write_aval(hdmi, config->aval);
+		tegra_hdmi_write_aval(hdmi, config.aval);
 
 	tegra_hdmi_setup_audio_fs_tables(hdmi);
 

From 0747a672a32829809675028ba396f394b104ee31 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 10 Dec 2018 22:51:04 +0100
Subject: [PATCH 06/26] gpu: host1x: Use completion instead of semaphore

In this usage, the two are completely equivalent, but the completion
documents better what is going on, and we generally try to avoid
semaphores these days.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/cdma.c | 6 +++---
 drivers/gpu/host1x/cdma.h | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index 91df51e631b2..15bdeb836e3d 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -210,7 +210,7 @@ unsigned int host1x_cdma_wait_locked(struct host1x_cdma *cdma,
 		cdma->event = event;
 
 		mutex_unlock(&cdma->lock);
-		down(&cdma->sem);
+		wait_for_completion(&cdma->complete);
 		mutex_lock(&cdma->lock);
 	}
 
@@ -314,7 +314,7 @@ static void update_cdma_locked(struct host1x_cdma *cdma)
 
 	if (signal) {
 		cdma->event = CDMA_EVENT_NONE;
-		up(&cdma->sem);
+		complete(&cdma->complete);
 	}
 }
 
@@ -416,7 +416,7 @@ int host1x_cdma_init(struct host1x_cdma *cdma)
 	int err;
 
 	mutex_init(&cdma->lock);
-	sema_init(&cdma->sem, 0);
+	init_completion(&cdma->complete);
 
 	INIT_LIST_HEAD(&cdma->sync_queue);
 
diff --git a/drivers/gpu/host1x/cdma.h b/drivers/gpu/host1x/cdma.h
index e97e17b82370..71078359c626 100644
--- a/drivers/gpu/host1x/cdma.h
+++ b/drivers/gpu/host1x/cdma.h
@@ -20,7 +20,7 @@
 #define __HOST1X_CDMA_H
 
 #include <linux/sched.h>
-#include <linux/semaphore.h>
+#include <linux/completion.h>
 #include <linux/list.h>
 
 struct host1x_syncpt;
@@ -69,8 +69,8 @@ enum cdma_event {
 
 struct host1x_cdma {
 	struct mutex lock;		/* controls access to shared state */
-	struct semaphore sem;		/* signalled when event occurs */
-	enum cdma_event event;		/* event that sem is waiting for */
+	struct completion complete;	/* signalled when event occurs */
+	enum cdma_event event;		/* event that complete is waiting for */
 	unsigned int slots_used;	/* pb slots used in current submit */
 	unsigned int slots_free;	/* pb slots free in current submit */
 	unsigned int first_get;		/* DMAGET value, where submit begins */

From f67524caf49949b8d1a219f1fd8ea263854a6683 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 23 Jan 2019 12:34:05 +0100
Subject: [PATCH 07/26] gpu: host1x: Represent host1x bus devices in debugfs

This new debugfs file represents the state of host1x bus devices,
specifying the list of subdevices and marking which ones have
successfully registered.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/bus.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c
index b4c385d4a6af..103fffc1904b 100644
--- a/drivers/gpu/host1x/bus.c
+++ b/drivers/gpu/host1x/bus.c
@@ -15,8 +15,10 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/debugfs.h>
 #include <linux/host1x.h>
 #include <linux/of.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/of_device.h>
 
@@ -500,6 +502,36 @@ static void host1x_detach_driver(struct host1x *host1x,
 	mutex_unlock(&host1x->devices_lock);
 }
 
+static int host1x_devices_show(struct seq_file *s, void *data)
+{
+	struct host1x *host1x = s->private;
+	struct host1x_device *device;
+
+	mutex_lock(&host1x->devices_lock);
+
+	list_for_each_entry(device, &host1x->devices, list) {
+		struct host1x_subdev *subdev;
+
+		seq_printf(s, "%s\n", dev_name(&device->dev));
+
+		mutex_lock(&device->subdevs_lock);
+
+		list_for_each_entry(subdev, &device->active, list)
+			seq_printf(s, "  %pOFf: %s\n", subdev->np,
+				   dev_name(subdev->client->dev));
+
+		list_for_each_entry(subdev, &device->subdevs, list)
+			seq_printf(s, "  %pOFf:\n", subdev->np);
+
+		mutex_unlock(&device->subdevs_lock);
+	}
+
+	mutex_unlock(&host1x->devices_lock);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(host1x_devices);
+
 /**
  * host1x_register() - register a host1x controller
  * @host1x: host1x controller
@@ -523,6 +555,9 @@ int host1x_register(struct host1x *host1x)
 
 	mutex_unlock(&drivers_lock);
 
+	debugfs_create_file("devices", S_IRUGO, host1x->debugfs, host1x,
+			    &host1x_devices_fops);
+
 	return 0;
 }
 

From 6841482b82e5ba8a403559cbc0c15706624db17a Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:22 +0100
Subject: [PATCH 08/26] gpu: host1x: Set up stream ID table

In order to enable the MMIO path stream ID protection provided by the
incarnation of host1x found in Tegra186 and later, the host1x must be
provided with the list of stream ID register offsets for each of its
clients. Some clients (such as VIC) have multiple stream ID registers
that are assumed to be contiguous. The host1x is programmed with the
base offset and a limit which provide the range of registers that the
host1x needs to monitor for writes.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/dev.c | 38 ++++++++++++++++++++++++++++++++++++++
 drivers/gpu/host1x/dev.h |  8 ++++++++
 2 files changed, 46 insertions(+)

diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c
index 419d8929a98f..4c044ee54fe6 100644
--- a/drivers/gpu/host1x/dev.c
+++ b/drivers/gpu/host1x/dev.c
@@ -120,6 +120,15 @@ static const struct host1x_info host1x05_info = {
 	.dma_mask = DMA_BIT_MASK(34),
 };
 
+static const struct host1x_sid_entry tegra186_sid_table[] = {
+	{
+		/* VIC */
+		.base = 0x1af0,
+		.offset = 0x30,
+		.limit = 0x34
+	},
+};
+
 static const struct host1x_info host1x06_info = {
 	.nb_channels = 63,
 	.nb_pts = 576,
@@ -129,6 +138,17 @@ static const struct host1x_info host1x06_info = {
 	.sync_offset = 0x0,
 	.dma_mask = DMA_BIT_MASK(34),
 	.has_hypervisor = true,
+	.num_sid_entries = ARRAY_SIZE(tegra186_sid_table),
+	.sid_table = tegra186_sid_table,
+};
+
+static const struct host1x_sid_entry tegra194_sid_table[] = {
+	{
+		/* VIC */
+		.base = 0x1af0,
+		.offset = 0x30,
+		.limit = 0x34
+	},
 };
 
 static const struct host1x_info host1x07_info = {
@@ -140,6 +160,8 @@ static const struct host1x_info host1x07_info = {
 	.sync_offset = 0x0,
 	.dma_mask = DMA_BIT_MASK(40),
 	.has_hypervisor = true,
+	.num_sid_entries = ARRAY_SIZE(tegra194_sid_table),
+	.sid_table = tegra194_sid_table,
 };
 
 static const struct of_device_id host1x_of_match[] = {
@@ -154,6 +176,19 @@ static const struct of_device_id host1x_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, host1x_of_match);
 
+static void host1x_setup_sid_table(struct host1x *host)
+{
+	const struct host1x_info *info = host->info;
+	unsigned int i;
+
+	for (i = 0; i < info->num_sid_entries; i++) {
+		const struct host1x_sid_entry *entry = &info->sid_table[i];
+
+		host1x_hypervisor_writel(host, entry->offset, entry->base);
+		host1x_hypervisor_writel(host, entry->limit, entry->base + 4);
+	}
+}
+
 static int host1x_probe(struct platform_device *pdev)
 {
 	struct host1x *host;
@@ -316,6 +351,9 @@ skip_iommu:
 
 	host1x_debug_init(host);
 
+	if (host->info->has_hypervisor)
+		host1x_setup_sid_table(host);
+
 	err = host1x_register(host);
 	if (err < 0)
 		goto fail_deinit_intr;
diff --git a/drivers/gpu/host1x/dev.h b/drivers/gpu/host1x/dev.h
index 36f44ffebe73..05216a7e4830 100644
--- a/drivers/gpu/host1x/dev.h
+++ b/drivers/gpu/host1x/dev.h
@@ -94,6 +94,12 @@ struct host1x_intr_ops {
 	int (*free_syncpt_irq)(struct host1x *host);
 };
 
+struct host1x_sid_entry {
+	unsigned int base;
+	unsigned int offset;
+	unsigned int limit;
+};
+
 struct host1x_info {
 	unsigned int nb_channels; /* host1x: number of channels supported */
 	unsigned int nb_pts; /* host1x: number of syncpoints supported */
@@ -103,6 +109,8 @@ struct host1x_info {
 	unsigned int sync_offset; /* offset of syncpoint registers */
 	u64 dma_mask; /* mask of addressable memory */
 	bool has_hypervisor; /* has hypervisor registers */
+	unsigned int num_sid_entries;
+	const struct host1x_sid_entry *sid_table;
 };
 
 struct host1x {

From de5469c21ff9c3e5ba6162dae58379ed51443164 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:23 +0100
Subject: [PATCH 09/26] gpu: host1x: Program the channel stream ID

When processing command streams, make sure the host1x's stream ID is
programmed for the channel so that addresses are properly translated
through the SMMU.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/hw/channel_hw.c          | 13 +++++++++++++
 drivers/gpu/host1x/hw/host1x06_hardware.h   |  1 +
 drivers/gpu/host1x/hw/host1x07_hardware.h   |  1 +
 drivers/gpu/host1x/hw/hw_host1x06_channel.h | 11 +++++++++++
 drivers/gpu/host1x/hw/hw_host1x07_channel.h | 11 +++++++++++
 5 files changed, 37 insertions(+)
 create mode 100644 drivers/gpu/host1x/hw/hw_host1x06_channel.h
 create mode 100644 drivers/gpu/host1x/hw/hw_host1x07_channel.h

diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
index 95ea81172a83..3067af4452cd 100644
--- a/drivers/gpu/host1x/hw/channel_hw.c
+++ b/drivers/gpu/host1x/hw/channel_hw.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/host1x.h>
+#include <linux/iommu.h>
 #include <linux/slab.h>
 
 #include <trace/events/host1x.h>
@@ -89,6 +90,16 @@ static inline void synchronize_syncpt_base(struct host1x_job *job)
 			 HOST1X_UCLASS_LOAD_SYNCPT_BASE_VALUE_F(value));
 }
 
+static void host1x_channel_set_streamid(struct host1x_channel *channel)
+{
+#if HOST1X_HW >= 6
+	struct iommu_fwspec *spec = dev_iommu_fwspec_get(channel->dev->parent);
+	u32 sid = spec ? spec->ids[0] & 0xffff : 0x7f;
+
+	host1x_ch_writel(channel, sid, HOST1X_CHANNEL_SMMU_STREAMID);
+#endif
+}
+
 static int channel_submit(struct host1x_job *job)
 {
 	struct host1x_channel *ch = job->channel;
@@ -120,6 +131,8 @@ static int channel_submit(struct host1x_job *job)
 		goto error;
 	}
 
+	host1x_channel_set_streamid(ch);
+
 	/* begin a CDMA submit */
 	err = host1x_cdma_begin(&ch->cdma, job);
 	if (err) {
diff --git a/drivers/gpu/host1x/hw/host1x06_hardware.h b/drivers/gpu/host1x/hw/host1x06_hardware.h
index 3039c92ea605..eab753b91f24 100644
--- a/drivers/gpu/host1x/hw/host1x06_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x06_hardware.h
@@ -22,6 +22,7 @@
 #include <linux/types.h>
 #include <linux/bitops.h>
 
+#include "hw_host1x06_channel.h"
 #include "hw_host1x06_uclass.h"
 #include "hw_host1x06_vm.h"
 #include "hw_host1x06_hypervisor.h"
diff --git a/drivers/gpu/host1x/hw/host1x07_hardware.h b/drivers/gpu/host1x/hw/host1x07_hardware.h
index 1353e7ab71dd..a79f57dc87bb 100644
--- a/drivers/gpu/host1x/hw/host1x07_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x07_hardware.h
@@ -22,6 +22,7 @@
 #include <linux/types.h>
 #include <linux/bitops.h>
 
+#include "hw_host1x07_channel.h"
 #include "hw_host1x07_uclass.h"
 #include "hw_host1x07_vm.h"
 #include "hw_host1x07_hypervisor.h"
diff --git a/drivers/gpu/host1x/hw/hw_host1x06_channel.h b/drivers/gpu/host1x/hw/hw_host1x06_channel.h
new file mode 100644
index 000000000000..18ae1c57bbea
--- /dev/null
+++ b/drivers/gpu/host1x/hw/hw_host1x06_channel.h
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 NVIDIA Corporation.
+ */
+
+#ifndef HOST1X_HW_HOST1X06_CHANNEL_H
+#define HOST1X_HW_HOST1X06_CHANNEL_H
+
+#define HOST1X_CHANNEL_SMMU_STREAMID 0x084
+
+#endif
diff --git a/drivers/gpu/host1x/hw/hw_host1x07_channel.h b/drivers/gpu/host1x/hw/hw_host1x07_channel.h
new file mode 100644
index 000000000000..96fa72bbd7ab
--- /dev/null
+++ b/drivers/gpu/host1x/hw/hw_host1x07_channel.h
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 NVIDIA Corporation.
+ */
+
+#ifndef HOST1X_HW_HOST1X07_CHANNEL_H
+#define HOST1X_HW_HOST1X07_CHANNEL_H
+
+#define HOST1X_CHANNEL_SMMU_STREAMID 0x084
+
+#endif

From 5a5fccbd8c315c08db01e585e1cbe88e30b70691 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:24 +0100
Subject: [PATCH 10/26] gpu: host1x: Introduce support for wide opcodes

The CDMA push buffer can currently only handle opcodes that take a
single word parameter. However, the host1x implementation on Tegra186
and later supports opcodes that require multiple words as parameters.

Unfortunately the way the push buffer is structured, these wide opcodes
cannot simply be composed of two regular opcodes because that could
result in the wide opcode being split across the end of the push buffer
and the final RESTART opcode required to wrap the push buffer around
would break the wide opcode.

One way to fix this would be to remove the concept of slots to simplify
push buffer operations. However, that's not entirely trivial and should
be done in a separate patch. For now, simply use a different function
to push four-word opcodes into the push buffer. Technically only three
words are pushed, with the fourth word used as padding to preserve the
2-word alignment required by the slots abstraction. The fourth word is
always a NOP opcode.

Additional care must be taken when the end of the push buffer is
reached. If a four-word opcode doesn't fit into the push buffer without
being split by the boundary, NOP opcodes will be introduced and the new
wide opcode placed at the beginning of the push buffer.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/cdma.c     | 92 +++++++++++++++++++++++++++++++++++
 drivers/gpu/host1x/cdma.h     |  2 +
 include/trace/events/host1x.h | 26 ++++++++++
 3 files changed, 120 insertions(+)

diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index 15bdeb836e3d..64099cc1964b 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -217,6 +217,45 @@ unsigned int host1x_cdma_wait_locked(struct host1x_cdma *cdma,
 	return 0;
 }
 
+/*
+ * Sleep (if necessary) until the push buffer has enough free space.
+ *
+ * Must be called with the cdma lock held.
+ */
+int host1x_cdma_wait_pushbuffer_space(struct host1x *host1x,
+				      struct host1x_cdma *cdma,
+				      unsigned int needed)
+{
+	while (true) {
+		struct push_buffer *pb = &cdma->push_buffer;
+		unsigned int space;
+
+		space = host1x_pushbuffer_space(pb);
+		if (space >= needed)
+			break;
+
+		trace_host1x_wait_cdma(dev_name(cdma_to_channel(cdma)->dev),
+				       CDMA_EVENT_PUSH_BUFFER_SPACE);
+
+		host1x_hw_cdma_flush(host1x, cdma);
+
+		/* If somebody has managed to already start waiting, yield */
+		if (cdma->event != CDMA_EVENT_NONE) {
+			mutex_unlock(&cdma->lock);
+			schedule();
+			mutex_lock(&cdma->lock);
+			continue;
+		}
+
+		cdma->event = CDMA_EVENT_PUSH_BUFFER_SPACE;
+
+		mutex_unlock(&cdma->lock);
+		wait_for_completion(&cdma->complete);
+		mutex_lock(&cdma->lock);
+	}
+
+	return 0;
+}
 /*
  * Start timer that tracks the time spent by the job.
  * Must be called with the cdma lock held.
@@ -509,6 +548,59 @@ void host1x_cdma_push(struct host1x_cdma *cdma, u32 op1, u32 op2)
 	host1x_pushbuffer_push(pb, op1, op2);
 }
 
+/*
+ * Push four words into two consecutive push buffer slots. Note that extra
+ * care needs to be taken not to split the two slots across the end of the
+ * push buffer. Otherwise the RESTART opcode at the end of the push buffer
+ * that ensures processing will restart at the beginning will break up the
+ * four words.
+ *
+ * Blocks as necessary if the push buffer is full.
+ */
+void host1x_cdma_push_wide(struct host1x_cdma *cdma, u32 op1, u32 op2,
+			   u32 op3, u32 op4)
+{
+	struct host1x_channel *channel = cdma_to_channel(cdma);
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct push_buffer *pb = &cdma->push_buffer;
+	unsigned int needed = 2, extra = 0, i;
+	unsigned int space = cdma->slots_free;
+
+	if (host1x_debug_trace_cmdbuf)
+		trace_host1x_cdma_push_wide(dev_name(channel->dev), op1, op2,
+					    op3, op4);
+
+	/* compute number of extra slots needed for padding */
+	if (pb->pos + 16 > pb->size) {
+		extra = (pb->size - pb->pos) / 8;
+		needed += extra;
+	}
+
+	host1x_cdma_wait_pushbuffer_space(host1x, cdma, needed);
+	space = host1x_pushbuffer_space(pb);
+
+	cdma->slots_free = space - needed;
+	cdma->slots_used += needed;
+
+	/*
+	 * Note that we rely on the fact that this is only used to submit wide
+	 * gather opcodes, which consist of 3 words, and they are padded with
+	 * a NOP to avoid having to deal with fractional slots (a slot always
+	 * represents 2 words). The fourth opcode passed to this function will
+	 * therefore always be a NOP.
+	 *
+	 * This works around a slight ambiguity when it comes to opcodes. For
+	 * all current host1x incarnations the NOP opcode uses the exact same
+	 * encoding (0x20000000), so we could hard-code the value here, but a
+	 * new incarnation may change it and break that assumption.
+	 */
+	for (i = 0; i < extra; i++)
+		host1x_pushbuffer_push(pb, op4, op4);
+
+	host1x_pushbuffer_push(pb, op1, op2);
+	host1x_pushbuffer_push(pb, op3, op4);
+}
+
 /*
  * End a cdma submit
  * Kick off DMA, add job to the sync queue, and a number of slots to be freed
diff --git a/drivers/gpu/host1x/cdma.h b/drivers/gpu/host1x/cdma.h
index 71078359c626..3a5e0408b8d1 100644
--- a/drivers/gpu/host1x/cdma.h
+++ b/drivers/gpu/host1x/cdma.h
@@ -90,6 +90,8 @@ int host1x_cdma_init(struct host1x_cdma *cdma);
 int host1x_cdma_deinit(struct host1x_cdma *cdma);
 int host1x_cdma_begin(struct host1x_cdma *cdma, struct host1x_job *job);
 void host1x_cdma_push(struct host1x_cdma *cdma, u32 op1, u32 op2);
+void host1x_cdma_push_wide(struct host1x_cdma *cdma, u32 op1, u32 op2,
+			   u32 op3, u32 op4);
 void host1x_cdma_end(struct host1x_cdma *cdma, struct host1x_job *job);
 void host1x_cdma_update(struct host1x_cdma *cdma);
 void host1x_cdma_peek(struct host1x_cdma *cdma, u32 dmaget, int slot,
diff --git a/include/trace/events/host1x.h b/include/trace/events/host1x.h
index a37ef73092e5..3d340b6f1ea3 100644
--- a/include/trace/events/host1x.h
+++ b/include/trace/events/host1x.h
@@ -80,6 +80,32 @@ TRACE_EVENT(host1x_cdma_push,
 		__entry->name, __entry->op1, __entry->op2)
 );
 
+TRACE_EVENT(host1x_cdma_push_wide,
+	TP_PROTO(const char *name, u32 op1, u32 op2, u32 op3, u32 op4),
+
+	TP_ARGS(name, op1, op2, op3, op4),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(u32, op1)
+		__field(u32, op2)
+		__field(u32, op3)
+		__field(u32, op4)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->op1 = op1;
+		__entry->op2 = op2;
+		__entry->op3 = op3;
+		__entry->op4 = op4;
+	),
+
+	TP_printk("name=%s, op1=%08x, op2=%08x, op3=%08x op4=%08x",
+		__entry->name, __entry->op1, __entry->op2, __entry->op3,
+		__entry->op4)
+);
+
 TRACE_EVENT(host1x_cdma_push_gather,
 	TP_PROTO(const char *name, struct host1x_bo *bo,
 			u32 words, u32 offset, void *cmdbuf),

From 67a82dbc0a374df7a348cc8fb28982945035bd25 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:25 +0100
Subject: [PATCH 11/26] gpu: host1x: Support 40-bit addressing

Tegra186 and later support 40 bits of address space. Additional
registers need to be programmed to store the full 40 bits of push
buffer addresses.

Since command stream gathers can also reside in buffers in a 40-bit
address space, a new variant of the GATHER opcode is also introduced.
It takes two parameters: the first parameter contains the lower 32
bits of the address and the second parameter contains bits 32 to 39.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/hw/cdma_hw.c           | 32 ++++++++++++++++++-----
 drivers/gpu/host1x/hw/channel_hw.c        | 30 ++++++++++++++++++---
 drivers/gpu/host1x/hw/host1x06_hardware.h |  5 ++++
 drivers/gpu/host1x/hw/host1x07_hardware.h |  5 ++++
 4 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
index ce320534cbed..485aef5761af 100644
--- a/drivers/gpu/host1x/hw/cdma_hw.c
+++ b/drivers/gpu/host1x/hw/cdma_hw.c
@@ -68,20 +68,31 @@ static void cdma_timeout_cpu_incr(struct host1x_cdma *cdma, u32 getptr,
 static void cdma_start(struct host1x_cdma *cdma)
 {
 	struct host1x_channel *ch = cdma_to_channel(cdma);
+	u64 start, end;
 
 	if (cdma->running)
 		return;
 
 	cdma->last_pos = cdma->push_buffer.pos;
+	start = cdma->push_buffer.dma;
+	end = start + cdma->push_buffer.size + 4;
 
 	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP,
 			 HOST1X_CHANNEL_DMACTRL);
 
 	/* set base, put and end pointer */
-	host1x_ch_writel(ch, cdma->push_buffer.dma, HOST1X_CHANNEL_DMASTART);
+	host1x_ch_writel(ch, lower_32_bits(start), HOST1X_CHANNEL_DMASTART);
+#if HOST1X_HW >= 6
+	host1x_ch_writel(ch, upper_32_bits(start), HOST1X_CHANNEL_DMASTART_HI);
+#endif
 	host1x_ch_writel(ch, cdma->push_buffer.pos, HOST1X_CHANNEL_DMAPUT);
-	host1x_ch_writel(ch, cdma->push_buffer.dma + cdma->push_buffer.size + 4,
-			 HOST1X_CHANNEL_DMAEND);
+#if HOST1X_HW >= 6
+	host1x_ch_writel(ch, 0, HOST1X_CHANNEL_DMAPUT_HI);
+#endif
+	host1x_ch_writel(ch, lower_32_bits(end), HOST1X_CHANNEL_DMAEND);
+#if HOST1X_HW >= 6
+	host1x_ch_writel(ch, upper_32_bits(end), HOST1X_CHANNEL_DMAEND_HI);
+#endif
 
 	/* reset GET */
 	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP |
@@ -104,6 +115,7 @@ static void cdma_timeout_restart(struct host1x_cdma *cdma, u32 getptr)
 {
 	struct host1x *host1x = cdma_to_host1x(cdma);
 	struct host1x_channel *ch = cdma_to_channel(cdma);
+	u64 start, end;
 
 	if (cdma->running)
 		return;
@@ -113,10 +125,18 @@ static void cdma_timeout_restart(struct host1x_cdma *cdma, u32 getptr)
 	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP,
 			 HOST1X_CHANNEL_DMACTRL);
 
+	start = cdma->push_buffer.dma;
+	end = start + cdma->push_buffer.size + 4;
+
 	/* set base, end pointer (all of memory) */
-	host1x_ch_writel(ch, cdma->push_buffer.dma, HOST1X_CHANNEL_DMASTART);
-	host1x_ch_writel(ch, cdma->push_buffer.dma + cdma->push_buffer.size,
-			 HOST1X_CHANNEL_DMAEND);
+	host1x_ch_writel(ch, lower_32_bits(start), HOST1X_CHANNEL_DMASTART);
+#if HOST1X_HW >= 6
+	host1x_ch_writel(ch, upper_32_bits(start), HOST1X_CHANNEL_DMASTART_HI);
+#endif
+	host1x_ch_writel(ch, lower_32_bits(end), HOST1X_CHANNEL_DMAEND);
+#if HOST1X_HW >= 6
+	host1x_ch_writel(ch, upper_32_bits(end), HOST1X_CHANNEL_DMAEND_HI);
+#endif
 
 	/* set GET, by loading the value in PUT (then reset GET) */
 	host1x_ch_writel(ch, getptr, HOST1X_CHANNEL_DMAPUT);
diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
index 3067af4452cd..27101c04a827 100644
--- a/drivers/gpu/host1x/hw/channel_hw.c
+++ b/drivers/gpu/host1x/hw/channel_hw.c
@@ -61,15 +61,37 @@ static void trace_write_gather(struct host1x_cdma *cdma, struct host1x_bo *bo,
 static void submit_gathers(struct host1x_job *job)
 {
 	struct host1x_cdma *cdma = &job->channel->cdma;
+#if HOST1X_HW < 6
+	struct device *dev = job->channel->dev;
+#endif
 	unsigned int i;
 
 	for (i = 0; i < job->num_gathers; i++) {
 		struct host1x_job_gather *g = &job->gathers[i];
-		u32 op1 = host1x_opcode_gather(g->words);
-		u32 op2 = g->base + g->offset;
+		dma_addr_t addr = g->base + g->offset;
+		u32 op2, op3;
 
-		trace_write_gather(cdma, g->bo, g->offset, op1 & 0xffff);
-		host1x_cdma_push(cdma, op1, op2);
+		op2 = lower_32_bits(addr);
+		op3 = upper_32_bits(addr);
+
+		trace_write_gather(cdma, g->bo, g->offset, g->words);
+
+		if (op3 != 0) {
+#if HOST1X_HW >= 6
+			u32 op1 = host1x_opcode_gather_wide(g->words);
+			u32 op4 = HOST1X_OPCODE_NOP;
+
+			host1x_cdma_push_wide(cdma, op1, op2, op3, op4);
+#else
+			dev_err(dev, "invalid gather for push buffer %pad\n",
+				&addr);
+			continue;
+#endif
+		} else {
+			u32 op1 = host1x_opcode_gather(g->words);
+
+			host1x_cdma_push(cdma, op1, op2);
+		}
 	}
 }
 
diff --git a/drivers/gpu/host1x/hw/host1x06_hardware.h b/drivers/gpu/host1x/hw/host1x06_hardware.h
index eab753b91f24..dd37b10c8d04 100644
--- a/drivers/gpu/host1x/hw/host1x06_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x06_hardware.h
@@ -138,6 +138,11 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
 	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
 }
 
+static inline u32 host1x_opcode_gather_wide(unsigned count)
+{
+	return (12 << 28) | count;
+}
+
 #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
 
 #endif
diff --git a/drivers/gpu/host1x/hw/host1x07_hardware.h b/drivers/gpu/host1x/hw/host1x07_hardware.h
index a79f57dc87bb..9f6da4ee5443 100644
--- a/drivers/gpu/host1x/hw/host1x07_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x07_hardware.h
@@ -138,6 +138,11 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
 	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
 }
 
+static inline u32 host1x_opcode_gather_wide(unsigned count)
+{
+	return (12 << 28) | count;
+}
+
 #define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
 
 #endif

From 38fabcc953883741313d707a919326f77c2d7214 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:27 +0100
Subject: [PATCH 12/26] gpu: host1x: Restrict IOVA space to DMA mask

On Tegra186 and later, the ARM SMMU provides an input address space that
is 48 bits wide. However, memory clients can only address up to 40 bits.
If the geometry is used as-is, allocations of IOVA space can end up in a
region that is not addressable by the memory clients.

To fix this, restrict the IOVA space to the DMA mask of the host1x
device.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/dev.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c
index 4c044ee54fe6..544b67f2b3ff 100644
--- a/drivers/gpu/host1x/dev.c
+++ b/drivers/gpu/host1x/dev.c
@@ -283,6 +283,8 @@ static int host1x_probe(struct platform_device *pdev)
 	host->group = iommu_group_get(&pdev->dev);
 	if (host->group) {
 		struct iommu_domain_geometry *geometry;
+		u64 mask = dma_get_mask(host->dev);
+		dma_addr_t start, end;
 		unsigned long order;
 
 		err = iova_cache_get();
@@ -310,11 +312,12 @@ static int host1x_probe(struct platform_device *pdev)
 		}
 
 		geometry = &host->domain->geometry;
+		start = geometry->aperture_start & mask;
+		end = geometry->aperture_end & mask;
 
 		order = __ffs(host->domain->pgsize_bitmap);
-		init_iova_domain(&host->iova, 1UL << order,
-				 geometry->aperture_start >> order);
-		host->iova_end = geometry->aperture_end;
+		init_iova_domain(&host->iova, 1UL << order, start >> order);
+		host->iova_end = end;
 	}
 
 skip_iommu:

From 8de896eb206fea3caa26b5fc8e637934d8486f0f Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:28 +0100
Subject: [PATCH 13/26] gpu: host1x: Support 40-bit addressing on Tegra186

The host1x and clients instantiated on Tegra186 support addressing 40
bits of memory.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c
index 544b67f2b3ff..ee3c7b81a29d 100644
--- a/drivers/gpu/host1x/dev.c
+++ b/drivers/gpu/host1x/dev.c
@@ -136,7 +136,7 @@ static const struct host1x_info host1x06_info = {
 	.nb_bases = 16,
 	.init = host1x06_init,
 	.sync_offset = 0x0,
-	.dma_mask = DMA_BIT_MASK(34),
+	.dma_mask = DMA_BIT_MASK(40),
 	.has_hypervisor = true,
 	.num_sid_entries = ARRAY_SIZE(tegra186_sid_table),
 	.sid_table = tegra186_sid_table,

From 0e43b8da154a95f4369da4068a43ad9d700f4cea Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:29 +0100
Subject: [PATCH 14/26] gpu: host1x: Use correct semantics for
 HOST1X_CHANNEL_DMAEND

The HOST1X_CHANNEL_DMAEND is an offset relative to the value written to
the HOST1X_CHANNEL_DMASTART register, but it is currently treated as an
absolute address. This can cause SMMU faults if the CDMA fetches past a
pushbuffer's IOMMU mapping.

Properly setting the DMAEND prevents the CDMA from fetching beyond that
address and avoid such issues. This is currently not observed because a
whole (almost) page of essentially scratch space absorbs any excessive
prefetching by CDMA. However, changing the number of slots in the push
buffer can trigger these SMMU faults.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/hw/cdma_hw.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
index 485aef5761af..a24c090ac96f 100644
--- a/drivers/gpu/host1x/hw/cdma_hw.c
+++ b/drivers/gpu/host1x/hw/cdma_hw.c
@@ -75,7 +75,7 @@ static void cdma_start(struct host1x_cdma *cdma)
 
 	cdma->last_pos = cdma->push_buffer.pos;
 	start = cdma->push_buffer.dma;
-	end = start + cdma->push_buffer.size + 4;
+	end = cdma->push_buffer.size + 4;
 
 	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP,
 			 HOST1X_CHANNEL_DMACTRL);
@@ -126,7 +126,7 @@ static void cdma_timeout_restart(struct host1x_cdma *cdma, u32 getptr)
 			 HOST1X_CHANNEL_DMACTRL);
 
 	start = cdma->push_buffer.dma;
-	end = start + cdma->push_buffer.size + 4;
+	end = cdma->push_buffer.size + 4;
 
 	/* set base, end pointer (all of memory) */
 	host1x_ch_writel(ch, lower_32_bits(start), HOST1X_CHANNEL_DMASTART);

From e1f338c0f8a9aacf351e42e5cfd0639fc73dc5b9 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:30 +0100
Subject: [PATCH 15/26] gpu: host1x: Optimize CDMA push buffer memory usage

The host1x CDMA push buffer is terminated by a special opcode (RESTART)
that tells the CDMA to wrap around to the beginning of the push buffer.
To accomodate the RESTART opcode, an extra 4 bytes are allocated on top
of the 512 * 8 = 4096 bytes needed for the 512 slots (1 slot = 2 words)
that are used for other commands passed to CDMA. This requires that two
memory pages are allocated, but most of the second page (4092 bytes) is
never used.

Decrease the number of slots to 511 so that the RESTART opcode fits
within the page. Adjust the push buffer wraparound code to take into
account push buffer sizes that are not a power of two.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/cdma.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index 64099cc1964b..07df85b92ebf 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -41,7 +41,17 @@
  * means that the push buffer is full, not empty.
  */
 
-#define HOST1X_PUSHBUFFER_SLOTS	512
+/*
+ * Typically the commands written into the push buffer are a pair of words. We
+ * use slots to represent each of these pairs and to simplify things. Note the
+ * strange number of slots allocated here. 512 slots will fit exactly within a
+ * single memory page. We also need one additional word at the end of the push
+ * buffer for the RESTART opcode that will instruct the CDMA to jump back to
+ * the beginning of the push buffer. With 512 slots, this means that we'll use
+ * 2 memory pages and waste 4092 bytes of the second page that will never be
+ * used.
+ */
+#define HOST1X_PUSHBUFFER_SLOTS	511
 
 /*
  * Clean up push buffer resources
@@ -143,7 +153,10 @@ static void host1x_pushbuffer_push(struct push_buffer *pb, u32 op1, u32 op2)
 	WARN_ON(pb->pos == pb->fence);
 	*(p++) = op1;
 	*(p++) = op2;
-	pb->pos = (pb->pos + 8) & (pb->size - 1);
+	pb->pos += 8;
+
+	if (pb->pos >= pb->size)
+		pb->pos -= pb->size;
 }
 
 /*
@@ -153,7 +166,10 @@ static void host1x_pushbuffer_push(struct push_buffer *pb, u32 op1, u32 op2)
 static void host1x_pushbuffer_pop(struct push_buffer *pb, unsigned int slots)
 {
 	/* Advance the next write position */
-	pb->fence = (pb->fence + slots * 8) & (pb->size - 1);
+	pb->fence += slots * 8;
+
+	if (pb->fence >= pb->size)
+		pb->fence -= pb->size;
 }
 
 /*
@@ -161,7 +177,12 @@ static void host1x_pushbuffer_pop(struct push_buffer *pb, unsigned int slots)
  */
 static u32 host1x_pushbuffer_space(struct push_buffer *pb)
 {
-	return ((pb->fence - pb->pos) & (pb->size - 1)) / 8;
+	unsigned int fence = pb->fence;
+
+	if (pb->fence < pb->pos)
+		fence += pb->size;
+
+	return (fence - pb->pos) / 8;
 }
 
 /*

From 8e5d19c625f94e91d42aaf808e1231f636cf1365 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:31 +0100
Subject: [PATCH 16/26] drm/tegra: Store parent pointer in Tegra DRM clients

Tegra DRM clients need access to their parent, so store a pointer to it
upon registration. It's technically possible to get at this by going via
the host1x client's parent and getting the driver data, but that's quite
complicated and not very transparent. It's much more straightforward and
natural to let the children know about their parent.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Reviewed-by: Dmitry Osipenko <digetx@gmail.com>
---
 drivers/gpu/drm/tegra/drm.c | 2 ++
 drivers/gpu/drm/tegra/drm.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index 4b70ce664c41..61dcbd218ffc 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -1041,6 +1041,7 @@ int tegra_drm_register_client(struct tegra_drm *tegra,
 {
 	mutex_lock(&tegra->clients_lock);
 	list_add_tail(&client->list, &tegra->clients);
+	client->drm = tegra;
 	mutex_unlock(&tegra->clients_lock);
 
 	return 0;
@@ -1051,6 +1052,7 @@ int tegra_drm_unregister_client(struct tegra_drm *tegra,
 {
 	mutex_lock(&tegra->clients_lock);
 	list_del_init(&client->list);
+	client->drm = NULL;
 	mutex_unlock(&tegra->clients_lock);
 
 	return 0;
diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h
index f1763b4d5b5f..2c809755bca7 100644
--- a/drivers/gpu/drm/tegra/drm.h
+++ b/drivers/gpu/drm/tegra/drm.h
@@ -88,6 +88,7 @@ int tegra_drm_submit(struct tegra_drm_context *context,
 struct tegra_drm_client {
 	struct host1x_client base;
 	struct list_head list;
+	struct tegra_drm *drm;
 
 	unsigned int version;
 	const struct tegra_drm_client_ops *ops;

From 77a0b09dd993c83ee7c770cc704e9bec18fd19c7 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:32 +0100
Subject: [PATCH 17/26] drm/tegra: vic: Load firmware on demand

Loading the firmware requires an allocation of IOVA space to make sure
that the VIC's Falcon microcontroller can read the firmware if address
translation via the SMMU is enabled.

However, the allocation currently happens at a time where the geometry
of an IOMMU domain may not have been initialized yet. This happens for
example on Tegra186 and later where an ARM SMMU is used. Domains which
are created by the ARM SMMU driver postpone the geometry setup until a
device is attached to the domain. This is because IOMMU domains aren't
attached to a specific IOMMU instance at allocation time and hence the
input address space, which defines the geometry, is not known yet.

Work around this by postponing the firmware load until it is needed at
the time where a channel is opened to the VIC. At this time the shared
IOMMU domain's geometry has been properly initialized.

As a byproduct this allows the Tegra DRM to be created in the absence
of VIC firmware, since the VIC initialization no longer fails if the
firmware can't be found.

Based on an earlier patch by Dmitry Osipenko <digetx@gmail.com>.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Reviewed-by: Dmitry Osipenko <digetx@gmail.com>
---
 drivers/gpu/drm/tegra/vic.c | 53 +++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/tegra/vic.c b/drivers/gpu/drm/tegra/vic.c
index d47983deb1cf..739b894661ab 100644
--- a/drivers/gpu/drm/tegra/vic.c
+++ b/drivers/gpu/drm/tegra/vic.c
@@ -181,13 +181,6 @@ static int vic_init(struct host1x_client *client)
 		vic->domain = tegra->domain;
 	}
 
-	if (!vic->falcon.data) {
-		vic->falcon.data = tegra;
-		err = falcon_load_firmware(&vic->falcon);
-		if (err < 0)
-			goto detach;
-	}
-
 	vic->channel = host1x_channel_request(client->dev);
 	if (!vic->channel) {
 		err = -ENOMEM;
@@ -246,6 +239,30 @@ static const struct host1x_client_ops vic_client_ops = {
 	.exit = vic_exit,
 };
 
+static int vic_load_firmware(struct vic *vic)
+{
+	int err;
+
+	if (vic->falcon.data)
+		return 0;
+
+	vic->falcon.data = vic->client.drm;
+
+	err = falcon_read_firmware(&vic->falcon, vic->config->firmware);
+	if (err < 0)
+		goto cleanup;
+
+	err = falcon_load_firmware(&vic->falcon);
+	if (err < 0)
+		goto cleanup;
+
+	return 0;
+
+cleanup:
+	vic->falcon.data = NULL;
+	return err;
+}
+
 static int vic_open_channel(struct tegra_drm_client *client,
 			    struct tegra_drm_context *context)
 {
@@ -256,19 +273,25 @@ static int vic_open_channel(struct tegra_drm_client *client,
 	if (err < 0)
 		return err;
 
+	err = vic_load_firmware(vic);
+	if (err < 0)
+		goto rpm_put;
+
 	err = vic_boot(vic);
-	if (err < 0) {
-		pm_runtime_put(vic->dev);
-		return err;
-	}
+	if (err < 0)
+		goto rpm_put;
 
 	context->channel = host1x_channel_get(vic->channel);
 	if (!context->channel) {
-		pm_runtime_put(vic->dev);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto rpm_put;
 	}
 
 	return 0;
+
+rpm_put:
+	pm_runtime_put(vic->dev);
+	return err;
 }
 
 static void vic_close_channel(struct tegra_drm_context *context)
@@ -372,10 +395,6 @@ static int vic_probe(struct platform_device *pdev)
 	if (err < 0)
 		return err;
 
-	err = falcon_read_firmware(&vic->falcon, vic->config->firmware);
-	if (err < 0)
-		goto exit_falcon;
-
 	platform_set_drvdata(pdev, vic);
 
 	INIT_LIST_HEAD(&vic->client.base.list);

From b9f8b09ce256e2e197b9a38984dea64e3404728c Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:33 +0100
Subject: [PATCH 18/26] drm/tegra: Setup shared IOMMU domain after
 initialization

Move initialization of the shared IOMMU domain after the host1x device
has been initialized. At this point all the Tegra DRM clients have been
attached to the shared IOMMU domain.

This is important because Tegra186 and later use an ARM SMMU, for which
the driver defers setting up the geometry for a domain until a device is
attached to it. This is to ensure that the domain is properly set up for
a specific ARM SMMU instance, which is unknown at allocation time.

Reviewed-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/drm.c | 54 ++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index 61dcbd218ffc..271c7a5fc954 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -92,10 +92,6 @@ static int tegra_drm_load(struct drm_device *drm, unsigned long flags)
 		return -ENOMEM;
 
 	if (iommu_present(&platform_bus_type)) {
-		u64 carveout_start, carveout_end, gem_start, gem_end;
-		struct iommu_domain_geometry *geometry;
-		unsigned long order;
-
 		tegra->domain = iommu_domain_alloc(&platform_bus_type);
 		if (!tegra->domain) {
 			err = -ENOMEM;
@@ -105,27 +101,6 @@ static int tegra_drm_load(struct drm_device *drm, unsigned long flags)
 		err = iova_cache_get();
 		if (err < 0)
 			goto domain;
-
-		geometry = &tegra->domain->geometry;
-		gem_start = geometry->aperture_start;
-		gem_end = geometry->aperture_end - CARVEOUT_SZ;
-		carveout_start = gem_end + 1;
-		carveout_end = geometry->aperture_end;
-
-		order = __ffs(tegra->domain->pgsize_bitmap);
-		init_iova_domain(&tegra->carveout.domain, 1UL << order,
-				 carveout_start >> order);
-
-		tegra->carveout.shift = iova_shift(&tegra->carveout.domain);
-		tegra->carveout.limit = carveout_end >> tegra->carveout.shift;
-
-		drm_mm_init(&tegra->mm, gem_start, gem_end - gem_start + 1);
-		mutex_init(&tegra->mm_lock);
-
-		DRM_DEBUG("IOMMU apertures:\n");
-		DRM_DEBUG("  GEM: %#llx-%#llx\n", gem_start, gem_end);
-		DRM_DEBUG("  Carveout: %#llx-%#llx\n", carveout_start,
-			  carveout_end);
 	}
 
 	mutex_init(&tegra->clients_lock);
@@ -159,6 +134,35 @@ static int tegra_drm_load(struct drm_device *drm, unsigned long flags)
 	if (err < 0)
 		goto fbdev;
 
+	if (tegra->domain) {
+		u64 carveout_start, carveout_end, gem_start, gem_end;
+		dma_addr_t start, end;
+		unsigned long order;
+
+		start = tegra->domain->geometry.aperture_start;
+		end = tegra->domain->geometry.aperture_end;
+
+		gem_start = start;
+		gem_end = end - CARVEOUT_SZ;
+		carveout_start = gem_end + 1;
+		carveout_end = end;
+
+		order = __ffs(tegra->domain->pgsize_bitmap);
+		init_iova_domain(&tegra->carveout.domain, 1UL << order,
+				 carveout_start >> order);
+
+		tegra->carveout.shift = iova_shift(&tegra->carveout.domain);
+		tegra->carveout.limit = carveout_end >> tegra->carveout.shift;
+
+		drm_mm_init(&tegra->mm, gem_start, gem_end - gem_start + 1);
+		mutex_init(&tegra->mm_lock);
+
+		DRM_DEBUG("IOMMU apertures:\n");
+		DRM_DEBUG("  GEM: %#llx-%#llx\n", gem_start, gem_end);
+		DRM_DEBUG("  Carveout: %#llx-%#llx\n", carveout_start,
+			  carveout_end);
+	}
+
 	if (tegra->hub) {
 		err = tegra_display_hub_prepare(tegra->hub);
 		if (err < 0)

From 02be8e4fbb1f352f2ed6251db9fddd646d531b6e Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:34 +0100
Subject: [PATCH 19/26] drm/tegra: Restrict IOVA space to DMA mask

On Tegra186 and later, the ARM SMMU provides an input address space that
is 48 bits wide. However, memory clients can only address up to 40 bits.
If the geometry is used as-is, allocations of IOVA space can end up in a
region that cannot be addressed by the memory clients.

To fix this, restrict the IOVA space to the DMA mask of the host1x
device. Note that, technically, the IOVA space needs to be restricted to
the intersection of the DMA masks for all clients that are attached to
the IOMMU domain. In practice using the DMA mask of the host1x device is
sufficient because all host1x clients share the same DMA mask.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/drm.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index 271c7a5fc954..0c5f1e6a0446 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -136,11 +136,12 @@ static int tegra_drm_load(struct drm_device *drm, unsigned long flags)
 
 	if (tegra->domain) {
 		u64 carveout_start, carveout_end, gem_start, gem_end;
+		u64 dma_mask = dma_get_mask(&device->dev);
 		dma_addr_t start, end;
 		unsigned long order;
 
-		start = tegra->domain->geometry.aperture_start;
-		end = tegra->domain->geometry.aperture_end;
+		start = tegra->domain->geometry.aperture_start & dma_mask;
+		end = tegra->domain->geometry.aperture_end & dma_mask;
 
 		gem_start = start;
 		gem_end = end - CARVEOUT_SZ;

From 3ff41673d5c6842e6668f95b0a14e5f6a74d043f Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:35 +0100
Subject: [PATCH 20/26] drm/tegra: vic: Do not clear driver data

Upon driver failure, the driver core will take care of clearing the
driver data, so there's no need to do so explicitly in the driver.

Reviewed-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/vic.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/tegra/vic.c b/drivers/gpu/drm/tegra/vic.c
index 739b894661ab..55a8cc162e9d 100644
--- a/drivers/gpu/drm/tegra/vic.c
+++ b/drivers/gpu/drm/tegra/vic.c
@@ -412,7 +412,6 @@ static int vic_probe(struct platform_device *pdev)
 	err = host1x_client_register(&vic->client.base);
 	if (err < 0) {
 		dev_err(dev, "failed to register host1x client: %d\n", err);
-		platform_set_drvdata(pdev, NULL);
 		goto exit_falcon;
 	}
 

From f3779cb190a5a12d2e26fd5af724fb1384a9144f Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 1 Feb 2019 14:28:36 +0100
Subject: [PATCH 21/26] drm/tegra: vic: Support stream ID register programming

The version of VIC found in Tegra186 and later incorporates improvements
with regards to context isolation. As part of those improvements, stream
ID registers were added that allow to specify separate stream IDs for
the Falcon microcontroller and the VIC memory interface.

While it is possible to also set the stream ID dynamically at runtime to
allow userspace contexts to be completely separated, this commit doesn't
implement that yet. Instead, the static VIC stream ID is programmed when
the Falcon is booted. This ensures that memory accesses by the Falcon or
the VIC are properly translated via the SMMU.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/vic.c | 21 +++++++++++++++++++++
 drivers/gpu/drm/tegra/vic.h |  9 +++++++++
 2 files changed, 30 insertions(+)

diff --git a/drivers/gpu/drm/tegra/vic.c b/drivers/gpu/drm/tegra/vic.c
index 55a8cc162e9d..39bfed9623de 100644
--- a/drivers/gpu/drm/tegra/vic.c
+++ b/drivers/gpu/drm/tegra/vic.c
@@ -26,6 +26,7 @@
 struct vic_config {
 	const char *firmware;
 	unsigned int version;
+	bool supports_sid;
 };
 
 struct vic {
@@ -105,6 +106,22 @@ static int vic_boot(struct vic *vic)
 	if (vic->booted)
 		return 0;
 
+	if (vic->config->supports_sid) {
+		struct iommu_fwspec *spec = dev_iommu_fwspec_get(vic->dev);
+		u32 value;
+
+		value = TRANSCFG_ATT(1, TRANSCFG_SID_FALCON) |
+			TRANSCFG_ATT(0, TRANSCFG_SID_HW);
+		vic_writel(vic, value, VIC_TFBIF_TRANSCFG);
+
+		if (spec && spec->num_ids > 0) {
+			value = spec->ids[0] & 0xffff;
+
+			vic_writel(vic, value, VIC_THI_STREAMID0);
+			vic_writel(vic, value, VIC_THI_STREAMID1);
+		}
+	}
+
 	/* setup clockgating registers */
 	vic_writel(vic, CG_IDLE_CG_DLY_CNT(4) |
 			CG_IDLE_CG_EN |
@@ -314,6 +331,7 @@ static const struct tegra_drm_client_ops vic_ops = {
 static const struct vic_config vic_t124_config = {
 	.firmware = NVIDIA_TEGRA_124_VIC_FIRMWARE,
 	.version = 0x40,
+	.supports_sid = false,
 };
 
 #define NVIDIA_TEGRA_210_VIC_FIRMWARE "nvidia/tegra210/vic04_ucode.bin"
@@ -321,6 +339,7 @@ static const struct vic_config vic_t124_config = {
 static const struct vic_config vic_t210_config = {
 	.firmware = NVIDIA_TEGRA_210_VIC_FIRMWARE,
 	.version = 0x21,
+	.supports_sid = false,
 };
 
 #define NVIDIA_TEGRA_186_VIC_FIRMWARE "nvidia/tegra186/vic04_ucode.bin"
@@ -328,6 +347,7 @@ static const struct vic_config vic_t210_config = {
 static const struct vic_config vic_t186_config = {
 	.firmware = NVIDIA_TEGRA_186_VIC_FIRMWARE,
 	.version = 0x18,
+	.supports_sid = true,
 };
 
 #define NVIDIA_TEGRA_194_VIC_FIRMWARE "nvidia/tegra194/vic.bin"
@@ -335,6 +355,7 @@ static const struct vic_config vic_t186_config = {
 static const struct vic_config vic_t194_config = {
 	.firmware = NVIDIA_TEGRA_194_VIC_FIRMWARE,
 	.version = 0x19,
+	.supports_sid = true,
 };
 
 static const struct of_device_id vic_match[] = {
diff --git a/drivers/gpu/drm/tegra/vic.h b/drivers/gpu/drm/tegra/vic.h
index 21844817a7e1..017584340dd6 100644
--- a/drivers/gpu/drm/tegra/vic.h
+++ b/drivers/gpu/drm/tegra/vic.h
@@ -17,11 +17,20 @@
 
 /* VIC registers */
 
+#define VIC_THI_STREAMID0	0x00000030
+#define VIC_THI_STREAMID1	0x00000034
+
 #define NV_PVIC_MISC_PRI_VIC_CG			0x000016d0
 #define CG_IDLE_CG_DLY_CNT(val)			((val & 0x3f) << 0)
 #define CG_IDLE_CG_EN				(1 << 6)
 #define CG_WAKEUP_DLY_CNT(val)			((val & 0xf) << 16)
 
+#define VIC_TFBIF_TRANSCFG	0x00002044
+#define  TRANSCFG_ATT(i, v)	(((v) & 0x3) << (i * 4))
+#define  TRANSCFG_SID_HW	0
+#define  TRANSCFG_SID_PHY	1
+#define  TRANSCFG_SID_FALCON	2
+
 /* Firmware offsets */
 
 #define VIC_UCODE_FCE_HEADER_OFFSET		(6*4)

From 6c2b3881d0df85fed7e3f43dcc9cbc3e5124bc12 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 25 Jan 2019 11:00:57 +0100
Subject: [PATCH 22/26] dt-bindings: display: tegra: Support SOR crossbar
 configuration

The SOR has a crossbar that can map each lane of the SOR to each of the
SOR pads. The mapping is usually the same across designs for a specific
SoC generation, but every now and then there's a design that doesn't.

Allow the crossbar configuration to be specified in device tree to make
it possible to support these designs.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 .../bindings/display/tegra/nvidia,tegra20-host1x.txt           | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt
index 593be44a53c9..9999255ac5b6 100644
--- a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt
@@ -238,6 +238,9 @@ of the following host1x client modules:
   - nvidia,hpd-gpio: specifies a GPIO used for hotplug detection
   - nvidia,edid: supplies a binary EDID blob
   - nvidia,panel: phandle of a display panel
+  - nvidia,xbar-cfg: 5 cells containing the crossbar configuration. Each lane
+    of the SOR, identified by the cell's index, is mapped via the crossbar to
+    the pad specified by the cell's value.
 
   Optional properties when driving an eDP output:
   - nvidia,dpaux: phandle to a DispayPort AUX interface

From 6d6c815daad8ec3469135c310ed39849cbe7752c Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 25 Jan 2019 11:00:58 +0100
Subject: [PATCH 23/26] drm/tegra: sor: Support device tree crossbar
 configuration

The crossbar configuration is usually the same across all designs for a
given SoC generation. But sometimes there are designs that require some
other configuration.

Implement support for parsing the crossbar configuration from a device
tree. If the crossbar configuration is not present in the device tree,
fall back to the default crossbar configuration.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/sor.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/tegra/sor.c b/drivers/gpu/drm/tegra/sor.c
index 79602debf231..beb8fda25826 100644
--- a/drivers/gpu/drm/tegra/sor.c
+++ b/drivers/gpu/drm/tegra/sor.c
@@ -410,6 +410,8 @@ struct tegra_sor {
 	struct clk *clk_dp;
 	struct clk *clk;
 
+	u8 xbar_cfg[5];
+
 	struct drm_dp_aux *aux;
 
 	struct drm_info_list *debugfs_files;
@@ -1814,7 +1816,7 @@ static void tegra_sor_edp_enable(struct drm_encoder *encoder)
 
 	/* XXX not in TRM */
 	for (value = 0, i = 0; i < 5; i++)
-		value |= SOR_XBAR_CTRL_LINK0_XSEL(i, sor->soc->xbar_cfg[i]) |
+		value |= SOR_XBAR_CTRL_LINK0_XSEL(i, sor->xbar_cfg[i]) |
 			 SOR_XBAR_CTRL_LINK1_XSEL(i, i);
 
 	tegra_sor_writel(sor, 0x00000000, SOR_XBAR_POL);
@@ -2550,7 +2552,7 @@ static void tegra_sor_hdmi_enable(struct drm_encoder *encoder)
 
 	/* XXX not in TRM */
 	for (value = 0, i = 0; i < 5; i++)
-		value |= SOR_XBAR_CTRL_LINK0_XSEL(i, sor->soc->xbar_cfg[i]) |
+		value |= SOR_XBAR_CTRL_LINK0_XSEL(i, sor->xbar_cfg[i]) |
 			 SOR_XBAR_CTRL_LINK1_XSEL(i, i);
 
 	tegra_sor_writel(sor, 0x00000000, SOR_XBAR_POL);
@@ -3171,6 +3173,8 @@ MODULE_DEVICE_TABLE(of, tegra_sor_of_match);
 static int tegra_sor_parse_dt(struct tegra_sor *sor)
 {
 	struct device_node *np = sor->dev->of_node;
+	u32 xbar_cfg[5];
+	unsigned int i;
 	u32 value;
 	int err;
 
@@ -3188,6 +3192,17 @@ static int tegra_sor_parse_dt(struct tegra_sor *sor)
 		sor->pad = TEGRA_IO_PAD_HDMI_DP0 + sor->index;
 	}
 
+	err = of_property_read_u32_array(np, "nvidia,xbar-cfg", xbar_cfg, 5);
+	if (err < 0) {
+		/* fall back to default per-SoC XBAR configuration */
+		for (i = 0; i < 5; i++)
+			sor->xbar_cfg[i] = sor->soc->xbar_cfg[i];
+	} else {
+		/* copy cells to SOR XBAR configuration */
+		for (i = 0; i < 5; i++)
+			sor->xbar_cfg[i] = xbar_cfg[i];
+	}
+
 	return 0;
 }
 

From e8bad659381e9d63164f61e3bdf37e1c12274016 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Tue, 7 Aug 2018 16:07:11 +0300
Subject: [PATCH 24/26] gpu: host1x: Cancel only job that actually got stuck

Host1x doesn't have information about jobs inter-dependency, that is
something that will become available once host1x will get a proper
jobs scheduler implementation. Currently a hang job causes other unrelated
jobs to be canceled, that is a relic from downstream driver which is
irrelevant to upstream. Let's cancel only the hanging job and not to touch
other jobs in queue.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Reviewed-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/cdma.c | 33 +++++++--------------------------
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index 07df85b92ebf..2314f37cb735 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -408,13 +408,11 @@ void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
 	}
 
 	/*
-	 * Walk the sync_queue, first incrementing with the CPU syncpts that
-	 * are partially executed (the first buffer) or fully skipped while
-	 * still in the current context (slots are also NOP-ed).
+	 * Increment with CPU the remaining syncpts of a partially executed job.
 	 *
-	 * At the point contexts are interleaved, syncpt increments must be
-	 * done inline with the pushbuffer from a GATHER buffer to maintain
-	 * the order (slots are modified to be a GATHER of syncpt incrs).
+	 * Syncpt increments must be done inline with the pushbuffer from a
+	 * GATHER buffer to maintain the order (slots are modified to be a
+	 * GATHER of syncpt incrs).
 	 *
 	 * Note: save in restart_addr the location where the timed out buffer
 	 * started in the PB, so we can start the refetch from there (with the
@@ -422,20 +420,15 @@ void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
 	 * properly for this buffer and resources are freed.
 	 */
 
-	dev_dbg(dev, "%s: perform CPU incr on pending same ctx buffers\n",
-		__func__);
+	dev_dbg(dev, "%s: perform CPU incr on pending buffers\n", __func__);
 
 	if (!list_empty(&cdma->sync_queue))
 		restart_addr = job->first_get;
 	else
 		restart_addr = cdma->last_pos;
 
-	/* do CPU increments as long as this context continues */
-	list_for_each_entry_from(job, &cdma->sync_queue, list) {
-		/* different context, gets us out of this loop */
-		if (job->client != cdma->timeout.client)
-			break;
-
+	/* do CPU increments for the remaining syncpts */
+	if (job) {
 		/* won't need a timeout when replayed */
 		job->timeout = 0;
 
@@ -448,20 +441,8 @@ void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
 		host1x_hw_cdma_timeout_cpu_incr(host1x, cdma, job->first_get,
 						syncpt_incrs, job->syncpt_end,
 						job->num_slots);
-
-		syncpt_val += syncpt_incrs;
 	}
 
-	/*
-	 * The following sumbits from the same client may be dependent on the
-	 * failed submit and therefore they may fail. Force a small timeout
-	 * to make the queue cleanup faster.
-	 */
-
-	list_for_each_entry_from(job, &cdma->sync_queue, list)
-		if (job->client == cdma->timeout.client)
-			job->timeout = min_t(unsigned int, job->timeout, 500);
-
 	dev_dbg(dev, "%s: finished sync_queue modification\n", __func__);
 
 	/* roll back DMAGET and start up channel again */

From 5d6f043685fed86153ea6aeadd098a11267f58ef Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Tue, 7 Aug 2018 16:07:12 +0300
Subject: [PATCH 25/26] gpu: host1x: Don't complete a completed job

There is a chance that the last job has been completed at the time of
CDMA timeout handler invocation. In this case there is no need to complete
the completed job.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Reviewed-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/cdma.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index 2314f37cb735..7556e0efd35a 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -383,7 +383,7 @@ void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
 {
 	struct host1x *host1x = cdma_to_host1x(cdma);
 	u32 restart_addr, syncpt_incrs, syncpt_val;
-	struct host1x_job *job = NULL;
+	struct host1x_job *job;
 
 	syncpt_val = host1x_syncpt_load(cdma->timeout.syncpt);
 
@@ -402,11 +402,16 @@ void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
 
 	list_for_each_entry(job, &cdma->sync_queue, list) {
 		if (syncpt_val < job->syncpt_end)
-			break;
+			goto syncpt_incr;
 
 		host1x_job_dump(dev, job);
 	}
 
+	/* all jobs have been completed */
+	job = NULL;
+
+syncpt_incr:
+
 	/*
 	 * Increment with CPU the remaining syncpts of a partially executed job.
 	 *
@@ -419,16 +424,16 @@ void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
 	 * modified NOP-ed PB slots). This lets things appear to have completed
 	 * properly for this buffer and resources are freed.
 	 */
-
-	dev_dbg(dev, "%s: perform CPU incr on pending buffers\n", __func__);
-
-	if (!list_empty(&cdma->sync_queue))
+	if (job)
 		restart_addr = job->first_get;
 	else
 		restart_addr = cdma->last_pos;
 
 	/* do CPU increments for the remaining syncpts */
 	if (job) {
+		dev_dbg(dev, "%s: perform CPU incr on pending buffers\n",
+			__func__);
+
 		/* won't need a timeout when replayed */
 		job->timeout = 0;
 
@@ -441,9 +446,10 @@ void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
 		host1x_hw_cdma_timeout_cpu_incr(host1x, cdma, job->first_get,
 						syncpt_incrs, job->syncpt_end,
 						job->num_slots);
-	}
 
-	dev_dbg(dev, "%s: finished sync_queue modification\n", __func__);
+		dev_dbg(dev, "%s: finished sync_queue modification\n",
+			__func__);
+	}
 
 	/* roll back DMAGET and start up channel again */
 	host1x_hw_cdma_resume(host1x, cdma, restart_addr);

From 79930bafe2802c3a67a70ad4904032d9154bf3fa Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Tue, 7 Aug 2018 16:07:13 +0300
Subject: [PATCH 26/26] gpu: host1x: Continue CDMA execution starting with a
 next job

Currently gathers of a hung job are getting NOP'ed and a restarted CDMA
executes the NOP'ed gathers. There shouldn't be a reason to not restart
CDMA execution starting with a next job, avoiding the unnecessary churning
with gathers NOP'ing.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Reviewed-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/cdma.c       | 23 +++++++++++------------
 drivers/gpu/host1x/hw/cdma_hw.c | 14 --------------
 2 files changed, 11 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index 7556e0efd35a..f45b7c69b694 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -383,7 +383,7 @@ void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
 {
 	struct host1x *host1x = cdma_to_host1x(cdma);
 	u32 restart_addr, syncpt_incrs, syncpt_val;
-	struct host1x_job *job;
+	struct host1x_job *job, *next_job = NULL;
 
 	syncpt_val = host1x_syncpt_load(cdma->timeout.syncpt);
 
@@ -401,8 +401,13 @@ void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
 		__func__);
 
 	list_for_each_entry(job, &cdma->sync_queue, list) {
-		if (syncpt_val < job->syncpt_end)
+		if (syncpt_val < job->syncpt_end) {
+
+			if (!list_is_last(&job->list, &cdma->sync_queue))
+				next_job = list_next_entry(job, list);
+
 			goto syncpt_incr;
+		}
 
 		host1x_job_dump(dev, job);
 	}
@@ -415,17 +420,11 @@ syncpt_incr:
 	/*
 	 * Increment with CPU the remaining syncpts of a partially executed job.
 	 *
-	 * Syncpt increments must be done inline with the pushbuffer from a
-	 * GATHER buffer to maintain the order (slots are modified to be a
-	 * GATHER of syncpt incrs).
-	 *
-	 * Note: save in restart_addr the location where the timed out buffer
-	 * started in the PB, so we can start the refetch from there (with the
-	 * modified NOP-ed PB slots). This lets things appear to have completed
-	 * properly for this buffer and resources are freed.
+	 * CDMA will continue execution starting with the next job or will get
+	 * into idle state.
 	 */
-	if (job)
-		restart_addr = job->first_get;
+	if (next_job)
+		restart_addr = next_job->first_get;
 	else
 		restart_addr = cdma->last_pos;
 
diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
index a24c090ac96f..5d61088db2bb 100644
--- a/drivers/gpu/host1x/hw/cdma_hw.c
+++ b/drivers/gpu/host1x/hw/cdma_hw.c
@@ -39,8 +39,6 @@ static void push_buffer_init(struct push_buffer *pb)
 static void cdma_timeout_cpu_incr(struct host1x_cdma *cdma, u32 getptr,
 				u32 syncpt_incrs, u32 syncval, u32 nr_slots)
 {
-	struct host1x *host1x = cdma_to_host1x(cdma);
-	struct push_buffer *pb = &cdma->push_buffer;
 	unsigned int i;
 
 	for (i = 0; i < syncpt_incrs; i++)
@@ -48,18 +46,6 @@ static void cdma_timeout_cpu_incr(struct host1x_cdma *cdma, u32 getptr,
 
 	/* after CPU incr, ensure shadow is up to date */
 	host1x_syncpt_load(cdma->timeout.syncpt);
-
-	/* NOP all the PB slots */
-	while (nr_slots--) {
-		u32 *p = (u32 *)(pb->mapped + getptr);
-		*(p++) = HOST1X_OPCODE_NOP;
-		*(p++) = HOST1X_OPCODE_NOP;
-		dev_dbg(host1x->dev, "%s: NOP at %pad+%#x\n", __func__,
-			&pb->dma, getptr);
-		getptr = (getptr + 8) & (pb->size - 1);
-	}
-
-	wmb();
 }
 
 /*