aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3510-drm-amd-display-Optimize-regamma-calculations.patch
blob: 61d10836eac5cf6e7d415a276f389edd0f4d3557 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
From c4737669669d41e583029b94acfc2a26631b0813 Mon Sep 17 00:00:00 2001
From: Krunoslav Kovac <Krunoslav.Kovac@amd.com>
Date: Fri, 19 Jan 2018 17:55:26 -0500
Subject: [PATCH 3510/4131] drm/amd/display: Optimize regamma calculations

There are several optimizations:
1) Use predefined SRGB, don't calculate. This is the most common case.
2) Precompute HW X points at boot since they're fixed in ColModule
3) Precompute PQ - it never changes and is very CPU intensive in fixed pt.
4) Reduce number of points in ColModule to 512 (32x16) from 1024. This also
requires reducing some regions for legacy DCEs to 16 pts at most.

Performance
1) is super-fast, build_output_tf is 1-2us, down from 25000-30000.
Programming also fast since only one reg write.
2)+3) gives build_output_tf for PQ in ~100us range, down from ~80000-110000
2) + 4) results in slightly over 50% improvement. It gives an idea of the
savings when we can't use SRGB or PQ table (e.g. sdr white level > 80).

There's also a bit of refactoring: renaming some stuff that was misleading
and removing a lot of magic numbers that novices might not be able to
understand where they come from and what they mean.

Signed-off-by: Krunoslav Kovac <Krunoslav.Kovac@amd.com>
Reviewed-by: Tony Cheng <Tony.Cheng@amd.com>
Acked-by: Harry Wentland <harry.wentland@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 .../amd/display/dc/dce110/dce110_hw_sequencer.c    | 56 ++++++++++------------
 .../gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c | 39 ++++++++-------
 drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c   |  2 +-
 drivers/gpu/drm/amd/display/dc/inc/hw/hw_shared.h  |  2 +-
 4 files changed, 47 insertions(+), 52 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
index b87974e..54c933b 100644
--- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
+++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
@@ -407,6 +407,10 @@ static bool convert_to_custom_float(struct pwl_result_data *rgb_resulted,
 	return true;
 }
 
+#define MAX_LOW_POINT      11
+#define NUMBER_REGIONS     16
+#define NUMBER_SW_SEGMENTS 16
+
 static bool
 dce110_translate_regamma_to_hw_format(const struct dc_transfer_func *output_tf,
 				      struct pwl_params *regamma_params)
@@ -421,8 +425,8 @@ dce110_translate_regamma_to_hw_format(const struct dc_transfer_func *output_tf,
 	struct fixed31_32 y1_min;
 	struct fixed31_32 y3_max;
 
-	int32_t segment_start, segment_end;
-	uint32_t i, j, k, seg_distr[16], increment, start_index, hw_points;
+	int32_t region_start, region_end;
+	uint32_t i, j, k, seg_distr[NUMBER_REGIONS], increment, start_index, hw_points;
 
 	if (output_tf == NULL || regamma_params == NULL || output_tf->type == TF_TYPE_BYPASS)
 		return false;
@@ -437,34 +441,20 @@ dce110_translate_regamma_to_hw_format(const struct dc_transfer_func *output_tf,
 		/* 16 segments
 		 * segments are from 2^-11 to 2^5
 		 */
-		segment_start = -11;
-		segment_end = 5;
-
-		seg_distr[0] = 2;
-		seg_distr[1] = 2;
-		seg_distr[2] = 2;
-		seg_distr[3] = 2;
-		seg_distr[4] = 2;
-		seg_distr[5] = 2;
-		seg_distr[6] = 3;
-		seg_distr[7] = 4;
-		seg_distr[8] = 4;
-		seg_distr[9] = 4;
-		seg_distr[10] = 4;
-		seg_distr[11] = 5;
-		seg_distr[12] = 5;
-		seg_distr[13] = 5;
-		seg_distr[14] = 5;
-		seg_distr[15] = 5;
+		region_start = -MAX_LOW_POINT;
+		region_end = NUMBER_REGIONS - MAX_LOW_POINT;
+
+		for (i = 0; i < NUMBER_REGIONS; i++)
+			seg_distr[i] = 4;
 
 	} else {
 		/* 10 segments
 		 * segment is from 2^-10 to 2^0
 		 */
-		segment_start = -10;
-		segment_end = 0;
+		region_start = -10;
+		region_end = 0;
 
-		seg_distr[0] = 3;
+		seg_distr[0] = 4;
 		seg_distr[1] = 4;
 		seg_distr[2] = 4;
 		seg_distr[3] = 4;
@@ -472,8 +462,8 @@ dce110_translate_regamma_to_hw_format(const struct dc_transfer_func *output_tf,
 		seg_distr[5] = 4;
 		seg_distr[6] = 4;
 		seg_distr[7] = 4;
-		seg_distr[8] = 5;
-		seg_distr[9] = 5;
+		seg_distr[8] = 4;
+		seg_distr[9] = 4;
 		seg_distr[10] = -1;
 		seg_distr[11] = -1;
 		seg_distr[12] = -1;
@@ -488,10 +478,12 @@ dce110_translate_regamma_to_hw_format(const struct dc_transfer_func *output_tf,
 	}
 
 	j = 0;
-	for (k = 0; k < (segment_end - segment_start); k++) {
+	for (k = 0; k < (region_end - region_start); k++) {
 		increment = 32 / (1 << seg_distr[k]);
-		start_index = (segment_start + k + 25) * 32;
-		for (i = start_index; i < start_index + 32; i += increment) {
+		start_index = (region_start + k + MAX_LOW_POINT) *
+				NUMBER_SW_SEGMENTS;
+		for (i = start_index; i < start_index + NUMBER_SW_SEGMENTS;
+				i += increment) {
 			if (j == hw_points - 1)
 				break;
 			rgb_resulted[j].red = output_tf->tf_pts.red[i];
@@ -502,15 +494,15 @@ dce110_translate_regamma_to_hw_format(const struct dc_transfer_func *output_tf,
 	}
 
 	/* last point */
-	start_index = (segment_end + 25) * 32;
+	start_index = (region_end + MAX_LOW_POINT) * NUMBER_SW_SEGMENTS;
 	rgb_resulted[hw_points - 1].red = output_tf->tf_pts.red[start_index];
 	rgb_resulted[hw_points - 1].green = output_tf->tf_pts.green[start_index];
 	rgb_resulted[hw_points - 1].blue = output_tf->tf_pts.blue[start_index];
 
 	arr_points[0].x = dal_fixed31_32_pow(dal_fixed31_32_from_int(2),
-					     dal_fixed31_32_from_int(segment_start));
+					     dal_fixed31_32_from_int(region_start));
 	arr_points[1].x = dal_fixed31_32_pow(dal_fixed31_32_from_int(2),
-					     dal_fixed31_32_from_int(segment_end));
+					     dal_fixed31_32_from_int(region_end));
 
 	y_r = rgb_resulted[0].red;
 	y_g = rgb_resulted[0].green;
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c
index 53ba360..b3db639 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c
@@ -232,10 +232,11 @@ bool cm_helper_convert_to_custom_float(
 	return true;
 }
 
-
+/* driver uses 32 regions or less, but DCN HW has 34, extra 2 are set to 0 */
 #define MAX_REGIONS_NUMBER 34
 #define MAX_LOW_POINT      25
-#define NUMBER_SEGMENTS    32
+#define NUMBER_REGIONS     32
+#define NUMBER_SW_SEGMENTS 16
 
 bool cm_helper_translate_curve_to_hw_format(
 				const struct dc_transfer_func *output_tf,
@@ -251,7 +252,7 @@ bool cm_helper_translate_curve_to_hw_format(
 	struct fixed31_32 y1_min;
 	struct fixed31_32 y3_max;
 
-	int32_t segment_start, segment_end;
+	int32_t region_start, region_end;
 	int32_t i;
 	uint32_t j, k, seg_distr[MAX_REGIONS_NUMBER], increment, start_index, hw_points;
 
@@ -271,11 +272,11 @@ bool cm_helper_translate_curve_to_hw_format(
 		/* 32 segments
 		 * segments are from 2^-25 to 2^7
 		 */
-		for (i = 0; i < 32 ; i++)
+		for (i = 0; i < NUMBER_REGIONS ; i++)
 			seg_distr[i] = 3;
 
-		segment_start = -25;
-		segment_end   = 7;
+		region_start = -MAX_LOW_POINT;
+		region_end   = NUMBER_REGIONS - MAX_LOW_POINT;
 	} else {
 		/* 10 segments
 		 * segment is from 2^-10 to 2^0
@@ -289,14 +290,14 @@ bool cm_helper_translate_curve_to_hw_format(
 		seg_distr[5] = 4;
 		seg_distr[6] = 4;
 		seg_distr[7] = 4;
-		seg_distr[8] = 5;
-		seg_distr[9] = 5;
+		seg_distr[8] = 4;
+		seg_distr[9] = 4;
 
-		segment_start = -10;
-		segment_end = 0;
+		region_start = -10;
+		region_end = 0;
 	}
 
-	for (i = segment_end - segment_start; i < MAX_REGIONS_NUMBER ; i++)
+	for (i = region_end - region_start; i < MAX_REGIONS_NUMBER ; i++)
 		seg_distr[i] = -1;
 
 	for (k = 0; k < MAX_REGIONS_NUMBER; k++) {
@@ -305,10 +306,12 @@ bool cm_helper_translate_curve_to_hw_format(
 	}
 
 	j = 0;
-	for (k = 0; k < (segment_end - segment_start); k++) {
-		increment = NUMBER_SEGMENTS / (1 << seg_distr[k]);
-		start_index = (segment_start + k + MAX_LOW_POINT) * NUMBER_SEGMENTS;
-		for (i = start_index; i < start_index + NUMBER_SEGMENTS; i += increment) {
+	for (k = 0; k < (region_end - region_start); k++) {
+		increment = NUMBER_SW_SEGMENTS / (1 << seg_distr[k]);
+		start_index = (region_start + k + MAX_LOW_POINT) *
+				NUMBER_SW_SEGMENTS;
+		for (i = start_index; i < start_index + NUMBER_SW_SEGMENTS;
+				i += increment) {
 			if (j == hw_points - 1)
 				break;
 			rgb_resulted[j].red = output_tf->tf_pts.red[i];
@@ -319,15 +322,15 @@ bool cm_helper_translate_curve_to_hw_format(
 	}
 
 	/* last point */
-	start_index = (segment_end + MAX_LOW_POINT) * NUMBER_SEGMENTS;
+	start_index = (region_end + MAX_LOW_POINT) * NUMBER_SW_SEGMENTS;
 	rgb_resulted[hw_points - 1].red = output_tf->tf_pts.red[start_index];
 	rgb_resulted[hw_points - 1].green = output_tf->tf_pts.green[start_index];
 	rgb_resulted[hw_points - 1].blue = output_tf->tf_pts.blue[start_index];
 
 	arr_points[0].x = dal_fixed31_32_pow(dal_fixed31_32_from_int(2),
-					     dal_fixed31_32_from_int(segment_start));
+					     dal_fixed31_32_from_int(region_start));
 	arr_points[1].x = dal_fixed31_32_pow(dal_fixed31_32_from_int(2),
-					     dal_fixed31_32_from_int(segment_end));
+					     dal_fixed31_32_from_int(region_end));
 
 	y_r = rgb_resulted[0].red;
 	y_g = rgb_resulted[0].green;
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c
index 080c253..8725cab 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c
@@ -196,7 +196,7 @@ static void dpp1_cm_set_regamma_pwl(
 	case OPP_REGAMMA_SRGB:
 		re_mode = 1;
 		break;
-	case OPP_REGAMMA_3_6:
+	case OPP_REGAMMA_XVYCC:
 		re_mode = 2;
 		break;
 	case OPP_REGAMMA_USER:
diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/hw_shared.h b/drivers/gpu/drm/amd/display/dc/inc/hw/hw_shared.h
index e3f0b40..b221581 100644
--- a/drivers/gpu/drm/amd/display/dc/inc/hw/hw_shared.h
+++ b/drivers/gpu/drm/amd/display/dc/inc/hw/hw_shared.h
@@ -136,7 +136,7 @@ struct out_csc_color_matrix {
 enum opp_regamma {
 	OPP_REGAMMA_BYPASS = 0,
 	OPP_REGAMMA_SRGB,
-	OPP_REGAMMA_3_6,
+	OPP_REGAMMA_XVYCC,
 	OPP_REGAMMA_USER
 };
 
-- 
2.7.4