meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3061-drm-amd-display-Optimize-gamma-calculations.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333

From 9d796880fd196a54088a4ab097b441d5fadfba05 Mon Sep 17 00:00:00 2001
From: Krunoslav Kovac <Krunoslav.Kovac@amd.com>
Date: Tue, 18 Jun 2019 17:38:43 -0400
Subject: [PATCH 3061/4256] drm/amd/display: Optimize gamma calculations

[Why&How]

1. Stack usage is pretty high as fixed31_32 struct is 8 bytes and we
have functions with >30 vars on the stack.

2. Optimize gamma calculation by reducing number of calls to
dc_fixpt_pow Our X points are divided into 32 regions wth 16 pts each.
Each region is 2x the previous, meaning x[i] = 2*x[i-16] for i>=16.
Using (2x)^gamma = 2^gamma * x^gamma, we can recursively compute powers
of gamma, we just need first 16 pts to start it up. dc_fixpt_pow() is
expensive, it computes x^y by doing exp(y*logx) Exp is done by Taylor
series approximation, and log by Newton-like approximation that also
uses exp internally. In short, it's significantly heavier than
run-of-the-mill addition/subtraction/multiply.

Signed-off-by: Krunoslav Kovac <Krunoslav.Kovac@amd.com>
Reviewed-by: Anthony Koo <Anthony.Koo@amd.com>
Acked-by: Aric Cyr <Aric.Cyr@amd.com>
Acked-by: Leo Li <sunpeng.li@amd.com>
---
 drivers/gpu/drm/amd/display/dc/dc_hw_types.h  |   1 -
 .../amd/display/modules/color/color_gamma.c   | 163 +++++++++++-------
 .../amd/display/modules/color/color_gamma.h   |   9 +
 3 files changed, 111 insertions(+), 62 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dc_hw_types.h b/drivers/gpu/drm/amd/display/dc/dc_hw_types.h
index 22db5682aa6c..e9a6225f4720 100644
--- a/drivers/gpu/drm/amd/display/dc/dc_hw_types.h
+++ b/drivers/gpu/drm/amd/display/dc/dc_hw_types.h
@@ -482,7 +482,6 @@ struct dc_gamma {
 	 * is_logical_identity indicates the given gamma ramp regardless of type is identity.
 	 */
 	bool is_identity;
-	bool is_logical_identity;
 };
 
 /* Used by both ipp amd opp functions*/
diff --git a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c
index 3f413fb9f2ce..294fe4f0cb67 100644
--- a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c
+++ b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c
@@ -37,6 +37,33 @@ static struct hw_x_point coordinates_x[MAX_HW_POINTS + 2];
 static struct fixed31_32 pq_table[MAX_HW_POINTS + 2];
 static struct fixed31_32 de_pq_table[MAX_HW_POINTS + 2];
 
+// these are helpers for calculations to reduce stack usage
+// do not depend on these being preserved across calls
+static struct fixed31_32 scratch_1;
+static struct fixed31_32 scratch_2;
+static struct translate_from_linear_space_args scratch_gamma_args;
+
+/* Helper to optimize gamma calculation, only use in translate_from_linear, in
+ * particular the dc_fixpt_pow function which is very expensive
+ * The idea is that our regions for X points are exponential and currently they all use
+ * the same number of points (NUM_PTS_IN_REGION) and in each region every point
+ * is exactly 2x the one at the same index in the previous region. In other words
+ * X[i] = 2 * X[i-NUM_PTS_IN_REGION] for i>=16
+ * The other fact is that (2x)^gamma = 2^gamma * x^gamma
+ * So we compute and save x^gamma for the first 16 regions, and for every next region
+ * just multiply with 2^gamma which can be computed once, and save the result so we
+ * recursively compute all the values.
+ */
+static struct fixed31_32 pow_buffer[NUM_PTS_IN_REGION];
+static struct fixed31_32 gamma_of_2; // 2^gamma
+int pow_buffer_ptr = -1;
+
+static const int32_t gamma_numerator01[] = { 31308,	180000,	0};
+static const int32_t gamma_numerator02[] = { 12920,	4500,	0};
+static const int32_t gamma_numerator03[] = { 55,	99,	0};
+static const int32_t gamma_numerator04[] = { 55,	99,	0};
+static const int32_t gamma_numerator05[] = { 2400,	2200, 2200};
+
 static bool pq_initialized; /* = false; */
 static bool de_pq_initialized; /* = false; */
 
@@ -248,11 +275,7 @@ enum gamma_type_index {
 
 static void build_coefficients(struct gamma_coefficients *coefficients, enum gamma_type_index type)
 {
-	static const int32_t numerator01[] = { 31308,	180000,	0};
-	static const int32_t numerator02[] = { 12920,	4500,	0};
-	static const int32_t numerator03[] = { 55,		99,		0};
-	static const int32_t numerator04[] = { 55,		99,		0};
-	static const int32_t numerator05[] = { 2400,	2200, 2200};
+
 
 	uint32_t i = 0;
 	uint32_t index = 0;
@@ -264,69 +287,74 @@ static void build_coefficients(struct gamma_coefficients *coefficients, enum gam
 
 	do {
 		coefficients->a0[i] = dc_fixpt_from_fraction(
-			numerator01[index], 10000000);
+			gamma_numerator01[index], 10000000);
 		coefficients->a1[i] = dc_fixpt_from_fraction(
-			numerator02[index], 1000);
+			gamma_numerator02[index], 1000);
 		coefficients->a2[i] = dc_fixpt_from_fraction(
-			numerator03[index], 1000);
+			gamma_numerator03[index], 1000);
 		coefficients->a3[i] = dc_fixpt_from_fraction(
-			numerator04[index], 1000);
+			gamma_numerator04[index], 1000);
 		coefficients->user_gamma[i] = dc_fixpt_from_fraction(
-			numerator05[index], 1000);
+			gamma_numerator05[index], 1000);
 
 		++i;
 	} while (i != ARRAY_SIZE(coefficients->a0));
 }
 
 static struct fixed31_32 translate_from_linear_space(
-	struct fixed31_32 arg,
-	struct fixed31_32 a0,
-	struct fixed31_32 a1,
-	struct fixed31_32 a2,
-	struct fixed31_32 a3,
-	struct fixed31_32 gamma)
+		struct translate_from_linear_space_args *args)
 {
 	const struct fixed31_32 one = dc_fixpt_from_int(1);
 
-	if (dc_fixpt_lt(one, arg))
+	if (dc_fixpt_le(one, args->arg))
 		return one;
 
-	if (dc_fixpt_le(arg, dc_fixpt_neg(a0)))
-		return dc_fixpt_sub(
-			a2,
-			dc_fixpt_mul(
-				dc_fixpt_add(
-					one,
-					a3),
-				dc_fixpt_pow(
-					dc_fixpt_neg(arg),
-					dc_fixpt_recip(gamma))));
-	else if (dc_fixpt_le(a0, arg))
-		return dc_fixpt_sub(
-			dc_fixpt_mul(
-				dc_fixpt_add(
-					one,
-					a3),
-				dc_fixpt_pow(
-					arg,
-					dc_fixpt_recip(gamma))),
-			a2);
+	if (dc_fixpt_le(args->arg, dc_fixpt_neg(args->a0))) {
+		scratch_1 = dc_fixpt_add(one, args->a3);
+		scratch_2 = dc_fixpt_pow(
+				dc_fixpt_neg(args->arg),
+				dc_fixpt_recip(args->gamma));
+		scratch_1 = dc_fixpt_mul(scratch_1, scratch_2);
+		scratch_1 = dc_fixpt_sub(args->a2, scratch_1);
+
+		return scratch_1;
+	} else if (dc_fixpt_le(args->a0, args->arg)) {
+		if (pow_buffer_ptr == 0) {
+			gamma_of_2 = dc_fixpt_pow(dc_fixpt_from_int(2),
+					dc_fixpt_recip(args->gamma));
+		}
+		scratch_1 = dc_fixpt_add(one, args->a3);
+		if (pow_buffer_ptr < 16)
+			scratch_2 = dc_fixpt_pow(args->arg,
+					dc_fixpt_recip(args->gamma));
+		else
+			scratch_2 = dc_fixpt_mul(gamma_of_2,
+					pow_buffer[pow_buffer_ptr%16]);
+
+		pow_buffer[pow_buffer_ptr%16] = scratch_2;
+		pow_buffer_ptr++;
+
+		scratch_1 = dc_fixpt_mul(scratch_1, scratch_2);
+		scratch_1 = dc_fixpt_sub(scratch_1, args->a2);
+
+		return scratch_1;
+	}
 	else
-		return dc_fixpt_mul(
-			arg,
-			a1);
+		return dc_fixpt_mul(args->arg, args->a1);
 }
 
 static struct fixed31_32 calculate_gamma22(struct fixed31_32 arg)
 {
 	struct fixed31_32 gamma = dc_fixpt_from_fraction(22, 10);
 
-	return translate_from_linear_space(arg,
-			dc_fixpt_zero,
-			dc_fixpt_zero,
-			dc_fixpt_zero,
-			dc_fixpt_zero,
-			gamma);
+	scratch_gamma_args.arg = arg;
+	scratch_gamma_args.a0 = dc_fixpt_zero;
+	scratch_gamma_args.a1 = dc_fixpt_zero;
+	scratch_gamma_args.a2 = dc_fixpt_zero;
+	scratch_gamma_args.a3 = dc_fixpt_zero;
+	scratch_gamma_args.gamma = gamma;
+
+	return translate_from_linear_space(&scratch_gamma_args);
 }
 
 static struct fixed31_32 translate_to_linear_space(
@@ -362,18 +390,19 @@ static struct fixed31_32 translate_to_linear_space(
 	return linear;
 }
 
-static inline struct fixed31_32 translate_from_linear_space_ex(
+static struct fixed31_32 translate_from_linear_space_ex(
 	struct fixed31_32 arg,
 	struct gamma_coefficients *coeff,
 	uint32_t color_index)
 {
-	return translate_from_linear_space(
-		arg,
-		coeff->a0[color_index],
-		coeff->a1[color_index],
-		coeff->a2[color_index],
-		coeff->a3[color_index],
-		coeff->user_gamma[color_index]);
+	scratch_gamma_args.arg = arg;
+	scratch_gamma_args.a0 = coeff->a0[color_index];
+	scratch_gamma_args.a1 = coeff->a1[color_index];
+	scratch_gamma_args.a2 = coeff->a2[color_index];
+	scratch_gamma_args.a3 = coeff->a3[color_index];
+	scratch_gamma_args.gamma = coeff->user_gamma[color_index];
+
+	return translate_from_linear_space(&scratch_gamma_args);
 }
 
 
@@ -712,24 +741,32 @@ static void build_regamma(struct pwl_float_data_ex *rgb_regamma,
 {
 	uint32_t i;
 
-	struct gamma_coefficients coeff;
+	struct gamma_coefficients *coeff;
 	struct pwl_float_data_ex *rgb = rgb_regamma;
 	const struct hw_x_point *coord_x = coordinate_x;
 
-	build_coefficients(&coeff, type);
+	coeff = kvzalloc(sizeof(*coeff), GFP_KERNEL);
+	if (!coeff)
+		return;
 
-	i = 0;
+	build_coefficients(coeff, type);
 
-	while (i != hw_points_num + 1) {
+	memset(pow_buffer, 0, NUM_PTS_IN_REGION * sizeof(struct fixed31_32));
+	pow_buffer_ptr = 0; // see variable definition for more info
+	i = 0;
+	while (i <= hw_points_num) {
 		/*TODO use y vs r,g,b*/
 		rgb->r = translate_from_linear_space_ex(
-			coord_x->x, &coeff, 0);
+			coord_x->x, coeff, 0);
 		rgb->g = rgb->r;
 		rgb->b = rgb->r;
 		++coord_x;
 		++rgb;
 		++i;
 	}
+	pow_buffer_ptr = -1; // reset back to no optimize
+
+	kfree(coeff);
 }
 
 static void hermite_spline_eetf(struct fixed31_32 input_x,
@@ -859,6 +896,8 @@ static bool build_freesync_hdr(struct pwl_float_data_ex *rgb_regamma,
 	else
 		max_content = max_display;
 
+	if (!use_eetf)
+		pow_buffer_ptr = 0; // see var definition for more info
 	rgb += 32; // first 32 points have problems with fixed point, too small
 	coord_x += 32;
 	for (i = 32; i <= hw_points_num; i++) {
@@ -897,6 +936,7 @@ static bool build_freesync_hdr(struct pwl_float_data_ex *rgb_regamma,
 		++coord_x;
 		++rgb;
 	}
+	pow_buffer_ptr = -1;
 
 	return true;
 }
@@ -1569,14 +1609,15 @@ bool mod_color_calculate_regamma_params(struct dc_transfer_func *output_tf,
 			output_tf->tf == TRANSFER_FUNCTION_SRGB) {
 		if (ramp == NULL)
 			return true;
-		if ((ramp->is_logical_identity) ||
+		if ((ramp->is_identity && ramp->type != GAMMA_CS_TFM_1D) ||
 				(!mapUserRamp && ramp->type == GAMMA_RGB_256))
 			return true;
 	}
 
 	output_tf->type = TF_TYPE_DISTRIBUTED_POINTS;
 
-	if (ramp && (mapUserRamp || ramp->type != GAMMA_RGB_256)) {
+	if (ramp && ramp->type != GAMMA_CS_TFM_1D &&
+			(mapUserRamp || ramp->type != GAMMA_RGB_256)) {
 		rgb_user = kvcalloc(ramp->num_entries + _EXTRA_POINTS,
 			    sizeof(*rgb_user),
 			    GFP_KERNEL);
diff --git a/drivers/gpu/drm/amd/display/modules/color/color_gamma.h b/drivers/gpu/drm/amd/display/modules/color/color_gamma.h
index 369953fafadf..69cecd2ec251 100644
--- a/drivers/gpu/drm/amd/display/modules/color/color_gamma.h
+++ b/drivers/gpu/drm/amd/display/modules/color/color_gamma.h
@@ -82,6 +82,15 @@ struct freesync_hdr_tf_params {
 	unsigned int skip_tm; // skip tm
 };
 
+struct translate_from_linear_space_args {
+	struct fixed31_32 arg;
+	struct fixed31_32 a0;
+	struct fixed31_32 a1;
+	struct fixed31_32 a2;
+	struct fixed31_32 a3;
+	struct fixed31_32 gamma;
+};
+
 void setup_x_points_distribution(void);
 void precompute_pq(void);
 void precompute_de_pq(void);
-- 
2.17.1