diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3061-drm-amd-display-Optimize-gamma-calculations.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3061-drm-amd-display-Optimize-gamma-calculations.patch | 333 |
1 files changed, 333 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3061-drm-amd-display-Optimize-gamma-calculations.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3061-drm-amd-display-Optimize-gamma-calculations.patch new file mode 100644 index 00000000..61b986dd --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3061-drm-amd-display-Optimize-gamma-calculations.patch @@ -0,0 +1,333 @@ +From 9d796880fd196a54088a4ab097b441d5fadfba05 Mon Sep 17 00:00:00 2001 +From: Krunoslav Kovac <Krunoslav.Kovac@amd.com> +Date: Tue, 18 Jun 2019 17:38:43 -0400 +Subject: [PATCH 3061/4256] drm/amd/display: Optimize gamma calculations + +[Why&How] + +1. Stack usage is pretty high as fixed31_32 struct is 8 bytes and we +have functions with >30 vars on the stack. + +2. Optimize gamma calculation by reducing number of calls to +dc_fixpt_pow Our X points are divided into 32 regions wth 16 pts each. +Each region is 2x the previous, meaning x[i] = 2*x[i-16] for i>=16. +Using (2x)^gamma = 2^gamma * x^gamma, we can recursively compute powers +of gamma, we just need first 16 pts to start it up. dc_fixpt_pow() is +expensive, it computes x^y by doing exp(y*logx) Exp is done by Taylor +series approximation, and log by Newton-like approximation that also +uses exp internally. In short, it's significantly heavier than +run-of-the-mill addition/subtraction/multiply. + +Signed-off-by: Krunoslav Kovac <Krunoslav.Kovac@amd.com> +Reviewed-by: Anthony Koo <Anthony.Koo@amd.com> +Acked-by: Aric Cyr <Aric.Cyr@amd.com> +Acked-by: Leo Li <sunpeng.li@amd.com> +--- + drivers/gpu/drm/amd/display/dc/dc_hw_types.h | 1 - + .../amd/display/modules/color/color_gamma.c | 163 +++++++++++------- + .../amd/display/modules/color/color_gamma.h | 9 + + 3 files changed, 111 insertions(+), 62 deletions(-) + +diff --git a/drivers/gpu/drm/amd/display/dc/dc_hw_types.h b/drivers/gpu/drm/amd/display/dc/dc_hw_types.h +index 22db5682aa6c..e9a6225f4720 100644 +--- a/drivers/gpu/drm/amd/display/dc/dc_hw_types.h ++++ b/drivers/gpu/drm/amd/display/dc/dc_hw_types.h +@@ -482,7 +482,6 @@ struct dc_gamma { + * is_logical_identity indicates the given gamma ramp regardless of type is identity. + */ + bool is_identity; +- bool is_logical_identity; + }; + + /* Used by both ipp amd opp functions*/ +diff --git a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c +index 3f413fb9f2ce..294fe4f0cb67 100644 +--- a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c ++++ b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c +@@ -37,6 +37,33 @@ static struct hw_x_point coordinates_x[MAX_HW_POINTS + 2]; + static struct fixed31_32 pq_table[MAX_HW_POINTS + 2]; + static struct fixed31_32 de_pq_table[MAX_HW_POINTS + 2]; + ++// these are helpers for calculations to reduce stack usage ++// do not depend on these being preserved across calls ++static struct fixed31_32 scratch_1; ++static struct fixed31_32 scratch_2; ++static struct translate_from_linear_space_args scratch_gamma_args; ++ ++/* Helper to optimize gamma calculation, only use in translate_from_linear, in ++ * particular the dc_fixpt_pow function which is very expensive ++ * The idea is that our regions for X points are exponential and currently they all use ++ * the same number of points (NUM_PTS_IN_REGION) and in each region every point ++ * is exactly 2x the one at the same index in the previous region. In other words ++ * X[i] = 2 * X[i-NUM_PTS_IN_REGION] for i>=16 ++ * The other fact is that (2x)^gamma = 2^gamma * x^gamma ++ * So we compute and save x^gamma for the first 16 regions, and for every next region ++ * just multiply with 2^gamma which can be computed once, and save the result so we ++ * recursively compute all the values. ++ */ ++static struct fixed31_32 pow_buffer[NUM_PTS_IN_REGION]; ++static struct fixed31_32 gamma_of_2; // 2^gamma ++int pow_buffer_ptr = -1; ++ ++static const int32_t gamma_numerator01[] = { 31308, 180000, 0}; ++static const int32_t gamma_numerator02[] = { 12920, 4500, 0}; ++static const int32_t gamma_numerator03[] = { 55, 99, 0}; ++static const int32_t gamma_numerator04[] = { 55, 99, 0}; ++static const int32_t gamma_numerator05[] = { 2400, 2200, 2200}; ++ + static bool pq_initialized; /* = false; */ + static bool de_pq_initialized; /* = false; */ + +@@ -248,11 +275,7 @@ enum gamma_type_index { + + static void build_coefficients(struct gamma_coefficients *coefficients, enum gamma_type_index type) + { +- static const int32_t numerator01[] = { 31308, 180000, 0}; +- static const int32_t numerator02[] = { 12920, 4500, 0}; +- static const int32_t numerator03[] = { 55, 99, 0}; +- static const int32_t numerator04[] = { 55, 99, 0}; +- static const int32_t numerator05[] = { 2400, 2200, 2200}; ++ + + uint32_t i = 0; + uint32_t index = 0; +@@ -264,69 +287,74 @@ static void build_coefficients(struct gamma_coefficients *coefficients, enum gam + + do { + coefficients->a0[i] = dc_fixpt_from_fraction( +- numerator01[index], 10000000); ++ gamma_numerator01[index], 10000000); + coefficients->a1[i] = dc_fixpt_from_fraction( +- numerator02[index], 1000); ++ gamma_numerator02[index], 1000); + coefficients->a2[i] = dc_fixpt_from_fraction( +- numerator03[index], 1000); ++ gamma_numerator03[index], 1000); + coefficients->a3[i] = dc_fixpt_from_fraction( +- numerator04[index], 1000); ++ gamma_numerator04[index], 1000); + coefficients->user_gamma[i] = dc_fixpt_from_fraction( +- numerator05[index], 1000); ++ gamma_numerator05[index], 1000); + + ++i; + } while (i != ARRAY_SIZE(coefficients->a0)); + } + + static struct fixed31_32 translate_from_linear_space( +- struct fixed31_32 arg, +- struct fixed31_32 a0, +- struct fixed31_32 a1, +- struct fixed31_32 a2, +- struct fixed31_32 a3, +- struct fixed31_32 gamma) ++ struct translate_from_linear_space_args *args) + { + const struct fixed31_32 one = dc_fixpt_from_int(1); + +- if (dc_fixpt_lt(one, arg)) ++ if (dc_fixpt_le(one, args->arg)) + return one; + +- if (dc_fixpt_le(arg, dc_fixpt_neg(a0))) +- return dc_fixpt_sub( +- a2, +- dc_fixpt_mul( +- dc_fixpt_add( +- one, +- a3), +- dc_fixpt_pow( +- dc_fixpt_neg(arg), +- dc_fixpt_recip(gamma)))); +- else if (dc_fixpt_le(a0, arg)) +- return dc_fixpt_sub( +- dc_fixpt_mul( +- dc_fixpt_add( +- one, +- a3), +- dc_fixpt_pow( +- arg, +- dc_fixpt_recip(gamma))), +- a2); ++ if (dc_fixpt_le(args->arg, dc_fixpt_neg(args->a0))) { ++ scratch_1 = dc_fixpt_add(one, args->a3); ++ scratch_2 = dc_fixpt_pow( ++ dc_fixpt_neg(args->arg), ++ dc_fixpt_recip(args->gamma)); ++ scratch_1 = dc_fixpt_mul(scratch_1, scratch_2); ++ scratch_1 = dc_fixpt_sub(args->a2, scratch_1); ++ ++ return scratch_1; ++ } else if (dc_fixpt_le(args->a0, args->arg)) { ++ if (pow_buffer_ptr == 0) { ++ gamma_of_2 = dc_fixpt_pow(dc_fixpt_from_int(2), ++ dc_fixpt_recip(args->gamma)); ++ } ++ scratch_1 = dc_fixpt_add(one, args->a3); ++ if (pow_buffer_ptr < 16) ++ scratch_2 = dc_fixpt_pow(args->arg, ++ dc_fixpt_recip(args->gamma)); ++ else ++ scratch_2 = dc_fixpt_mul(gamma_of_2, ++ pow_buffer[pow_buffer_ptr%16]); ++ ++ pow_buffer[pow_buffer_ptr%16] = scratch_2; ++ pow_buffer_ptr++; ++ ++ scratch_1 = dc_fixpt_mul(scratch_1, scratch_2); ++ scratch_1 = dc_fixpt_sub(scratch_1, args->a2); ++ ++ return scratch_1; ++ } + else +- return dc_fixpt_mul( +- arg, +- a1); ++ return dc_fixpt_mul(args->arg, args->a1); + } + + static struct fixed31_32 calculate_gamma22(struct fixed31_32 arg) + { + struct fixed31_32 gamma = dc_fixpt_from_fraction(22, 10); + +- return translate_from_linear_space(arg, +- dc_fixpt_zero, +- dc_fixpt_zero, +- dc_fixpt_zero, +- dc_fixpt_zero, +- gamma); ++ scratch_gamma_args.arg = arg; ++ scratch_gamma_args.a0 = dc_fixpt_zero; ++ scratch_gamma_args.a1 = dc_fixpt_zero; ++ scratch_gamma_args.a2 = dc_fixpt_zero; ++ scratch_gamma_args.a3 = dc_fixpt_zero; ++ scratch_gamma_args.gamma = gamma; ++ ++ return translate_from_linear_space(&scratch_gamma_args); + } + + static struct fixed31_32 translate_to_linear_space( +@@ -362,18 +390,19 @@ static struct fixed31_32 translate_to_linear_space( + return linear; + } + +-static inline struct fixed31_32 translate_from_linear_space_ex( ++static struct fixed31_32 translate_from_linear_space_ex( + struct fixed31_32 arg, + struct gamma_coefficients *coeff, + uint32_t color_index) + { +- return translate_from_linear_space( +- arg, +- coeff->a0[color_index], +- coeff->a1[color_index], +- coeff->a2[color_index], +- coeff->a3[color_index], +- coeff->user_gamma[color_index]); ++ scratch_gamma_args.arg = arg; ++ scratch_gamma_args.a0 = coeff->a0[color_index]; ++ scratch_gamma_args.a1 = coeff->a1[color_index]; ++ scratch_gamma_args.a2 = coeff->a2[color_index]; ++ scratch_gamma_args.a3 = coeff->a3[color_index]; ++ scratch_gamma_args.gamma = coeff->user_gamma[color_index]; ++ ++ return translate_from_linear_space(&scratch_gamma_args); + } + + +@@ -712,24 +741,32 @@ static void build_regamma(struct pwl_float_data_ex *rgb_regamma, + { + uint32_t i; + +- struct gamma_coefficients coeff; ++ struct gamma_coefficients *coeff; + struct pwl_float_data_ex *rgb = rgb_regamma; + const struct hw_x_point *coord_x = coordinate_x; + +- build_coefficients(&coeff, type); ++ coeff = kvzalloc(sizeof(*coeff), GFP_KERNEL); ++ if (!coeff) ++ return; + +- i = 0; ++ build_coefficients(coeff, type); + +- while (i != hw_points_num + 1) { ++ memset(pow_buffer, 0, NUM_PTS_IN_REGION * sizeof(struct fixed31_32)); ++ pow_buffer_ptr = 0; // see variable definition for more info ++ i = 0; ++ while (i <= hw_points_num) { + /*TODO use y vs r,g,b*/ + rgb->r = translate_from_linear_space_ex( +- coord_x->x, &coeff, 0); ++ coord_x->x, coeff, 0); + rgb->g = rgb->r; + rgb->b = rgb->r; + ++coord_x; + ++rgb; + ++i; + } ++ pow_buffer_ptr = -1; // reset back to no optimize ++ ++ kfree(coeff); + } + + static void hermite_spline_eetf(struct fixed31_32 input_x, +@@ -859,6 +896,8 @@ static bool build_freesync_hdr(struct pwl_float_data_ex *rgb_regamma, + else + max_content = max_display; + ++ if (!use_eetf) ++ pow_buffer_ptr = 0; // see var definition for more info + rgb += 32; // first 32 points have problems with fixed point, too small + coord_x += 32; + for (i = 32; i <= hw_points_num; i++) { +@@ -897,6 +936,7 @@ static bool build_freesync_hdr(struct pwl_float_data_ex *rgb_regamma, + ++coord_x; + ++rgb; + } ++ pow_buffer_ptr = -1; + + return true; + } +@@ -1569,14 +1609,15 @@ bool mod_color_calculate_regamma_params(struct dc_transfer_func *output_tf, + output_tf->tf == TRANSFER_FUNCTION_SRGB) { + if (ramp == NULL) + return true; +- if ((ramp->is_logical_identity) || ++ if ((ramp->is_identity && ramp->type != GAMMA_CS_TFM_1D) || + (!mapUserRamp && ramp->type == GAMMA_RGB_256)) + return true; + } + + output_tf->type = TF_TYPE_DISTRIBUTED_POINTS; + +- if (ramp && (mapUserRamp || ramp->type != GAMMA_RGB_256)) { ++ if (ramp && ramp->type != GAMMA_CS_TFM_1D && ++ (mapUserRamp || ramp->type != GAMMA_RGB_256)) { + rgb_user = kvcalloc(ramp->num_entries + _EXTRA_POINTS, + sizeof(*rgb_user), + GFP_KERNEL); +diff --git a/drivers/gpu/drm/amd/display/modules/color/color_gamma.h b/drivers/gpu/drm/amd/display/modules/color/color_gamma.h +index 369953fafadf..69cecd2ec251 100644 +--- a/drivers/gpu/drm/amd/display/modules/color/color_gamma.h ++++ b/drivers/gpu/drm/amd/display/modules/color/color_gamma.h +@@ -82,6 +82,15 @@ struct freesync_hdr_tf_params { + unsigned int skip_tm; // skip tm + }; + ++struct translate_from_linear_space_args { ++ struct fixed31_32 arg; ++ struct fixed31_32 a0; ++ struct fixed31_32 a1; ++ struct fixed31_32 a2; ++ struct fixed31_32 a3; ++ struct fixed31_32 gamma; ++}; ++ + void setup_x_points_distribution(void); + void precompute_pq(void); + void precompute_de_pq(void); +-- +2.17.1 + |