diff options
Diffstat (limited to 'scripts/lib/resulttool/regression.py')
-rw-r--r-- | scripts/lib/resulttool/regression.py | 281 |
1 files changed, 271 insertions, 10 deletions
diff --git a/scripts/lib/resulttool/regression.py b/scripts/lib/resulttool/regression.py
index 9f952951b3..10e7d13841 100644
--- a/scripts/lib/resulttool/regression.py
+++ b/scripts/lib/resulttool/regression.py
@@ -7,15 +7,209 @@
 #
 
 import resulttool.resultutils as resultutils
-import json
 
 from oeqa.utils.git import GitRepo
 import oeqa.utils.gitarchive as gitarchive
 
-def compare_result(logger, base_name, target_name, base_result, target_result):
+METADATA_MATCH_TABLE = {
+    "oeselftest": "OESELFTEST_METADATA"
+}
+
+OESELFTEST_METADATA_GUESS_TABLE={
+    "trigger-build-posttrigger": {
+        "run_all_tests": False,
+        "run_tests":["buildoptions.SourceMirroring.test_yocto_source_mirror"],
+        "skips": None,
+        "machine": None,
+        "select_tags":None,
+        "exclude_tags": None
+    },
+    "reproducible": {
+        "run_all_tests": False,
+        "run_tests":["reproducible"],
+        "skips": None,
+        "machine": None,
+        "select_tags":None,
+        "exclude_tags": None
+    },
+    "arch-qemu-quick": {
+        "run_all_tests": True,
+        "run_tests":None,
+        "skips": None,
+        "machine": None,
+        "select_tags":["machine"],
+        "exclude_tags": None
+    },
+    "arch-qemu-full-x86-or-x86_64": {
+        "run_all_tests": True,
+        "run_tests":None,
+        "skips": None,
+        "machine": None,
+        "select_tags":["machine", "toolchain-system"],
+        "exclude_tags": None
+    },
+    "arch-qemu-full-others": {
+        "run_all_tests": True,
+        "run_tests":None,
+        "skips": None,
+        "machine": None,
+        "select_tags":["machine", "toolchain-user"],
+        "exclude_tags": None
+    },
+    "selftest": {
+        "run_all_tests": True,
+        "run_tests":None,
+        "skips": ["distrodata.Distrodata.test_checkpkg", "buildoptions.SourceMirroring.test_yocto_source_mirror", "reproducible"],
+        "machine": None,
+        "select_tags":None,
+        "exclude_tags": ["machine", "toolchain-system", "toolchain-user"]
+    },
+    "bringup": {
+        "run_all_tests": True,
+        "run_tests":None,
+        "skips": ["distrodata.Distrodata.test_checkpkg", "buildoptions.SourceMirroring.test_yocto_source_mirror"],
+        "machine": None,
+        "select_tags":None,
+        "exclude_tags": ["machine", "toolchain-system", "toolchain-user"]
+    }
+}
+
+STATUS_STRINGS = {
+    "None": "No matching test result"
+}
+
+REGRESSIONS_DISPLAY_LIMIT=50
+
+MISSING_TESTS_BANNER = "-------------------------- Missing tests --------------------------"
+ADDITIONAL_DATA_BANNER = "--------------------- Matches and improvements --------------------"
+
+def test_has_at_least_one_matching_tag(test, tag_list):
+    return "oetags" in test and any(oetag in tag_list for oetag in test["oetags"])
+
+def all_tests_have_at_least_one_matching_tag(results, tag_list):
+    return all(test_has_at_least_one_matching_tag(test_result, tag_list) or test_name.startswith("ptestresult") for (test_name, test_result) in results.items())
+
+def any_test_have_any_matching_tag(results, tag_list):
+    return any(test_has_at_least_one_matching_tag(test, tag_list) for test in results.values())
+
+def have_skipped_test(result, test_prefix):
+    return all( result[test]['status'] == "SKIPPED" for test in result if test.startswith(test_prefix))
+
+def have_all_tests_skipped(result, test_prefixes_list):
+    return all(have_skipped_test(result, test_prefix) for test_prefix in test_prefixes_list)
+
+def guess_oeselftest_metadata(results):
+    """
+    When an oeselftest test result is lacking OESELFTEST_METADATA, we can try to guess it based on results content.
+    Check results for specific values (absence/presence of oetags, number and name of executed tests...),
+    and if it matches one of known configuration from autobuilder configuration, apply guessed OSELFTEST_METADATA
+    to it to allow proper test filtering.
+    This guessing process is tightly coupled to config.json in autobuilder. It should trigger less and less,
+    as new tests will have OESELFTEST_METADATA properly appended at test reporting time
+    """
+
+    if len(results) == 1 and "buildoptions.SourceMirroring.test_yocto_source_mirror" in results:
+        return OESELFTEST_METADATA_GUESS_TABLE['trigger-build-posttrigger']
+    elif all(result.startswith("reproducible") for result in results):
+        return OESELFTEST_METADATA_GUESS_TABLE['reproducible']
+    elif all_tests_have_at_least_one_matching_tag(results, ["machine"]):
+        return OESELFTEST_METADATA_GUESS_TABLE['arch-qemu-quick']
+    elif all_tests_have_at_least_one_matching_tag(results, ["machine", "toolchain-system"]):
+        return OESELFTEST_METADATA_GUESS_TABLE['arch-qemu-full-x86-or-x86_64']
+    elif all_tests_have_at_least_one_matching_tag(results, ["machine", "toolchain-user"]):
+        return OESELFTEST_METADATA_GUESS_TABLE['arch-qemu-full-others']
+    elif not any_test_have_any_matching_tag(results, ["machine", "toolchain-user", "toolchain-system"]):
+        if have_all_tests_skipped(results, ["distrodata.Distrodata.test_checkpkg", "buildoptions.SourceMirroring.test_yocto_source_mirror", "reproducible"]):
+            return OESELFTEST_METADATA_GUESS_TABLE['selftest']
+        elif have_all_tests_skipped(results, ["distrodata.Distrodata.test_checkpkg", "buildoptions.SourceMirroring.test_yocto_source_mirror"]):
+            return OESELFTEST_METADATA_GUESS_TABLE['bringup']
+
+    return None
+
+
+def metadata_matches(base_configuration, target_configuration):
+    """
+    For passed base and target, check test type. If test type matches one of
+    properties described in METADATA_MATCH_TABLE, compare metadata if it is
+    present in base. Return true if metadata matches, or if base lacks some
+    data (either TEST_TYPE or the corresponding metadata)
+    """
+    test_type = base_configuration.get('TEST_TYPE')
+    if test_type not in METADATA_MATCH_TABLE:
+        return True
+
+    metadata_key = METADATA_MATCH_TABLE.get(test_type)
+    if target_configuration.get(metadata_key) != base_configuration.get(metadata_key):
+        return False
+
+    return True
+
+
+def machine_matches(base_configuration, target_configuration):
+    return base_configuration.get('MACHINE') == target_configuration.get('MACHINE')
+
+
+def can_be_compared(logger, base, target):
+    """
+    Some tests are not relevant to be compared, for example some oeselftest
+    run with different tests sets or parameters. Return true if tests can be
+    compared
+    """
+    ret = True
+    base_configuration = base['configuration']
+    target_configuration = target['configuration']
+
+    # Older test results lack proper OESELFTEST_METADATA: if not present, try to guess it based on tests results.
+    if base_configuration.get('TEST_TYPE') == 'oeselftest' and 'OESELFTEST_METADATA' not in base_configuration:
+        guess = guess_oeselftest_metadata(base['result'])
+        if guess is None:
+            logger.error(f"ERROR: did not manage to guess oeselftest metadata for {base_configuration['STARTTIME']}")
+        else:
+            logger.debug(f"Enriching {base_configuration['STARTTIME']} with {guess}")
+            base_configuration['OESELFTEST_METADATA'] = guess
+    if target_configuration.get('TEST_TYPE') == 'oeselftest' and 'OESELFTEST_METADATA' not in target_configuration:
+        guess = guess_oeselftest_metadata(target['result'])
+        if guess is None:
+            logger.error(f"ERROR: did not manage to guess oeselftest metadata for {target_configuration['STARTTIME']}")
+        else:
+            logger.debug(f"Enriching {target_configuration['STARTTIME']} with {guess}")
+            target_configuration['OESELFTEST_METADATA'] = guess
+
+    # Test runs with LTP results in should only be compared with other runs with LTP tests in them
+    if base_configuration.get('TEST_TYPE') == 'runtime' and any(result.startswith("ltpresult") for result in base['result']):
+        ret = target_configuration.get('TEST_TYPE') == 'runtime' and any(result.startswith("ltpresult") for result in target['result'])
+
+    return ret and metadata_matches(base_configuration, target_configuration) \
+        and machine_matches(base_configuration, target_configuration)
+
+def get_status_str(raw_status):
+    raw_status_lower = raw_status.lower() if raw_status else "None"
+    return STATUS_STRINGS.get(raw_status_lower, raw_status)
+
+def get_additional_info_line(new_pass_count, new_tests):
+    result=[]
+    if new_tests:
+        result.append(f'+{new_tests} test(s) present')
+    if new_pass_count:
+        result.append(f'+{new_pass_count} test(s) now passing')
+
+    if not result:
+        return ""
+
+    return ' -> ' + ', '.join(result) + '\n'
+
+def compare_result(logger, base_name, target_name, base_result, target_result, display_limit=None):
     base_result = base_result.get('result')
     target_result = target_result.get('result')
     result = {}
+    new_tests = 0
+    regressions = {}
+    resultstring = ""
+    new_tests = 0
+    new_pass_count = 0
+
+    display_limit = int(display_limit) if display_limit else REGRESSIONS_DISPLAY_LIMIT
+
     if base_result and target_result:
         for k in base_result:
             base_testcase = base_result[k]
@@ -27,12 +221,47 @@ def compare_result(logger, base_name, target_name, base_result, target_result):
                     result[k] = {'base': base_status, 'target': target_status}
             else:
                 logger.error('Failed to retrieved base test case status: %s' % k)
+
+        # Also count new tests that were not present in base results: it
+        # could be newly added tests, but it could also highlights some tests
+        # renames or fixed faulty ptests
+        for k in target_result:
+            if k not in base_result:
+                new_tests += 1
     if result:
-        resultstring = "Regression: %s\n %s\n" % (base_name, target_name)
-        for k in sorted(result):
-            resultstring += ' %s: %s -> %s\n' % (k, result[k]['base'], result[k]['target'])
+        new_pass_count = sum(test['target'] is not None and test['target'].startswith("PASS") for test in result.values())
+        # Print a regression report only if at least one test has a regression status (FAIL, SKIPPED, absent...)
+        if new_pass_count < len(result):
+            resultstring = "Regression: %s\n %s\n" % (base_name, target_name)
+            for k in sorted(result):
+                if not result[k]['target'] or not result[k]['target'].startswith("PASS"):
+                    # Differentiate each ptest kind when listing regressions
+                    key_parts = k.split('.')
+                    key = '.'.join(key_parts[:2]) if k.startswith('ptest') else key_parts[0]
+                    # Append new regression to corresponding test family
+                    regressions[key] = regressions.setdefault(key, []) + [' %s: %s -> %s\n' % (k, get_status_str(result[k]['base']), get_status_str(result[k]['target']))]
+            resultstring += f" Total: {sum([len(regressions[r]) for r in regressions])} new regression(s):\n"
+            for k in regressions:
+                resultstring += f" {len(regressions[k])} regression(s) for {k}\n"
+                count_to_print=min([display_limit, len(regressions[k])]) if display_limit > 0 else len(regressions[k])
+                resultstring += ''.join(regressions[k][:count_to_print])
+                if count_to_print < len(regressions[k]):
+                    resultstring+=' [...]\n'
+            if new_pass_count > 0:
+                resultstring += f' Additionally, {new_pass_count} previously failing test(s) is/are now passing\n'
+            if new_tests > 0:
+                resultstring += f' Additionally, {new_tests} new test(s) is/are present\n'
+        else:
+            resultstring = "%s\n%s\n" % (base_name, target_name)
+            result = None
     else:
-        resultstring = "Match: %s\n %s" % (base_name, target_name)
+        resultstring = "%s\n%s\n" % (base_name, target_name)
+
+    if not result:
+        additional_info = get_additional_info_line(new_pass_count, new_tests)
+        if additional_info:
+            resultstring += additional_info
+
     return result, resultstring
 
 def get_results(logger, source):
@@ -44,12 +273,38 @@ def regression(args, logger):
 
     regression_common(args, logger, base_results, target_results)
 
+# Some test case naming is poor and contains random strings, particularly lttng/babeltrace.
+# Truncating the test names works since they contain file and line number identifiers
+# which allows us to match them without the random components.
+def fixup_ptest_names(results, logger):
+    for r in results:
+        for i in results[r]:
+            tests = list(results[r][i]['result'].keys())
+            for test in tests:
+                new = None
+                if test.startswith(("ptestresult.lttng-tools.", "ptestresult.babeltrace.", "ptestresult.babeltrace2")) and "_-_" in test:
+                    new = test.split("_-_")[0]
+                elif test.startswith(("ptestresult.curl.")) and "__" in test:
+                    new = test.split("__")[0]
+                elif test.startswith(("ptestresult.dbus.")) and "__" in test:
+                    new = test.split("__")[0]
+                elif test.startswith("ptestresult.binutils") and "build-st-" in test:
+                    new = test.split(" ")[0]
+                elif test.startswith("ptestresult.gcc") and "/tmp/runtest." in test:
+                    new = ".".join(test.split(".")[:2])
+                if new:
+                    results[r][i]['result'][new] = results[r][i]['result'][test]
+                    del results[r][i]['result'][test]
+
 def regression_common(args, logger, base_results, target_results):
     if args.base_result_id:
         base_results = resultutils.filter_resultsdata(base_results, args.base_result_id)
     if args.target_result_id:
         target_results = resultutils.filter_resultsdata(target_results, args.target_result_id)
 
+    fixup_ptest_names(base_results, logger)
+    fixup_ptest_names(target_results, logger)
+
     matches = []
     regressions = []
     notfound = []
@@ -62,7 +317,9 @@ def regression_common(args, logger, base_results, target_results):
             # removing any pairs which match
             for c in base.copy():
                 for b in target.copy():
-                    res, resstr = compare_result(logger, c, b, base_results[a][c], target_results[a][b])
+                    if not can_be_compared(logger, base_results[a][c], target_results[a][b]):
+                        continue
+                    res, resstr = compare_result(logger, c, b, base_results[a][c], target_results[a][b], args.limit)
                     if not res:
                         matches.append(resstr)
                         base.remove(c)
@@ -71,15 +328,18 @@ def regression_common(args, logger, base_results, target_results):
             # Should only now see regressions, we may not be able to match multiple pairs directly
             for c in base:
                 for b in target:
-                    res, resstr = compare_result(logger, c, b, base_results[a][c], target_results[a][b])
+                    if not can_be_compared(logger, base_results[a][c], target_results[a][b]):
+                        continue
+                    res, resstr = compare_result(logger, c, b, base_results[a][c], target_results[a][b], args.limit)
                     if res:
                         regressions.append(resstr)
         else:
             notfound.append("%s not found in target" % a)
-    print("\n".join(sorted(matches)))
     print("\n".join(sorted(regressions)))
+    print("\n" + MISSING_TESTS_BANNER + "\n")
     print("\n".join(sorted(notfound)))
-
+    print("\n" + ADDITIONAL_DATA_BANNER + "\n")
+    print("\n".join(sorted(matches)))
     return 0
 
 def regression_git(args, logger):
@@ -183,4 +443,5 @@ def register_commands(subparsers):
     parser_build.add_argument('--commit-number', help="Revision number to search for, redundant if --commit is specified")
     parser_build.add_argument('--commit2', help="Revision to compare with")
     parser_build.add_argument('--commit-number2', help="Revision number to compare with, redundant if --commit2 is specified")
+    parser_build.add_argument('-l', '--limit', default=REGRESSIONS_DISPLAY_LIMIT, help="Maximum number of changes to display per test. Can be set to 0 to print all changes")
 