Bug 1363104 - Fix perf-reftest to compare perf numbers of basic vs ref pages; r=jmaher
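
The comparison pairs each 'base' replicate with the corresponding 'ref'
replicate and reports the absolute difference of each pair as the suite's
replicates. A minimal sketch of the idea (compare_runs is an illustrative
helper, not part of this patch; the real logic lives in
make_comparison_result() in run_tests.py):

    # pair base and reference replicates and keep the absolute
    # difference of each pair, rounded as in make_comparison_result()
    def compare_runs(base_runs, ref_runs):
        return [round(abs(ref - base), 3)
                for base, ref in zip(base_runs, ref_runs)]

    # e.g. compare_runs([586.52], [603.225]) -> [16.705]
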
MozReview-Commit-ID: JMtaa9I0atY
--- a/testing/talos/talos.json
+++ b/testing/talos/talos.json
@@ -77,20 +77,20 @@
"svgr": {
"tests": ["tsvgx", "tsvgr_opacity", "tart", "tscrollx", "cart", "tsvg_static"],
"talos_options": ["--disable-e10s"]
},
"svgr-e10s": {
"tests": ["tsvgx", "tsvgr_opacity", "tart", "tscrollx", "cart", "tsvg_static"]
},
"perf-reftest": {
- "tests": ["bloom_basic", "bloom_basic_ref"]
+ "tests": ["bloom_basic"]
},
"perf-reftest-e10s": {
- "tests": ["bloom_basic", "bloom_basic_ref"]
+ "tests": ["bloom_basic"]
},
"tp5o": {
"tests": ["tp5o"],
"pagesets_name": "tp5n.zip",
"talos_options": ["--disable-e10s"]
},
"tp5o-e10s": {
"tests": ["tp5o"],
--- a/testing/talos/talos/config.py
+++ b/testing/talos/talos/config.py
@@ -37,16 +37,17 @@ DEFAULTS = dict(
shutdown=False,
timeout=3600,
tpchrome=True,
tpcycles=10,
tpmozafterpaint=False,
firstpaint=False,
userready=False,
testeventmap=[],
+ base_vs_ref=False,
tpdisable_e10s=False,
tpnoisy=True,
tppagecycles=1,
tploadnocache=False,
tpscrolltest=False,
tprender=False,
win_counters=[],
w7_counters=[],
--- a/testing/talos/talos/output.py
+++ b/testing/talos/talos/output.py
@@ -63,18 +63,17 @@ class Output(object):
'extraOptions': self.results.extra_options or [],
'subtests': subtests
}
suites.append(suite)
vals = []
replicates = {}
- # TODO: counters!!!! we don't have any, but they suffer the
- # same
+ # TODO: counters!!!! we don't have any, but they suffer the same
for result in test.results:
# XXX this will not work for manifests which list
# the same page name twice. It also ignores cycles
for page, val in result.raw_values():
if page == 'NULL':
page = test.name()
if tsresult is None:
tsresult = r = TalosResults.Results()
@@ -97,16 +96,24 @@ class Output(object):
if page == 'NULL':
# no real subtests
page = test.name()
subtest = {
'name': page,
'value': val['filtered'],
'replicates': replicates[page],
}
+ # if results are from a comparison test, i.e. perf-reftest, they will also
+ # contain replicates for 'base' and 'reference'; we keep those for reference,
+ # since the actual results were calculated as the difference between them
+ base_runs = result.results[0].get('base_runs', None)
+ ref_runs = result.results[0].get('ref_runs', None)
+ if base_runs and ref_runs:
+ subtest['base_replicates'] = base_runs
+ subtest['ref_replicates'] = ref_runs
subtests.append(subtest)
if test.test_config.get('lower_is_better') is not None:
subtest['lowerIsBetter'] = \
test.test_config['lower_is_better']
if test.test_config.get('alert_threshold') is not None:
subtest['alertThreshold'] = \
test.test_config['alert_threshold']
if test.test_config.get('unit'):
--- a/testing/talos/talos/run_tests.py
+++ b/testing/talos/talos/run_tests.py
@@ -88,17 +88,16 @@ def setup_webserver(webserver):
def run_tests(config, browser_config):
"""Runs the talos tests on the given configuration and generates a report.
"""
# get the test data
tests = config['tests']
tests = useBaseTestDefaults(config.get('basetest', {}), tests)
paths = ['profile_path', 'tpmanifest', 'extensions', 'setup', 'cleanup']
for test in tests:
-
# Check for profile_path, tpmanifest and interpolate based on Talos
# root https://bugzilla.mozilla.org/show_bug.cgi?id=727711
# Build command line from config
for path in paths:
if test.get(path):
test[path] = utils.interpolate(test[path])
if test.get('tpmanifest'):
test['tpmanifest'] = \
@@ -250,20 +249,28 @@ def run_tests(config, browser_config):
# parse out the multi-value results, and 'fake it' to appear like separate tests
separate_results_list = convert_to_separate_test_results(multi_value_result,
test_event_map)
# now we have three separate test results, store them
for test_result in separate_results_list:
talos_results.add(test_result)
+
+ # some tests, like bloom_basic, run two separate pages and then compare their values;
+ # we want the results reported to perfherder to be only the difference between the two,
+ # and we store the base and reference replicates in results.json for upload
+ elif test.get('base_vs_ref', False):
+ # run the test; results will be reported for each page, as if the suite contained two tests
+ base_and_reference_results = mytest.runTest(browser_config, test)
+ # now compare each test, and create a new test object for the comparison
+ talos_results.add(make_comparison_result(base_and_reference_results))
else:
# just expecting regular test - one result value per iteration
talos_results.add(mytest.runTest(browser_config, test))
-
LOG.test_end(testname, status='OK')
except TalosRegression as exc:
LOG.error("Detected a regression for %s" % testname)
# by returning 1, we report an orange to buildbot
# http://docs.buildbot.net/latest/developer/results.html
LOG.test_end(testname, status='FAIL', message=str(exc),
stack=traceback.format_exc())
@@ -293,16 +300,66 @@ def run_tests(config, browser_config):
print("Thanks for running Talos locally. Results are in %s"
% (results_urls['output_urls']))
# we will stop running tests on a failed test, or we will return 0 for
# green
return 0
+
+
+def make_comparison_result(base_and_reference_results):
+ ''' Receive a test result object meant to be used as a base vs reference test. The result
+ object will have one test with two subtests; instead of traditional subtests we want to
+ treat them as separate tests, compare them against each other, and report the comparison results.
+
+ Results with multiple pages used as subtests would look like this normally, with the overall
+ result value being the mean of the pages/subtests:
+
+ PERFHERDER_DATA: {"framework": {"name": "talos"}, "suites": [{"extraOptions": ["e10s"],
+ "name": "bloom_basic", "lowerIsBetter": true, "alertThreshold": 5.0, "value": 594.81,
+ "subtests": [{"name": ".html", "lowerIsBetter": true, "alertThreshold": 5.0, "replicates":
+ [586.52, ...], "value": 586.52, "unit": "ms"}, {"name": "-ref.html", "lowerIsBetter": true,
+ "alertThreshold": 5.0, "replicates": [603.225, ...], "value": 603.225, "unit": "ms"}]}]}
+
+ We want to compare the subtests against each other (base vs ref) and create a new single-test
+ results object holding the comparison results, which will look like traditional single-test
+ results, like this:
+
+ PERFHERDER_DATA: {"framework": {"name": "talos"}, "suites": [{"lowerIsBetter": true,
+ "subtests": [{"name": "", "lowerIsBetter": true, "alertThreshold": 5.0, "replicates":
+ [16.705, ...], "value": 16.705, "unit": "ms"}], "extraOptions": ["e10s"], "name":
+ "bloom_basic", "alertThreshold": 5.0}]}
+ '''
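+ # note: copy.deepcopy below requires the stdlib 'copy' module to be
+ # imported at the top of run_tests.py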
+ # separate the 'base' and 'reference' result run values
+ base_result_runs = base_and_reference_results.results[0].results[0]['runs']
+ ref_result_runs = base_and_reference_results.results[0].results[1]['runs']
+
+ # create a new results object for the comparison result; keep replicates from both pages
+ comparison_result = copy.deepcopy(base_and_reference_results)
+
+ # remove original results from our copy as they will be replaced by one comparison result
+ comparison_result.results[0].results = []
+
+ # populate our new comparison result with 'base' and 'ref' replicates
+ comparison_result.results[0].results.append({'index': 0,
+                                              'runs': [],
+                                              'page': '',
+                                              'base_runs': base_result_runs,
+                                              'ref_runs': ref_result_runs})
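+ # 'page' is left empty so the comparison appears as a single subtest with
+ # name '' in PERFHERDER_DATA (see the docstring example above)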
+
+ # now step through each pair of values, compare 'base' vs 'ref', and store the
+ # absolute difference in 'runs'
+ new_result = comparison_result.results[0].results[0]
+ for base_val, ref_val in zip(new_result['base_runs'], new_result['ref_runs']):
+     new_result['runs'].append(round(abs(ref_val - base_val), 3))
+
+ return comparison_result
+
+
def convert_to_separate_test_results(multi_value_result, test_event_map):
''' Receive a test result that actually contains multiple values in a single iteration, and
parse it out in order to 'fake' three separate test results.
Incoming result looks like this:
[{'index': 0, 'runs': {'event_1': [1338, ...], 'event_2': [1438, ...], 'event_3':
[1538, ...]}, 'page': 'NULL'}]
--- a/testing/talos/talos/test.py
+++ b/testing/talos/talos/test.py
@@ -102,16 +102,17 @@ class TsBase(Test):
'xperf_counters',
'xperf_providers',
'xperf_user_providers',
'xperf_stackwalk',
'tpmozafterpaint',
'firstpaint',
'userready',
'testeventmap',
+ 'base_vs_ref',
'extensions',
'filters',
'setup',
'cleanup',
'webextensions',
'reinstall', # A list of files from the profile directory that
# should be copied to the temporary profile prior to
# running each cycle, to avoid one cycle overwriting
@@ -234,17 +235,17 @@ class tresize(TsBase):
class PageloaderTest(Test):
"""abstract base class for a Talos Pageloader test"""
tpmanifest = None # test manifest
tpcycles = 1 # number of time to run each page
cycles = None
timeout = None
keys = ['tpmanifest', 'tpcycles', 'tppagecycles', 'tprender', 'tpchrome',
'tpmozafterpaint', 'tploadnocache', 'firstpaint', 'userready',
- 'testeventmap', 'rss', 'mainthread', 'resolution', 'cycles',
+ 'testeventmap', 'base_vs_ref', 'rss', 'mainthread', 'resolution', 'cycles',
'gecko_profile', 'gecko_profile_interval', 'gecko_profile_entries',
'tptimeout', 'win_counters', 'w7_counters', 'linux_counters', 'mac_counters',
'tpscrolltest', 'xperf_counters', 'timeout', 'shutdown', 'responsiveness',
'profile_path', 'xperf_providers', 'xperf_user_providers', 'xperf_stackwalk',
'filters', 'preferences', 'extensions', 'setup', 'cleanup',
'lower_is_better', 'alert_threshold', 'unit', 'webextensions']
@@ -787,46 +788,31 @@ class a11yr(PageloaderTest):
preferences = {'dom.send_after_paint_to_content': False}
unit = 'ms'
alert_threshold = 5.0


@register_test()
class bloom_basic(PageloaderTest):
"""
- Stylo bloom_basic test
+ Stylo bloom_basic: runs bloom_basic and bloom_basic_ref and reports the difference
"""
+ base_vs_ref = True  # compare the two test pages with each other and report the comparison
tpmanifest = '${talos}/tests/perf-reftest/bloom_basic.manifest'
tpcycles = 1
tppagecycles = 25
gecko_profile_interval = 1
gecko_profile_entries = 2000000
filters = filter.ignore_first.prepare(5) + filter.median.prepare()
unit = 'ms'
lower_is_better = True
alert_threshold = 5.0


@register_test()
-class bloom_basic_ref(PageloaderTest):
- """
- Stylo bloom_basic_ref test
- """
- tpmanifest = '${talos}/tests/perf-reftest/bloom_basic_ref.manifest'
- tpcycles = 1
- tppagecycles = 25
- gecko_profile_interval = 1
- gecko_profile_entries = 2000000
- filters = filter.ignore_first.prepare(5) + filter.median.prepare()
- unit = 'ms'
- lower_is_better = True
- alert_threshold = 5.0
-
-
-@register_test()
class quantum_pageload_google(QuantumPageloadTest):
"""
Quantum Pageload Test - Google
"""
tpmanifest = '${talos}/tests/quantum_pageload/quantum_pageload_google.manifest'


@register_test()
--- a/testing/talos/talos/tests/perf-reftest/bloom_basic.manifest
+++ b/testing/talos/talos/tests/perf-reftest/bloom_basic.manifest
@@ -1,1 +1,4 @@
+# base_vs_ref is set in test.py for this test, so each of these pages is run as a separate
+# test, but then compared against the other; the reported results are the comparison
% http://localhost/tests/perf-reftest/bloom-basic.html
+% http://localhost/tests/perf-reftest/bloom-basic-ref.html