Bug 1363104 - Fix perf-reftest to compare perf numbers of basic vs ref pages; r=jmaher draft
author Rob Wood <rwood@mozilla.com>
Thu, 29 Jun 2017 18:18:45 -0400
changeset 602684 c0bee15606940ab8fe0df544a8fc6b24c988803f
parent 601236 306d2070e105b75f7076d30cc0288171f1435c07
child 635654 505f5adda9e496d377683568d000c9f84ee20695
push id 66490
push user rwood@mozilla.com
push date Fri, 30 Jun 2017 14:42:55 +0000
reviewers jmaher
bugs 1363104
milestone 56.0a1
Bug 1363104 - Fix perf-reftest to compare perf numbers of basic vs ref pages; r=jmaher MozReview-Commit-ID: JMtaa9I0atY
testing/talos/talos.json
testing/talos/talos/config.py
testing/talos/talos/output.py
testing/talos/talos/run_tests.py
testing/talos/talos/test.py
testing/talos/talos/tests/perf-reftest/bloom_basic.manifest
--- a/testing/talos/talos.json
+++ b/testing/talos/talos.json
@@ -77,20 +77,20 @@
         "svgr": {
             "tests": ["tsvgx", "tsvgr_opacity", "tart", "tscrollx", "cart", "tsvg_static"],
             "talos_options": ["--disable-e10s"]
         },
         "svgr-e10s": {
             "tests": ["tsvgx", "tsvgr_opacity", "tart", "tscrollx", "cart", "tsvg_static"]
         },
         "perf-reftest": {
-            "tests": ["bloom_basic", "bloom_basic_ref"]
+            "tests": ["bloom_basic"]
         },
         "perf-reftest-e10s": {
-            "tests": ["bloom_basic", "bloom_basic_ref"]
+            "tests": ["bloom_basic"]
         },
         "tp5o": {
             "tests": ["tp5o"],
             "pagesets_name": "tp5n.zip",
             "talos_options": ["--disable-e10s"]
         },
         "tp5o-e10s": {
             "tests": ["tp5o"],
--- a/testing/talos/talos/config.py
+++ b/testing/talos/talos/config.py
@@ -37,16 +37,17 @@ DEFAULTS = dict(
         shutdown=False,
         timeout=3600,
         tpchrome=True,
         tpcycles=10,
         tpmozafterpaint=False,
         firstpaint=False,
         userready=False,
         testeventmap=[],
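+        # when True, run the two manifest pages as base vs ref and report their difference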
+        base_vs_ref=False,
         tpdisable_e10s=False,
         tpnoisy=True,
         tppagecycles=1,
         tploadnocache=False,
         tpscrolltest=False,
         tprender=False,
         win_counters=[],
         w7_counters=[],
--- a/testing/talos/talos/output.py
+++ b/testing/talos/talos/output.py
@@ -63,18 +63,17 @@ class Output(object):
                     'extraOptions': self.results.extra_options or [],
                     'subtests': subtests
                 }
 
                 suites.append(suite)
                 vals = []
                 replicates = {}
 
-                # TODO: counters!!!! we don't have any, but they suffer the
-                # same
+                # TODO: counters!!!! we don't have any, but they suffer the same
                 for result in test.results:
                     # XXX this will not work for manifests which list
                     # the same page name twice. It also ignores cycles
                     for page, val in result.raw_values():
                         if page == 'NULL':
                             page = test.name()
                             if tsresult is None:
                                 tsresult = r = TalosResults.Results()
@@ -97,16 +96,24 @@ class Output(object):
                         if page == 'NULL':
                             # no real subtests
                             page = test.name()
                         subtest = {
                             'name': page,
                             'value': val['filtered'],
                             'replicates': replicates[page],
                         }
+                        # if results are from a comparison test (i.e. perf-reftest) they will also
+                        # contain replicates for 'base' and 'reference'; we want to keep those for
+                        # reference, since the actual results were calculated as their difference
+                        base_runs = result.results[0].get('base_runs', None)
+                        ref_runs = result.results[0].get('ref_runs', None)
+                        if base_runs and ref_runs:
+                            subtest['base_replicates'] = base_runs
+                            subtest['ref_replicates'] = ref_runs
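+                            # the subtest then carries 'replicates' (the computed differences)
+                            # alongside the raw 'base_replicates' and 'ref_replicates'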
                         subtests.append(subtest)
                         if test.test_config.get('lower_is_better') is not None:
                             subtest['lowerIsBetter'] = \
                                 test.test_config['lower_is_better']
                         if test.test_config.get('alert_threshold') is not None:
                             subtest['alertThreshold'] = \
                                 test.test_config['alert_threshold']
                         if test.test_config.get('unit'):
--- a/testing/talos/talos/run_tests.py
+++ b/testing/talos/talos/run_tests.py
@@ -88,17 +88,16 @@ def setup_webserver(webserver):
 def run_tests(config, browser_config):
     """Runs the talos tests on the given configuration and generates a report.
     """
     # get the test data
     tests = config['tests']
     tests = useBaseTestDefaults(config.get('basetest', {}), tests)
     paths = ['profile_path', 'tpmanifest', 'extensions', 'setup', 'cleanup']
     for test in tests:
-
         # Check for profile_path, tpmanifest and interpolate based on Talos
         # root https://bugzilla.mozilla.org/show_bug.cgi?id=727711
         # Build command line from config
         for path in paths:
             if test.get(path):
                 test[path] = utils.interpolate(test[path])
         if test.get('tpmanifest'):
             test['tpmanifest'] = \
@@ -250,20 +249,28 @@ def run_tests(config, browser_config):
 
                 # parse out the multi-value results, and 'fake it' to appear like separate tests
                 separate_results_list = convert_to_separate_test_results(multi_value_result,
                                                                          test_event_map)
 
                 # now we have three separate test results, store them
                 for test_result in separate_results_list:
                     talos_results.add(test_result)
+
+            # some tests (like bloom_basic) run two pages and then compare their values;
+            # we want the result reported to perfherder to be only the difference between them,
+            # while storing the base and reference test replicates in results.json for upload
+            elif test.get('base_vs_ref', False):
+                # run the test; results come back for each page, like two tests in the suite
+                base_and_reference_results = mytest.runTest(browser_config, test)
+                # now compare the two pages' results and create a new result for the comparison
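+                # note: make_comparison_result() keeps the raw replicates as 'base_runs' and
+                # 'ref_runs' so that output.py can attach them to the subtest for results.json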
+                talos_results.add(make_comparison_result(base_and_reference_results))
             else:
                 # just expecting regular test - one result value per iteration
                 talos_results.add(mytest.runTest(browser_config, test))
-
             LOG.test_end(testname, status='OK')
 
     except TalosRegression as exc:
         LOG.error("Detected a regression for %s" % testname)
         # by returning 1, we report an orange to buildbot
         # http://docs.buildbot.net/latest/developer/results.html
         LOG.test_end(testname, status='FAIL', message=str(exc),
                      stack=traceback.format_exc())
@@ -293,16 +300,66 @@ def run_tests(config, browser_config):
             print("Thanks for running Talos locally. Results are in %s"
                   % (results_urls['output_urls']))
 
     # we will stop running tests on a failed test, or we will return 0 for
     # green
     return 0
 
 
+def make_comparison_result(base_and_reference_results):
+    ''' Receive a test result object meant to be used as a base vs reference test. The result
+    object will have one test with two subtests; instead of traditional subtests we want to
+    treat them as separate tests, compare them against each other, and report the comparison.
+
+    Results with multiple pages used as subtests would look like this normally, with the overall
+    result value being the mean of the pages/subtests:
+
+    PERFHERDER_DATA: {"framework": {"name": "talos"}, "suites": [{"extraOptions": ["e10s"],
+    "name": "bloom_basic", "lowerIsBetter": true, "alertThreshold": 5.0, "value": 594.81,
+    "subtests": [{"name": ".html", "lowerIsBetter": true, "alertThreshold": 5.0, "replicates":
+    [586.52, ...], "value": 586.52, "unit": "ms"}, {"name": "-ref.html", "lowerIsBetter": true,
+    "alertThreshold": 5.0, "replicates": [603.225, ...], "value": 603.225, "unit": "ms"}]}]}
+
+    We want to compare the subtests against each other (base vs ref) and create a new single-test
+    results object with the comparison results, which will look like traditional single-test
+    results like this:
+
+    PERFHERDER_DATA: {"framework": {"name": "talos"}, "suites": [{"lowerIsBetter": true,
+    "subtests": [{"name": "", "lowerIsBetter": true, "alertThreshold": 5.0, "replicates":
+    [16.705, ...], "value": 16.705, "unit": "ms"}], "extraOptions": ["e10s"], "name":
+    "bloom_basic", "alertThreshold": 5.0}]}
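+
+    (in this illustration the comparison value 16.705 is simply abs(603.225 - 586.52), i.e. the
+    difference of the base and reference subtest values above; in practice the difference is
+    computed per replicate, as done below)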
+    '''
+    # separate the 'base' and 'reference' result run values
+    base_result_runs = base_and_reference_results.results[0].results[0]['runs']
+    ref_result_runs = base_and_reference_results.results[0].results[1]['runs']
+
+    # create a new results object for the comparison result; keep replicates from both pages
+    comparison_result = copy.deepcopy(base_and_reference_results)
+
+    # remove original results from our copy as they will be replaced by one comparison result
+    comparison_result.results[0].results = []
+
+    # populate our new comparison result with 'base' and 'ref' replicates
+    comparison_result.results[0].results.append({'index': 0,
+                                                 'runs': [],
+                                                 'page': '',
+                                                 'base_runs': base_result_runs,
+                                                 'ref_runs': ref_result_runs})
+
+    # now compare 'base' vs 'ref' per replicate and store the differences in 'runs'
+    new_result = comparison_result.results[0].results[0]
+    for base_val, ref_val in zip(new_result['base_runs'], new_result['ref_runs']):
+        new_result['runs'].append(round(abs(ref_val - base_val), 3))
+
+    return comparison_result
+
+
 def convert_to_separate_test_results(multi_value_result, test_event_map):
     ''' Receive a test result that actually contains multiple values in a single iteration, and
     parse it out in order to 'fake' three separate test results.
 
     Incoming result looks like this:
 
     [{'index': 0, 'runs': {'event_1': [1338, ...], 'event_2': [1438, ...], 'event_3':
     [1538, ...]}, 'page': 'NULL'}]
--- a/testing/talos/talos/test.py
+++ b/testing/talos/talos/test.py
@@ -102,16 +102,17 @@ class TsBase(Test):
         'xperf_counters',
         'xperf_providers',
         'xperf_user_providers',
         'xperf_stackwalk',
         'tpmozafterpaint',
         'firstpaint',
         'userready',
         'testeventmap',
+        'base_vs_ref',
         'extensions',
         'filters',
         'setup',
         'cleanup',
         'webextensions',
         'reinstall',     # A list of files from the profile directory that
                          # should be copied to the temporary profile prior to
                          # running each cycle, to avoid one cycle overwriting
@@ -234,17 +235,17 @@ class tresize(TsBase):
 class PageloaderTest(Test):
     """abstract base class for a Talos Pageloader test"""
     tpmanifest = None  # test manifest
     tpcycles = 1  # number of time to run each page
     cycles = None
     timeout = None
     keys = ['tpmanifest', 'tpcycles', 'tppagecycles', 'tprender', 'tpchrome',
             'tpmozafterpaint', 'tploadnocache', 'firstpaint', 'userready',
-            'testeventmap', 'rss', 'mainthread', 'resolution', 'cycles',
+            'testeventmap', 'base_vs_ref', 'rss', 'mainthread', 'resolution', 'cycles',
             'gecko_profile', 'gecko_profile_interval', 'gecko_profile_entries',
             'tptimeout', 'win_counters', 'w7_counters', 'linux_counters', 'mac_counters',
             'tpscrolltest', 'xperf_counters', 'timeout', 'shutdown', 'responsiveness',
             'profile_path', 'xperf_providers', 'xperf_user_providers', 'xperf_stackwalk',
             'filters', 'preferences', 'extensions', 'setup', 'cleanup',
             'lower_is_better', 'alert_threshold', 'unit', 'webextensions']
 
 
@@ -787,46 +788,31 @@ class a11yr(PageloaderTest):
     preferences = {'dom.send_after_paint_to_content': False}
     unit = 'ms'
     alert_threshold = 5.0
 
 
 @register_test()
 class bloom_basic(PageloaderTest):
     """
-    Stylo bloom_basic test
+    Stylo bloom_basic: runs bloom_basic and bloom_basic_ref and reports the difference
     """
+    base_vs_ref = True  # compare the two pages against each other and report the comparison
     tpmanifest = '${talos}/tests/perf-reftest/bloom_basic.manifest'
     tpcycles = 1
     tppagecycles = 25
     gecko_profile_interval = 1
     gecko_profile_entries = 2000000
     filters = filter.ignore_first.prepare(5) + filter.median.prepare()
     unit = 'ms'
     lower_is_better = True
     alert_threshold = 5.0
 
 
 @register_test()
-class bloom_basic_ref(PageloaderTest):
-    """
-    Stylo bloom_basic_ref test
-    """
-    tpmanifest = '${talos}/tests/perf-reftest/bloom_basic_ref.manifest'
-    tpcycles = 1
-    tppagecycles = 25
-    gecko_profile_interval = 1
-    gecko_profile_entries = 2000000
-    filters = filter.ignore_first.prepare(5) + filter.median.prepare()
-    unit = 'ms'
-    lower_is_better = True
-    alert_threshold = 5.0
-
-
-@register_test()
 class quantum_pageload_google(QuantumPageloadTest):
     """
     Quantum Pageload Test - Google
     """
     tpmanifest = '${talos}/tests/quantum_pageload/quantum_pageload_google.manifest'
 
 
 @register_test()
--- a/testing/talos/talos/tests/perf-reftest/bloom_basic.manifest
+++ b/testing/talos/talos/tests/perf-reftest/bloom_basic.manifest
@@ -1,1 +1,4 @@
+# base_vs_ref is set in test.py for this test, so each of these pages is run as a separate
+# test, then compared against the other; the reported results are the comparison
 % http://localhost/tests/perf-reftest/bloom-basic.html
+% http://localhost/tests/perf-reftest/bloom-basic-ref.html
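+# note: page order matters here; make_comparison_result() in run_tests.py takes the first page's
+# results as the 'base' and the second page's results as the 'reference'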