Bug 1272176 - Emit Perfherder data for system resource utilization; r?wlach draft
authorGregory Szorc <gps@mozilla.com>
Thu, 12 May 2016 13:55:35 -0700
changeset 374694 feae08c31fd7789eb7b3e64613ba2f001ccc6c69
parent 374693 9230de71ad2d76fa01c1f876d49bfff953c7ad11
child 522675 e62d89ecd42cc1a5448da939b5555f2f8584fe52
push id20067
push userbmo:gps@mozilla.com
push dateThu, 02 Jun 2016 19:55:21 +0000
reviewerswlach
bugs1272176
milestone49.0a1
Bug 1272176 - Emit Perfherder data for system resource utilization; r?wlach This commit teaches the resource monitor in mozharness to emit Perfherder data for system metrics and step times. This will allow us to see when the timing or resource characteristics of jobs in automation changes. The recorded data includes overall CPU percent usage and I/O. Each step has its time and CPU percent recorded. There is certainly more data we could record. However, the immediate goal of this change is to see if the data provides any benefit. I'd rather start small and expand reporting once value from this data is proved. The wonkiest part of this patch is likely the mechanism to define the Perfherder "test" names. We don't appear to have an identifier in mozharness suitable for distinguishing between job types. e.g. the "desktop_unittest.py" script is responsible for running a few dozen jobs. So we invent code for creating an identifier from the script config options. I /think/ Treeherder will automatically assign the project/branch, platform, and build type, which is why these aren't included in the identifier. MozReview-Commit-ID: HjhtXfxOvzJ
testing/mozharness/mozharness/base/python.py
testing/mozharness/scripts/desktop_unittest.py
--- a/testing/mozharness/mozharness/base/python.py
+++ b/testing/mozharness/mozharness/base/python.py
@@ -9,26 +9,32 @@
 
 import os
 import subprocess
 import sys
 import time
 import json
 import traceback
 
+import mozharness
 from mozharness.base.script import (
     PostScriptAction,
     PostScriptRun,
     PreScriptAction,
     PreScriptRun,
 )
 from mozharness.base.errors import VirtualenvErrorList
 from mozharness.base.log import WARNING, FATAL
 from mozharness.mozilla.proxxy import Proxxy
 
+external_tools_path = os.path.join(
+    os.path.abspath(os.path.dirname(os.path.dirname(mozharness.__file__))),
+    'external_tools',
+)
+
 def get_tlsv1_post():
     # Monkeypatch to work around SSL errors in non-bleeding-edge Python.
     # Taken from https://lukasa.co.uk/2013/01/Choosing_SSL_Version_In_Requests/
     import requests
     from requests.packages.urllib3.poolmanager import PoolManager
     import ssl
 
     class TLSV1Adapter(requests.adapters.HTTPAdapter):
@@ -453,18 +459,25 @@ class ResourceMonitoringMixin(object):
     """
     def __init__(self, *args, **kwargs):
         super(ResourceMonitoringMixin, self).__init__(*args, **kwargs)
 
         self.register_virtualenv_module('psutil>=3.1.1', method='pip',
                                         optional=True)
         self.register_virtualenv_module('mozsystemmonitor==0.3',
                                         method='pip', optional=True)
+        self.register_virtualenv_module('jsonschema==2.5.1',
+                                        method='pip')
         self._resource_monitor = None
 
+        # 2-tuple of (name, options) to assign Perfherder resource monitor
+        # metrics to. This needs to be assigned by a script in order for
+        # Perfherder metrics to be reported.
+        self.resource_monitor_perfherder_id = None
+
     @PostScriptAction('create-virtualenv')
     def _start_resource_monitoring(self, action, success=None):
         self.activate_virtualenv()
 
         # Resource Monitor requires Python 2.7, however it's currently optional.
         # Remove when all machines have had their Python version updated (bug 711299).
         if sys.version_info[:2] < (2, 7):
             self.warning('Resource monitoring will not be enabled! Python 2.7+ required.')
@@ -517,16 +530,19 @@ class ResourceMonitoringMixin(object):
                 self.exception('could not upload resource usage JSON',
                                level=WARNING)
 
         except Exception:
             self.warning("Exception when reporting resource usage: %s" %
                          traceback.format_exc())
 
     def _log_resource_usage(self):
+        # Delay import because not available until virtualenv is populated.
+        import jsonschema
+
         rm = self._resource_monitor
 
         if rm.start_time is None:
             return
 
         def resources(phase):
             cpu_percent = rm.aggregate_cpu_percent(phase=phase, per_cpu=False)
             cpu_times = rm.aggregate_cpu_times(phase=phase, per_cpu=False)
@@ -560,16 +576,82 @@ class ResourceMonitoringMixin(object):
 
             except ValueError:
                 self.warning("Exception when formatting: %s" %
                              traceback.format_exc())
 
         cpu_percent, cpu_times, io, (swap_in, swap_out) = resources(None)
         duration = rm.end_time - rm.start_time
 
+        # Write out Perfherder data if configured.
+        if self.resource_monitor_perfherder_id:
+            perfherder_name, perfherder_options = self.resource_monitor_perfherder_id
+
+            suites = []
+            overall = []
+
+            if cpu_percent:
+                overall.append({
+                    'name': 'cpu_percent',
+                    'value': cpu_percent,
+                })
+
+            overall.extend([
+                {'name': 'io_write_bytes', 'value': io.write_bytes},
+                {'name': 'io_read_bytes', 'value': io.read_bytes},
+                {'name': 'io_write_time', 'value': io.write_time},
+                {'name': 'io_read_time', 'value': io.read_time},
+            ])
+
+            suites.append({
+                'name': '%s.overall' % perfherder_name,
+                'extraOptions': perfherder_options,
+                'subtests': overall,
+
+            })
+
+            for phase in rm.phases.keys():
+                phase_duration = rm.phases[phase][1] - rm.phases[phase][0]
+                subtests = [
+                    {
+                        'name': 'time',
+                        'value': phase_duration,
+                    },
+                    {
+                        'name': 'cpu_percent',
+                        'value': rm.aggregate_cpu_percent(phase=phase,
+                                                          per_cpu=False),
+                    }
+                ]
+                # We don't report I/O during each step because measured I/O
+                # is system I/O and that I/O can be delayed (e.g. writes will
+                # buffer before being flushed and recorded in our metrics).
+                suites.append({
+                    'name': '%s.%s' % (perfherder_name, phase),
+                    'subtests': subtests,
+                })
+
+            data = {
+                'framework': {'name': 'job_resource_usage'},
+                'suites': suites,
+            }
+
+            try:
+                schema_path = os.path.join(external_tools_path,
+                                           'performance-artifact-schema.json')
+                with open(schema_path, 'rb') as fh:
+                    schema = json.load(fh)
+
+                self.info('Validating Perfherder data against %s' % schema_path)
+                jsonschema.validate(data, schema)
+            except Exception:
+                self.exception('error while validating Perfherder data; ignoring')
+            else:
+                self.info('PERFHERDER_DATA: %s' % json.dumps(data))
+
         log_usage('Total resource usage', duration, cpu_percent, cpu_times, io)
 
         # Print special messages so usage shows up in Treeherder.
         if cpu_percent:
             self._tinderbox_print('CPU usage<br/>{:,.1f}%'.format(
                                   cpu_percent))
 
         self._tinderbox_print('I/O read bytes / time<br/>{:,} / {:,}'.format(
--- a/testing/mozharness/scripts/desktop_unittest.py
+++ b/testing/mozharness/scripts/desktop_unittest.py
@@ -166,16 +166,45 @@ class DesktopUnittest(TestingMixin, Merc
         self.symbols_url = c.get('symbols_url')
         # this is so mozinstall in install() doesn't bug out if we don't run
         # the download_and_extract action
         self.installer_path = c.get('installer_path')
         self.binary_path = c.get('binary_path')
         self.abs_app_dir = None
         self.abs_res_dir = None
 
+        # Construct an identifier to be used to identify Perfherder data
+        # for resource monitoring recording. This attempts to uniquely
+        # identify this test invocation configuration.
+        perfherder_parts = []
+        perfherder_options = []
+        suites = (
+            ('specified_mochitest_suites', 'mochitest'),
+            ('specified_reftest_suites', 'reftest'),
+            ('specified_xpcshell_suites', 'xpcshell'),
+            ('specified_cppunittest_suites', 'cppunit'),
+            ('specified_gtest_suites', 'gtest'),
+            ('specified_jittest_suites', 'jittest'),
+            ('specified_mozbase_suites', 'mozbase'),
+            ('specified_mozmill_suites', 'mozmill'),
+        )
+        for s, prefix in suites:
+            if s in c:
+                perfherder_parts.append(prefix)
+                perfherder_parts.extend(c[s])
+
+        if 'this_chunk' in c:
+            perfherder_parts.append(c['this_chunk'])
+
+        if c.get('e10s'):
+            perfherder_options.append('e10s')
+
+        self.resource_monitor_perfherder_id = ('.'.join(perfherder_parts),
+                                               perfherder_options)
+
     # helper methods {{{2
     def _pre_config_lock(self, rw_config):
         super(DesktopUnittest, self)._pre_config_lock(rw_config)
         c = self.config
         if not c.get('run_all_suites'):
             return  # configs are valid
         for category in SUITE_CATEGORIES:
             specific_suites = c.get('specified_%s_suites' % (category))