Bug 1272176 - Emit Perfherder data for system resource utilization; r?wlach
This commit teaches the resource monitor in mozharness to emit
Perfherder data for system metrics and step times. This will
allow us to see when the timing or resource characteristics
of jobs in automation change.
The recorded data includes overall CPU percent usage and I/O.
Each step has its time and CPU percent recorded. There is
certainly more data we could record. However, the immediate
goal of this change is to see if the data provides any benefit.
I'd rather start small and expand reporting once value from
this data is proved.
The wonkiest part of this patch is likely the mechanism to
define the Perfherder "test" names. We don't appear to have
an identifier in mozharness suitable for distinguishing
between job types. e.g. the "desktop_unittest.py" script is
responsible for running a few dozen jobs. So we invent code
for creating an identifier from the script config options.
I /think/ Treeherder will automatically assign the
project/branch, platform, and build type, which is why these
aren't included in the identifier.
MozReview-Commit-ID: HjhtXfxOvzJ
--- a/testing/mozharness/mozharness/base/python.py
+++ b/testing/mozharness/mozharness/base/python.py
@@ -9,26 +9,32 @@
import os
import subprocess
import sys
import time
import json
import traceback
+import mozharness
from mozharness.base.script import (
PostScriptAction,
PostScriptRun,
PreScriptAction,
PreScriptRun,
)
from mozharness.base.errors import VirtualenvErrorList
from mozharness.base.log import WARNING, FATAL
from mozharness.mozilla.proxxy import Proxxy
+external_tools_path = os.path.join(
+ os.path.abspath(os.path.dirname(os.path.dirname(mozharness.__file__))),
+ 'external_tools',
+)
+
def get_tlsv1_post():
# Monkeypatch to work around SSL errors in non-bleeding-edge Python.
# Taken from https://lukasa.co.uk/2013/01/Choosing_SSL_Version_In_Requests/
import requests
from requests.packages.urllib3.poolmanager import PoolManager
import ssl
class TLSV1Adapter(requests.adapters.HTTPAdapter):
@@ -453,18 +459,25 @@ class ResourceMonitoringMixin(object):
"""
def __init__(self, *args, **kwargs):
super(ResourceMonitoringMixin, self).__init__(*args, **kwargs)
self.register_virtualenv_module('psutil>=3.1.1', method='pip',
optional=True)
self.register_virtualenv_module('mozsystemmonitor==0.3',
method='pip', optional=True)
+ self.register_virtualenv_module('jsonschema==2.5.1',
+ method='pip')
self._resource_monitor = None
+ # 2-tuple of (name, options) to assign Perfherder resource monitor
+ # metrics to. This needs to be assigned by a script in order for
+ # Perfherder metrics to be reported.
+ self.resource_monitor_perfherder_id = None
+
@PostScriptAction('create-virtualenv')
def _start_resource_monitoring(self, action, success=None):
self.activate_virtualenv()
# Resource Monitor requires Python 2.7, however it's currently optional.
# Remove when all machines have had their Python version updated (bug 711299).
if sys.version_info[:2] < (2, 7):
self.warning('Resource monitoring will not be enabled! Python 2.7+ required.')
@@ -517,16 +530,19 @@ class ResourceMonitoringMixin(object):
self.exception('could not upload resource usage JSON',
level=WARNING)
except Exception:
self.warning("Exception when reporting resource usage: %s" %
traceback.format_exc())
def _log_resource_usage(self):
+ # Delay import because not available until virtualenv is populated.
+ import jsonschema
+
rm = self._resource_monitor
if rm.start_time is None:
return
def resources(phase):
cpu_percent = rm.aggregate_cpu_percent(phase=phase, per_cpu=False)
cpu_times = rm.aggregate_cpu_times(phase=phase, per_cpu=False)
@@ -560,16 +576,82 @@ class ResourceMonitoringMixin(object):
except ValueError:
self.warning("Exception when formatting: %s" %
traceback.format_exc())
cpu_percent, cpu_times, io, (swap_in, swap_out) = resources(None)
duration = rm.end_time - rm.start_time
+ # Write out Perfherder data if configured.
+ if self.resource_monitor_perfherder_id:
+ perfherder_name, perfherder_options = self.resource_monitor_perfherder_id
+
+ suites = []
+ overall = []
+
+ if cpu_percent:
+ overall.append({
+ 'name': 'cpu_percent',
+ 'value': cpu_percent,
+ })
+
+ overall.extend([
+ {'name': 'io_write_bytes', 'value': io.write_bytes},
+ {'name': 'io_read_bytes', 'value': io.read_bytes},
+ {'name': 'io_write_time', 'value': io.write_time},
+ {'name': 'io_read_time', 'value': io.read_time},
+ ])
+
+ suites.append({
+ 'name': '%s.overall' % perfherder_name,
+ 'extraOptions': perfherder_options,
+ 'subtests': overall,
+
+ })
+
+ for phase in rm.phases.keys():
+ phase_duration = rm.phases[phase][1] - rm.phases[phase][0]
+ subtests = [
+ {
+ 'name': 'time',
+ 'value': phase_duration,
+ },
+ {
+ 'name': 'cpu_percent',
+ 'value': rm.aggregate_cpu_percent(phase=phase,
+ per_cpu=False),
+ }
+ ]
+ # We don't report I/O during each step because measured I/O
+ # is system I/O and that I/O can be delayed (e.g. writes will
+ # buffer before being flushed and recorded in our metrics).
+ suites.append({
+ 'name': '%s.%s' % (perfherder_name, phase),
+ 'subtests': subtests,
+ })
+
+ data = {
+ 'framework': {'name': 'job_resource_usage'},
+ 'suites': suites,
+ }
+
+ try:
+ schema_path = os.path.join(external_tools_path,
+ 'performance-artifact-schema.json')
+ with open(schema_path, 'rb') as fh:
+ schema = json.load(fh)
+
+ self.info('Validating Perfherder data against %s' % schema_path)
+ jsonschema.validate(data, schema)
+ except Exception:
+ self.exception('error while validating Perfherder data; ignoring')
+ else:
+ self.info('PERFHERDER_DATA: %s' % json.dumps(data))
+
log_usage('Total resource usage', duration, cpu_percent, cpu_times, io)
# Print special messages so usage shows up in Treeherder.
if cpu_percent:
self._tinderbox_print('CPU usage<br/>{:,.1f}%'.format(
cpu_percent))
self._tinderbox_print('I/O read bytes / time<br/>{:,} / {:,}'.format(
--- a/testing/mozharness/scripts/desktop_unittest.py
+++ b/testing/mozharness/scripts/desktop_unittest.py
@@ -166,16 +166,45 @@ class DesktopUnittest(TestingMixin, Merc
self.symbols_url = c.get('symbols_url')
# this is so mozinstall in install() doesn't bug out if we don't run
# the download_and_extract action
self.installer_path = c.get('installer_path')
self.binary_path = c.get('binary_path')
self.abs_app_dir = None
self.abs_res_dir = None
+ # Construct an identifier to be used to identify Perfherder data
+ # for resource monitoring recording. This attempts to uniquely
+ # identify this test invocation configuration.
+ perfherder_parts = []
+ perfherder_options = []
+ suites = (
+ ('specified_mochitest_suites', 'mochitest'),
+ ('specified_reftest_suites', 'reftest'),
+ ('specified_xpcshell_suites', 'xpcshell'),
+ ('specified_cppunittest_suites', 'cppunit'),
+ ('specified_gtest_suites', 'gtest'),
+ ('specified_jittest_suites', 'jittest'),
+ ('specified_mozbase_suites', 'mozbase'),
+ ('specified_mozmill_suites', 'mozmill'),
+ )
+ for s, prefix in suites:
+ if c.get(s):
+ perfherder_parts.append(prefix)
+ perfherder_parts.extend(c[s])
+
+ if 'this_chunk' in c:
+ perfherder_parts.append(str(c['this_chunk']))
+
+ if c.get('e10s'):
+ perfherder_options.append('e10s')
+
+ self.resource_monitor_perfherder_id = ('.'.join(perfherder_parts),
+ perfherder_options)
+
# helper methods {{{2
def _pre_config_lock(self, rw_config):
super(DesktopUnittest, self)._pre_config_lock(rw_config)
c = self.config
if not c.get('run_all_suites'):
return # configs are valid
for category in SUITE_CATEGORIES:
specific_suites = c.get('specified_%s_suites' % (category))