Add signing scriptworker monitoring for
Bug 1314840 r=aki
The monitoring scripts are generic, so they have been added to the
scriptworker manifests; the specific configuration to run them
has been added to the signing_scriptworker manifests.
MozReview-Commit-ID: 7t3TtXQqXJy
new file mode 100644
--- /dev/null
+++ b/modules/scriptworker/files/nagios_file_age_check.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+
+"""
+Checks for the existence and age of specific files.
+
+Allow file age to be defined as warning and critical arguments.
+
+You can either:
+- Specify one file to check
+- Give the path to a file, that contains a newline-separated list
+ of the files to check.
+
+This script will produce a multi-line nagios report in either case.
+
+Attributes:
+ STATUS_CODE (dict): a mapping of status strings to the exit codes nagios
+ requires
+ DEFAULT_WARNING (int): The default warning threshold, if none is given
+ DEFAULT_CRITICAL (int): The default critical alarm threshold, if none is
+ given
+
+
+"""
+import os
+import sys
+import time
+import argparse
+
+# Nagios plugin exit codes, keyed by status name
+STATUS_CODE = {
+ 'OK': 0,
+ 'WARNING': 1,
+ 'CRITICAL': 2,
+ 'UNKNOWN': 3,
+}
+
+DEFAULT_WARNING = 45  # minutes; get_args() converts thresholds to seconds
+DEFAULT_CRITICAL = 60  # minutes; get_args() converts thresholds to seconds
+
+
+def file_age_check(filename, warning, critical, optional):
+ """file_age_check.
+
+ Checks the age and existence of a given filename
+
+ Args:
+ filename (str): containing a full path
+ warning (int): time in seconds over which to issue a warning.
+ critical (int): time in seconds over which to issue critical.
+
+ Returns:
+ tuple:
+ (nagios status code from STATUS_CODE, message string)
+ """
+ if not os.path.isfile(filename):
+ if optional:
+ return STATUS_CODE['OK'], "{0} doesn't exist and that's ok".format(filename)
+ else:
+ return STATUS_CODE['CRITICAL'], "{0} does not exist".format(filename)
+
+ try:
+ st = os.stat(filename)
+ except OSError as excp:
+ return STATUS_CODE['UNKNOWN'], "{0}: {1}".format(filename, excp)
+ current_time = time.time()
+ age = current_time - st.st_mtime
+
+ if age >= critical:
+ msg = "{0} is too old {1}/{2} seconds".format(
+ filename, int(age), critical)
+ return STATUS_CODE['CRITICAL'], msg
+ elif age >= warning:
+ msg = "{0} is getting too old {1}/{2} seconds".format(
+ filename, int(age), warning)
+ return STATUS_CODE['CRITICAL'], msg
+ else:
+ msg = "{0} is ok, {1}/{2} seconds old".format(
+ filename, int(age), critical)
+ return STATUS_CODE['OK'], msg
+
+
+def get_args():
+ """Parse command-line arguments."""
+ argp = argparse.ArgumentParser(description=__doc__)
+ argp.add_argument('-w', '--warning', type=int, default=DEFAULT_WARNING,
+ help='warn if older than this many minutes')
+ argp.add_argument('-c', '--critical', type=int, default=DEFAULT_CRITICAL,
+ help='critical if older than this many minutes')
+
+ argp.add_argument('-o', '--optional', action='store_true',
+ help="If set, don't error if the file is missing")
+
+ arggroup = argp.add_mutually_exclusive_group(required=True)
+
+ arggroup.add_argument('-p', '--path', type=str,
+ help="The full path name to check")
+ arggroup.add_argument('-f', '--from-file',
+ type=argparse.FileType('r'),
+ default=sys.stdin,
+ help="File of paths one per line, or - for stdin (default)")
+
+ args = argp.parse_args()
+
+ # convert to seconds for epoch time comparison
+ args.warning = args.warning * 60
+ args.critical = args.critical * 60
+
+ return args
+
+
+def run_file_age_checks():
+ """Organise the file age checks for nagios.
+
+ Output:
+ Prints to stdout
+ Exits with appropriate return code
+ """
+ args = get_args()
+
+ statuses = list()
+ messages = list()
+
+ if args.path:
+ check_files = [args.path]
+ else:
+ check_files = [f.strip() for f in args.from_file]
+
+ for filename in check_files:
+ status, message = file_age_check(
+ filename, args.warning, args.critical, args.optional)
+ statuses.append(status)
+ messages.append(message)
+
+ exit_code = max(statuses)
+
+ reverse_status_codes = {v: k for k, v in STATUS_CODE.items()}
+ service_output = "FILE_AGE {0}".format(reverse_status_codes[exit_code])
+
+ service_output_options = {
+ STATUS_CODE['OK']: "All files ok",
+ STATUS_CODE['WARNING']: "Some files may be too old, see long output",
+ STATUS_CODE['CRITICAL']: "Some files errored, see long output",
+ STATUS_CODE['UNKNOWN']: "Unknown error",
+ }
+
+ service_output += " - {0}".format(service_output_options[exit_code])
+
+ print("{0}\n{1}\n".format(service_output, "\n".join(sorted(messages))))
+ sys.exit(exit_code)
+
+
+if __name__ == '__main__':  # run the checks only when executed as a script
+ run_file_age_checks()
new file mode 100644
--- /dev/null
+++ b/modules/scriptworker/files/nagios_pending_tasks.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+
+"""
+Nagios report script to check the number of pending scriptworker tasks
+
+Will use the current worker type and provisionerId from the scriptworker
+configuration, and query the API to discover the number of pending tasks.
+
+
+Attributes:
+ STATUS_CODE (dict): a mapping of status strings to the exit codes nagios
+ requires
+ DEFAULT_WARNING (int): The default warning threshold, if none is given
+ DEFAULT_CRITICAL (int): The default critical alarm threshold, if none is
+ given
+
+"""
+import asyncio
+import aiohttp
+import sys
+import argparse
+
+
+from scriptworker.config import get_context_from_cmdln
+from scriptworker.utils import cleanup
+
+
+# Nagios plugin exit codes, keyed by status name
+STATUS_CODE = {
+ 'OK': 0,
+ 'WARNING': 1,
+ 'CRITICAL': 2,
+ 'UNKNOWN': 3,
+}
+
+DEFAULT_WARNING = 5  # number of pending tasks
+DEFAULT_CRITICAL = 10  # number of pending tasks
+
+
+def nagios_message(status, message):
+ """Place a nagios-style message to stdout and exit."""
+ print("PENDING_TASKS {0} - {1}".format(status, message))
+ sys.exit(STATUS_CODE[status])
+
+
+def get_args():
+ """Process command-line arguments.
+
+ Arguments:
+ None
+
+ Returns:
+ a parsed arguments object
+ """
+ argp = argparse.ArgumentParser(description=__doc__)
+ argp.add_argument('-w', '--warning', type=int, default=DEFAULT_WARNING,
+ help='warning threshhold for number of pending tasks')
+ argp.add_argument('-c', '--critical', type=int, default=DEFAULT_CRITICAL,
+ help='critical threshhold for number of pending tasks')
+
+ return argp.parse_args()
+
+
+def query_pending_task_count():
+ """query_pending_task_count.
+
+ Query the API for the number of pending tasks, so we can
+ report to nagios
+ """
+ args = get_args()
+
+ context, credentials = get_context_from_cmdln(sys.argv[1:])
+ cleanup(context)
+
+ conn = aiohttp.TCPConnector(
+ limit=context.config['aiohttp_max_connections'])
+ loop = asyncio.get_event_loop()
+ with aiohttp.ClientSession(connector=conn) as session:
+ context.session = session
+ context.credentials = credentials
+
+ try:
+ result = loop.run_until_complete(
+ context.queue.pendingTasks(
+ context.config['provisioner_id'],
+ context.config['worker_type']
+ )
+ )
+
+ except Exception as excp:
+ nagios_message(
+ 'UNKNOWN', 'Unable to query pending tasks: {0}'.format(excp))
+
+ template = '{pending}/{max} pending tasks for {provisioner}:{worker}'
+
+ if result['pendingTasks'] >= args.critical:
+ nagios_message(
+ 'CRITICAL', template.format(
+ pending=result['pendingTasks'],
+ max=args.critical,
+ provisioner=result['provisionerId'],
+ worker=result['workerType']
+ )
+ )
+ elif result['pendingTasks'] >= args.warning:
+ nagios_message(
+ 'WARNING', "{0}/{1} pending tasks".format(
+ result['pendingTasks'],
+ args.warning
+ )
+ )
+ else:
+ nagios_message(
+ 'OK', "{0}/{1} pending tasks".format(
+ result['pendingTasks'],
+ args.critical
+ )
+ )
+
+
+if __name__ == '__main__':  # query the queue only when executed as a script
+ query_pending_task_count()
--- a/modules/scriptworker/manifests/instance.pp
+++ b/modules/scriptworker/manifests/instance.pp
@@ -75,16 +75,30 @@ define scriptworker::instance(
owner => "${username}",
group => "${group}";
"/home/${username}/privkey":
mode => 600,
content => $config::scriptworker_gpg_private_keys[$fqdn],
owner => "${username}",
group => "${group}",
show_diff => false;
+ "${nrpe::base::plugins_dir}/nagios_file_age_check.py":
+ require => Python35::Virtualenv["${basedir}"],
+ mode => 750,
+ owner => "${username}",
+ group => "${group}",
+ source => "puppet:///modules/scriptworker/nagios_file_age_check.py",
+ show_diff => false;
+ "${nrpe::base::plugins_dir}/nagios_pending_tasks.py":
+ require => Python35::Virtualenv["${basedir}"],
+ mode => 750,
+ owner => "${username}",
+ group => "${group}",
+ source => "puppet:///modules/scriptworker/nagios_pending_tasks.py",
+ show_diff => false;
}
exec {
# create gpg homedirs on change
'create_gpg_homedirs':
require => [Python35::Virtualenv["${basedir}"],
Git::Repo["scriptworker-${git_key_repo_dir}"],
File["${basedir}/scriptworker.yaml"]],
new file mode 100644
--- /dev/null
+++ b/modules/signing_scriptworker/files/file_age_check_optionals.txt
@@ -0,0 +1,1 @@
+/builds/scriptworker/.gpg_homedirs.lock
\ No newline at end of file
new file mode 100644
--- /dev/null
+++ b/modules/signing_scriptworker/files/file_age_check_required.txt
@@ -0,0 +1,3 @@
+/builds/scriptworker/logs/create_initial_gpg_homedirs.log
+/builds/scriptworker/logs/rebuild_gpg_homedirs.log
+/builds/scriptworker/logs/worker.log
\ No newline at end of file
--- a/modules/signing_scriptworker/manifests/init.pp
+++ b/modules/signing_scriptworker/manifests/init.pp
@@ -63,25 +63,42 @@ class signing_scriptworker {
worker_group => "${signing_scriptworker::settings::worker_group}",
worker_type => "${signing_scriptworker::settings::worker_type}",
cot_job_type => "signing",
verbose_logging => $verbose_logging,
taskcluster_client_id => secret("signing_scriptworker_taskcluster_client_id"),
taskcluster_access_token => secret("signing_scriptworker_taskcluster_access_token");
}
+ nrpe::custom {
+ "signingworker.cfg":
+ content => template("${module_name}/nagios.cfg.erb");
+ }
+
file {
"${signing_scriptworker::settings::root}/script_config.json":
require => Python35::Virtualenv["${signing_scriptworker::settings::root}"],
mode => 600,
owner => "${users::signer::username}",
group => "${users::signer::group}",
content => template("${module_name}/script_config.json.erb"),
show_diff => true;
"${signing_scriptworker::settings::root}/passwords.json":
require => Python35::Virtualenv["${signing_scriptworker::settings::root}"],
mode => 600,
owner => "${users::signer::username}",
group => "${users::signer::group}",
content => template("${module_name}/passwords.json.erb"),
show_diff => false;
+ "${signing_scriptworker::settings::root}/file_age_check_optionals.txt":
+ mode => 640,
+ owner => "${users::signer::username}",
+ group => "${users::signer::group}",
+ source => "puppet:///modules/signing_scriptworker/file_age_check_optionals.txt",
+ show_diff => true;
+ "${signing_scriptworker::settings::root}/file_age_check_required.txt":
+ mode => 640,
+ owner => "${users::signer::username}",
+ group => "${users::signer::group}",
+ source => "puppet:///modules/signing_scriptworker/file_age_check_required.txt",
+ show_diff => true;
}
}
new file mode 100644
--- /dev/null
+++ b/modules/signing_scriptworker/templates/nagios.cfg.erb
@@ -0,0 +1,3 @@
+command[check_signing_scriptworker_file_age_optional]=<%= scope.lookupvar("nrpe::base::plugins_dir") %>/nagios_file_age_check.py -w 45 -c 60 --optional --from-file <%= scope.lookupvar("signing_scriptworker::settings::root") %>/file_age_check_optionals.txt
+command[check_signing_scriptworker_file_age_required]=<%= scope.lookupvar("nrpe::base::plugins_dir") %>/nagios_file_age_check.py -w 45 -c 60 --from-file <%= scope.lookupvar("signing_scriptworker::settings::root") %>/file_age_check_required.txt
+command[check_signing_scriptworker_pending_tasks]=<%= scope.lookupvar("nrpe::base::plugins_dir") %>/nagios_pending_tasks.py -w 5 -c 10
\ No newline at end of file