Add signing scriptworker monitoring for Bug 1314840 r=aki draft
author Simon Fraser <sfraser@mozilla.com>
Wed, 30 Nov 2016 14:36:11 +0000
changeset 4495 b87dffc7b3a4f635a3036b7e0f0984c375239aed
parent 4494 eda376cfe99c472f04c1f4ff1733b7483a175558
child 4496 efd8220880c1df85daa6e13d012a14f60a4f27d4
push id 2527
push user sfraser@mozilla.com
push date Wed, 30 Nov 2016 14:36:37 +0000
reviewers aki
bugs 1314840
Add signing scriptworker monitoring for Bug 1314840 r=aki The monitoring scripts are generic, and so have been added to the scriptworker manifests, and the specific configuration to run them has been added to the signing_scriptworker manifests. MozReview-Commit-ID: 7t3TtXQqXJy
modules/scriptworker/files/nagios_file_age_check.py
modules/scriptworker/files/nagios_pending_tasks.py
modules/scriptworker/manifests/instance.pp
modules/signing_scriptworker/files/file_age_check_optionals.txt
modules/signing_scriptworker/files/file_age_check_required.txt
modules/signing_scriptworker/manifests/init.pp
modules/signing_scriptworker/templates/nagios.cfg.erb
new file mode 100644
--- /dev/null
+++ b/modules/scriptworker/files/nagios_file_age_check.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+
+"""
+Checks for the existence and age of specific files.
+
+Allow file age to be defined as warning and critical arguments.
+
+You can either:
+- Specify one file to check
+- Give the path to a file, that contains a newline-separated list
+  of the files to check.
+
+This script will produce a multi-line nagios report in either case.
+
+Attributes:
+    STATUS_CODE (dict): a mapping of status strings to the exit codes nagios
+        requires
+    DEFAULT_WARNING (int): The default warning threshold, if none is given
+    DEFAULT_CRITICAL (int): The default critical alarm threshold, if none is
+        given
+
+
+"""
+import os
+import sys
+import time
+import argparse
+
+# Nagios plugin exit codes
+STATUS_CODE = {
+    'OK': 0,
+    'WARNING': 1,
+    'CRITICAL': 2,
+    'UNKNOWN': 3,
+}
+
+# Default thresholds in minutes; get_args() converts them to seconds.
+DEFAULT_WARNING = 45
+DEFAULT_CRITICAL = 60
+
+
+def file_age_check(filename, warning, critical, optional):
+    """Check that a file exists and has been modified recently enough.
+
+    Args:
+      filename (str): full path of the file to check.
+      warning (int): age in seconds at or above which to issue a warning.
+      critical (int): age in seconds at or above which to issue critical.
+      optional (bool): if true, a missing file is reported as OK instead
+        of CRITICAL.
+
+    Returns:
+      tuple:
+        (nagios status code from STATUS_CODE, message string)
+    """
+    if not os.path.isfile(filename):
+        if optional:
+            return STATUS_CODE['OK'], "{0} doesn't exist and that's ok".format(filename)
+        return STATUS_CODE['CRITICAL'], "{0} does not exist".format(filename)
+
+    try:
+        st = os.stat(filename)
+    except OSError as excp:
+        # The file can vanish or become unreadable between the isfile()
+        # check above and this stat() call.
+        return STATUS_CODE['UNKNOWN'], "{0}: {1}".format(filename, excp)
+    age = time.time() - st.st_mtime
+
+    # Test the critical threshold first so it wins when both are exceeded.
+    if age >= critical:
+        msg = "{0} is too old {1}/{2} seconds".format(
+            filename, int(age), critical)
+        return STATUS_CODE['CRITICAL'], msg
+    if age >= warning:
+        msg = "{0} is getting too old {1}/{2} seconds".format(
+            filename, int(age), warning)
+        # Must be WARNING, not CRITICAL, or nagios can never distinguish
+        # the two thresholds.
+        return STATUS_CODE['WARNING'], msg
+
+    msg = "{0} is ok, {1}/{2} seconds old".format(
+        filename, int(age), critical)
+    return STATUS_CODE['OK'], msg
+
+
+def get_args():
+    """Parse command-line arguments.
+
+    The warning and critical thresholds are given in minutes on the
+    command line and are converted to seconds before being returned.
+
+    Returns:
+        argparse.Namespace: ``warning`` and ``critical`` (int, seconds),
+        ``optional`` (bool), ``path`` (str or None) and ``from_file`` (an
+        open file object; ``sys.stdin`` when ``--from-file`` is not given).
+    """
+    argp = argparse.ArgumentParser(description=__doc__)
+    argp.add_argument('-w', '--warning', type=int, default=DEFAULT_WARNING,
+                      help='warn if older than this many minutes')
+    argp.add_argument('-c', '--critical', type=int, default=DEFAULT_CRITICAL,
+                      help='critical if older than this many minutes')
+
+    argp.add_argument('-o', '--optional', action='store_true',
+                      help="If set, don't error if the file is missing")
+
+    # Exactly one of --path / --from-file must be supplied.
+    arggroup = argp.add_mutually_exclusive_group(required=True)
+
+    arggroup.add_argument('-p', '--path', type=str,
+                          help="The full path name to check")
+    # Because the group is required, stdin is never selected implicitly;
+    # the default only keeps args.from_file a file object when --path is
+    # used, so the help text must not advertise stdin as a default.
+    arggroup.add_argument('-f', '--from-file',
+                          type=argparse.FileType('r'),
+                          default=sys.stdin,
+                          help="File of paths one per line, or - for stdin")
+
+    args = argp.parse_args()
+
+    # convert to seconds for epoch time comparison
+    args.warning *= 60
+    args.critical *= 60
+
+    return args
+
+
+def run_file_age_checks():
+    """Organise the file age checks for nagios.
+
+    Checks either the single --path or every non-blank line read from
+    --from-file, then reports the worst (numerically highest) status.
+
+    Output:
+    Prints a summary line followed by one line per checked file to stdout
+    Exits with appropriate return code
+    """
+    args = get_args()
+
+    if args.path:
+        check_files = [args.path]
+    else:
+        # Skip blank lines so stray whitespace in the list file is not
+        # checked as a file named ''.
+        stripped = (line.strip() for line in args.from_file)
+        check_files = [name for name in stripped if name]
+
+    if not check_files:
+        # max() below would raise on an empty list and exit with a
+        # traceback; report the situation to nagios explicitly instead.
+        print("FILE_AGE UNKNOWN - No files to check")
+        sys.exit(STATUS_CODE['UNKNOWN'])
+
+    statuses = list()
+    messages = list()
+
+    for filename in check_files:
+        status, message = file_age_check(
+            filename, args.warning, args.critical, args.optional)
+        statuses.append(status)
+        messages.append(message)
+
+    # The overall result is the highest status code seen.
+    exit_code = max(statuses)
+
+    reverse_status_codes = {v: k for k, v in STATUS_CODE.items()}
+    service_output = "FILE_AGE {0}".format(reverse_status_codes[exit_code])
+
+    service_output_options = {
+        STATUS_CODE['OK']: "All files ok",
+        STATUS_CODE['WARNING']: "Some files may be too old, see long output",
+        STATUS_CODE['CRITICAL']: "Some files errored, see long output",
+        STATUS_CODE['UNKNOWN']: "Unknown error",
+    }
+
+    service_output += " - {0}".format(service_output_options[exit_code])
+
+    # First line is the summary; the per-file messages follow as long output.
+    print("{0}\n{1}\n".format(service_output, "\n".join(sorted(messages))))
+    sys.exit(exit_code)
+
+
+if __name__ == '__main__':
+    run_file_age_checks()
new file mode 100644
--- /dev/null
+++ b/modules/scriptworker/files/nagios_pending_tasks.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+
+"""
+Nagios report script to check the number of pending scriptworker tasks
+
+Will use the current worker type and provisionerId from the scriptworker
+configuration, and query the API to discover the number of pending tasks.
+
+
+Attributes:
+    STATUS_CODE (dict): a mapping of status strings to the exit codes nagios
+        requires
+    DEFAULT_WARNING (int): The default warning threshold, if none is given
+    DEFAULT_CRITICAL (int): The default critical alarm threshold, if none is
+        given
+
+"""
+import asyncio
+import aiohttp
+import sys
+import argparse
+
+
+from scriptworker.config import get_context_from_cmdln
+from scriptworker.utils import cleanup
+
+
+# Nagios plugin exit codes
+STATUS_CODE = {
+    'OK': 0,
+    'WARNING': 1,
+    'CRITICAL': 2,
+    'UNKNOWN': 3,
+}
+
+# Default thresholds, as a number of pending tasks.
+DEFAULT_WARNING = 5
+DEFAULT_CRITICAL = 10
+
+
+def nagios_message(status, message):
+    """Emit a nagios status line on stdout, then exit with its code.
+
+    Args:
+        status (str): a key of STATUS_CODE, e.g. 'OK' or 'CRITICAL'.
+        message (str): free text appended after the status.
+    """
+    report = "PENDING_TASKS {0} - {1}".format(status, message)
+    print(report)
+    raise SystemExit(STATUS_CODE[status])
+
+
+def get_args():
+    """Process command-line arguments.
+
+    Arguments:
+        None
+
+    Returns:
+        a parsed arguments object with integer ``warning`` and ``critical``
+        attributes, both expressed as a number of pending tasks
+    """
+    argp = argparse.ArgumentParser(description=__doc__)
+    argp.add_argument('-w', '--warning', type=int, default=DEFAULT_WARNING,
+                      help='warning threshold for number of pending tasks')
+    argp.add_argument('-c', '--critical', type=int, default=DEFAULT_CRITICAL,
+                      help='critical threshold for number of pending tasks')
+
+    return argp.parse_args()
+
+
+def query_pending_task_count():
+    """Report the number of pending tasks for this worker to nagios.
+
+    Queries the queue for the pending task count of the provisioner_id and
+    worker_type found in the scriptworker config, then prints a nagios
+    status line and exits: CRITICAL at or above --critical, WARNING at or
+    above --warning, OK otherwise, and UNKNOWN if the query raises.
+    """
+    args = get_args()
+
+    # NOTE(review): sys.argv[1:] is parsed by get_args() above and is also
+    # handed to get_context_from_cmdln(); confirm the scriptworker parser
+    # tolerates -w/-c, otherwise passing thresholds breaks context setup.
+    context, credentials = get_context_from_cmdln(sys.argv[1:])
+    cleanup(context)
+
+    conn = aiohttp.TCPConnector(
+        limit=context.config['aiohttp_max_connections'])
+    loop = asyncio.get_event_loop()
+    with aiohttp.ClientSession(connector=conn) as session:
+        context.session = session
+        context.credentials = credentials
+
+        try:
+            result = loop.run_until_complete(
+                context.queue.pendingTasks(
+                    context.config['provisioner_id'],
+                    context.config['worker_type']
+                )
+            )
+
+        except Exception as excp:
+            # nagios_message() exits, so nothing below runs on failure.
+            nagios_message(
+                'UNKNOWN', 'Unable to query pending tasks: {0}'.format(excp))
+
+        pending = result['pendingTasks']
+
+        # Pick the status and the threshold shown next to the count.
+        if pending >= args.critical:
+            status, threshold = 'CRITICAL', args.critical
+        elif pending >= args.warning:
+            status, threshold = 'WARNING', args.warning
+        else:
+            status, threshold = 'OK', args.critical
+
+        # Use one message format for every status so the provisioner and
+        # worker type are always identified, not only when CRITICAL.
+        template = '{pending}/{max} pending tasks for {provisioner}:{worker}'
+        nagios_message(
+            status, template.format(
+                pending=pending,
+                max=threshold,
+                provisioner=result['provisionerId'],
+                worker=result['workerType']
+            )
+        )
+
+
+if __name__ == '__main__':
+    query_pending_task_count()
--- a/modules/scriptworker/manifests/instance.pp
+++ b/modules/scriptworker/manifests/instance.pp
@@ -75,16 +75,30 @@ define scriptworker::instance(
             owner       => "${username}",
             group       => "${group}";
         "/home/${username}/privkey":
             mode        => 600,
             content     => $config::scriptworker_gpg_private_keys[$fqdn],
             owner       => "${username}",
             group       => "${group}",
             show_diff   => false;
+        "${nrpe::base::plugins_dir}/nagios_file_age_check.py":
+            require     => Python35::Virtualenv["${basedir}"],
+            mode        => 750,
+            owner       => "${username}",
+            group       => "${group}",
+            source      => "puppet:///modules/scriptworker/nagios_file_age_check.py",
+            show_diff => false;
+        "${nrpe::base::plugins_dir}/nagios_pending_tasks.py":
+            require     => Python35::Virtualenv["${basedir}"],
+            mode        => 750,
+            owner       => "${username}",
+            group       => "${group}",
+            source      => "puppet:///modules/scriptworker/nagios_pending_tasks.py",
+            show_diff => false;
     }
 
     exec {
         # create gpg homedirs on change
         'create_gpg_homedirs':
             require => [Python35::Virtualenv["${basedir}"],
                         Git::Repo["scriptworker-${git_key_repo_dir}"],
                         File["${basedir}/scriptworker.yaml"]],
new file mode 100644
--- /dev/null
+++ b/modules/signing_scriptworker/files/file_age_check_optionals.txt
@@ -0,0 +1,1 @@
+/builds/scriptworker/.gpg_homedirs.lock
\ No newline at end of file
new file mode 100644
--- /dev/null
+++ b/modules/signing_scriptworker/files/file_age_check_required.txt
@@ -0,0 +1,3 @@
+/builds/scriptworker/logs/create_initial_gpg_homedirs.log
+/builds/scriptworker/logs/rebuild_gpg_homedirs.log
+/builds/scriptworker/logs/worker.log
\ No newline at end of file
--- a/modules/signing_scriptworker/manifests/init.pp
+++ b/modules/signing_scriptworker/manifests/init.pp
@@ -63,25 +63,42 @@ class signing_scriptworker {
             worker_group             => "${signing_scriptworker::settings::worker_group}",
             worker_type              => "${signing_scriptworker::settings::worker_type}",
             cot_job_type             => "signing",
             verbose_logging          => $verbose_logging,
             taskcluster_client_id    => secret("signing_scriptworker_taskcluster_client_id"),
             taskcluster_access_token => secret("signing_scriptworker_taskcluster_access_token");
     }
 
+    nrpe::custom {
+        "signingworker.cfg":
+            content => template("${module_name}/nagios.cfg.erb");
+    }
+
     file {
         "${signing_scriptworker::settings::root}/script_config.json":
             require     => Python35::Virtualenv["${signing_scriptworker::settings::root}"],
             mode        => 600,
             owner       => "${users::signer::username}",
             group       => "${users::signer::group}",
             content     => template("${module_name}/script_config.json.erb"),
             show_diff   => true;
         "${signing_scriptworker::settings::root}/passwords.json":
             require     => Python35::Virtualenv["${signing_scriptworker::settings::root}"],
             mode        => 600,
             owner       => "${users::signer::username}",
             group       => "${users::signer::group}",
             content     => template("${module_name}/passwords.json.erb"),
             show_diff => false;
+        "${signing_scriptworker::settings::root}/file_age_check_optionals.txt":
+            mode        => 640,
+            owner       => "${users::signer::username}",
+            group       => "${users::signer::group}",
+            source      => "puppet:///modules/signing_scriptworker/file_age_check_optionals.txt",
+            show_diff   => true;
+        "${signing_scriptworker::settings::root}/file_age_check_required.txt":
+            mode        => 640,
+            owner       => "${users::signer::username}",
+            group       => "${users::signer::group}",
+            source      => "puppet:///modules/signing_scriptworker/file_age_check_required.txt",
+            show_diff   => true;
     }
 }
new file mode 100644
--- /dev/null
+++ b/modules/signing_scriptworker/templates/nagios.cfg.erb
@@ -0,0 +1,3 @@
+command[check_signing_scriptworker_file_age_optional]=<%= scope.lookupvar("nrpe::base::plugins_dir") %>/nagios_file_age_check.py -w 45 -c 60 --optional --from-file <%= scope.lookupvar("signing_scriptworker::settings::root") %>/file_age_check_optionals.txt
+command[check_signing_scriptworker_file_age_required]=<%= scope.lookupvar("nrpe::base::plugins_dir") %>/nagios_file_age_check.py -w 45 -c 60 --from-file <%= scope.lookupvar("signing_scriptworker::settings::root") %>/file_age_check_required.txt
+command[check_signing_scriptworker_pending_tasks]=<%= scope.lookupvar("nrpe::base::plugins_dir") %>/nagios_pending_tasks.py -w 5 -c 10
\ No newline at end of file