Bug 1375635 - Add telemetry for how often and long users are in bad authentication states. r?tcsc,Dexter draft
authorMark Hammond <mhammond@skippinet.com.au>
Thu, 27 Jul 2017 11:59:24 +1000
changeset 657060 23f6956da30cf7579248549f87dae6e5ae86b0ca
parent 656346 04b6be50a2526c7a26a63715f441c47e1aa1f9be
child 729333 3b24f27bc6cbee4114214d77eeda280f73785202
push id77424
push userbmo:markh@mozilla.com
push dateFri, 01 Sep 2017 01:32:03 +0000
reviewerstcsc, Dexter
bugs1375635
milestone57.0a1
Bug 1375635 - Add telemetry for how often and long users are in bad authentication states. r?tcsc,Dexter MozReview-Commit-ID: BW8ixM3wmxJ
services/sync/modules-testing/utils.js
services/sync/modules/browserid_identity.js
services/sync/tests/unit/test_browserid_identity.js
services/sync/tests/unit/test_browserid_identity_telemetry.js
services/sync/tests/unit/xpcshell.ini
toolkit/components/telemetry/Histograms.json
toolkit/components/telemetry/Scalars.yaml
tools/lint/eslint/modules.json
--- a/services/sync/modules-testing/utils.js
+++ b/services/sync/modules-testing/utils.js
@@ -12,16 +12,17 @@ this.EXPORTED_SYMBOLS = [
   "configureIdentity",
   "SyncTestingInfrastructure",
   "waitForZeroTimer",
   "promiseZeroTimer",
   "promiseNamedTimer",
   "MockFxaStorageManager",
   "AccountState", // from a module import
   "sumHistogram",
+  "getLoginTelemetryScalar",
 ];
 
 var {utils: Cu} = Components;
 
 Cu.import("resource://services-sync/status.js");
 Cu.import("resource://services-common/utils.js");
 Cu.import("resource://services-crypto/utils.js");
 Cu.import("resource://services-sync/util.js");
@@ -264,8 +265,14 @@ this.sumHistogram = function(name, optio
   let snapshot = histogram.snapshot(options.key);
   let sum = -Infinity;
   if (snapshot) {
     sum = snapshot.sum;
   }
   histogram.clear();
   return sum;
 }
+
+this.getLoginTelemetryScalar = function() {
+  let dataset = Services.telemetry.DATASET_RELEASE_CHANNEL_OPTOUT;
+  let snapshot = Services.telemetry.snapshotKeyedScalars(dataset, true);
+  return snapshot.parent ? snapshot.parent["services.sync.sync_login_state_transitions"] : {};
+}
--- a/services/sync/modules/browserid_identity.js
+++ b/services/sync/modules/browserid_identity.js
@@ -41,16 +41,104 @@ var fxAccountsCommon = {};
 Cu.import("resource://gre/modules/FxAccountsCommon.js", fxAccountsCommon);
 
 const OBSERVER_TOPICS = [
   fxAccountsCommon.ONLOGIN_NOTIFICATION,
   fxAccountsCommon.ONLOGOUT_NOTIFICATION,
   fxAccountsCommon.ON_ACCOUNT_STATE_CHANGE_NOTIFICATION,
 ];
 
+// A telemetry helper that records how long a user was in a "bad" state.
+// It is recorded in the *main* ping, *not* the Sync ping.
+// These bad states may persist across browser restarts, and may never change
+// (eg, users may *never* validate)
+this.telemetryHelper = {
+  // These are both the "status" values passed to maybeRecordLoginState and
+  // the key we use for our keyed scalar.
+  STATES: {
+    SUCCESS: "SUCCESS",
+    NOTVERIFIED: "NOTVERIFIED",
+    REJECTED: "REJECTED",
+  },
+
+  PREFS: {
+    REJECTED_AT: "identity.telemetry.loginRejectedAt",
+    APPEARS_PERMANENTLY_REJECTED: "identity.telemetry.loginAppearsPermanentlyRejected",
+    LAST_RECORDED_STATE: "identity.telemetry.lastRecordedState",
+  },
+
+  // How long, in minutes, that we continue to wait for a user to transition
+  // from a "bad" state to a success state. After this has expired, we record
+  // the "how long were they rejected for?" histogram.
+  NUM_MINUTES_TO_RECORD_REJECTED_TELEMETRY: 20160, // 14 days in minutes.
+
+  SCALAR: "services.sync.sync_login_state_transitions", // The scalar we use to report
+
+  maybeRecordLoginState(status) {
+    try {
+      this._maybeRecordLoginState(status);
+    } catch (ex) {
+      log.error("Failed to record login telemetry", ex);
+    }
+  },
+
+  _maybeRecordLoginState(status) {
+    let key = this.STATES[status];
+    if (!key) {
+      throw new Error(`invalid state ${status}`);
+    }
+
+    let when = Svc.Prefs.get(this.PREFS.REJECTED_AT);
+    let howLong = when ? this.nowInMinutes() - when : 0; // minutes.
+    let isNewState = Svc.Prefs.get(this.PREFS.LAST_RECORDED_STATE) != status;
+
+    if (status == this.STATES.SUCCESS) {
+      if (isNewState) {
+        Services.telemetry.keyedScalarSet(this.SCALAR, key, true);
+        Svc.Prefs.set(this.PREFS.LAST_RECORDED_STATE, status);
+      }
+      // If we previously recorded an error state, report how long they were
+      // in the bad state for (in minutes)
+      if (when) {
+        // If we are "permanently rejected" we've already recorded for how
+        // long, so don't do it again.
+        if (!Svc.Prefs.get(this.PREFS.APPEARS_PERMANENTLY_REJECTED)) {
+          Services.telemetry.getHistogramById("WEAVE_LOGIN_FAILED_FOR").add(howLong);
+        }
+      }
+      Svc.Prefs.reset(this.PREFS.REJECTED_AT);
+      Svc.Prefs.reset(this.PREFS.APPEARS_PERMANENTLY_REJECTED);
+    } else {
+      // We are in a failure state.
+      if (Svc.Prefs.get(this.PREFS.APPEARS_PERMANENTLY_REJECTED)) {
+        return; // we've given up, so don't record errors.
+      }
+      if (isNewState) {
+        Services.telemetry.keyedScalarSet(this.SCALAR, key, true);
+        Svc.Prefs.set(this.PREFS.LAST_RECORDED_STATE, status);
+      }
+      if (howLong > this.NUM_MINUTES_TO_RECORD_REJECTED_TELEMETRY) {
+        // We are giving up for this user, so report this "max time" as how
+        // long they were in this state for.
+        Services.telemetry.getHistogramById("WEAVE_LOGIN_FAILED_FOR").add(howLong);
+        Svc.Prefs.set(this.PREFS.APPEARS_PERMANENTLY_REJECTED, true);
+      }
+      if (!Svc.Prefs.has(this.PREFS.REJECTED_AT)) {
+        Svc.Prefs.set(this.PREFS.REJECTED_AT, this.nowInMinutes());
+      }
+    }
+  },
+
+  // hookable by tests.
+  nowInMinutes() {
+    return Math.floor(Date.now() / 1000 / 60);
+  },
+}
+
+
 function deriveKeyBundle(kB) {
   let out = CryptoUtils.hkdf(kB, undefined,
                              "identity.mozilla.com/picl/v1/oldsync", 2 * 32);
   let bundle = new BulkKeyBundle();
   // [encryptionKey, hmacKey]
   bundle.keyPair = [out.slice(0, 32), out.slice(32, 64)];
   return bundle;
 }
@@ -192,16 +280,19 @@ this.BrowserIDManager.prototype = {
       }
 
       this.username = accountData.email;
       this._updateSignedInUser(accountData);
       // The user must be verified before we can do anything at all; we kick
       // this and the rest of initialization off in the background (ie, we
       // don't return the promise)
       this._log.info("Waiting for user to be verified.");
+      if (!accountData.verified) {
+        telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+      }
       this._fxaService.whenVerified(accountData).then(accountData => {
         this._updateSignedInUser(accountData);
 
         this._log.info("Starting fetch for key bundle.");
         return this._fetchTokenForUser();
       }).then(token => {
         this._token = token;
         if (token) {
@@ -606,16 +697,17 @@ this.BrowserIDManager.prototype = {
         // TODO: Make it be only 80% of the duration, so refresh the token
         // before it actually expires. This is to avoid sync storage errors
         // otherwise, we get a nasty notification bar briefly. Bug 966568.
         token.expiration = this._now() + (token.duration * 1000) * 0.80;
         if (!this._syncKeyBundle) {
           // We are given kA/kB as hex.
           this._syncKeyBundle = deriveKeyBundle(CommonUtils.hexToBytes(userData.kB));
         }
+        telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
         return token;
       })
       .catch(err => {
         // TODO: unify these errors - we need to handle errors thrown by
         // both tokenserverclient and hawkclient.
         // A tokenserver error thrown based on a bad response.
         if (err.response && err.response.status === 401) {
           err = new AuthenticationError(err, "tokenserver");
@@ -629,16 +721,17 @@ this.BrowserIDManager.prototype = {
 
         // TODO: write tests to make sure that different auth error cases are handled here
         // properly: auth error getting assertion, auth error getting token (invalid generation
         // and client-state error)
         if (err instanceof AuthenticationError) {
           this._log.error("Authentication error in _fetchTokenForUser", err);
           // set it to the "fatal" LOGIN_FAILED_LOGIN_REJECTED reason.
           this._authFailureReason = LOGIN_FAILED_LOGIN_REJECTED;
+          telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.REJECTED);
         } else {
           this._log.error("Non-authentication error in _fetchTokenForUser", err);
           // for now assume it is just a transient network related problem
           // (although sadly, it might also be a regular unhandled exception)
           this._authFailureReason = LOGIN_FAILED_NETWORK_ERROR;
         }
         // this._authFailureReason being set to be non-null in the above if clause
         // ensures we are in the correct currentAuthState, and
--- a/services/sync/tests/unit/test_browserid_identity.js
+++ b/services/sync/tests/unit/test_browserid_identity.js
@@ -78,16 +78,17 @@ add_test(function test_initial_state() {
 );
 
 add_task(async function test_initialializeWithCurrentIdentity() {
     _("Verify start after initializeWithCurrentIdentity");
     globalBrowseridManager.initializeWithCurrentIdentity();
     await globalBrowseridManager.whenReadyToAuthenticate.promise;
     do_check_true(!!globalBrowseridManager._token);
     do_check_true(globalBrowseridManager.hasValidToken());
+    Assert.deepEqual(getLoginTelemetryScalar(), {SUCCESS: 1});
   }
 );
 
 add_task(async function test_initialializeWithAuthErrorAndDeletedAccount() {
     _("Verify sync unpair after initializeWithCurrentIdentity with auth error + account deleted");
 
     var identityConfig = makeIdentityConfig();
     var browseridManager = new BrowserIDManager();
@@ -127,16 +128,17 @@ add_task(async function test_initialiali
     await browseridManager.initializeWithCurrentIdentity();
     await Assert.rejects(browseridManager.whenReadyToAuthenticate.promise,
                      "should reject due to an auth error");
 
     do_check_true(signCertificateCalled);
     do_check_true(accountStatusCalled);
     do_check_false(browseridManager._token);
     do_check_false(browseridManager.hasValidToken());
+    Assert.deepEqual(getLoginTelemetryScalar(), {REJECTED: 1});
 });
 
 add_task(async function test_initialializeWithNoKeys() {
     _("Verify start after initializeWithCurrentIdentity without kA, kB or keyFetchToken");
     let identityConfig = makeIdentityConfig();
     delete identityConfig.fxaccount.user.kA;
     delete identityConfig.fxaccount.user.kB;
     // there's no keyFetchToken by default, so the initialize should fail.
new file mode 100644
--- /dev/null
+++ b/services/sync/tests/unit/test_browserid_identity_telemetry.js
@@ -0,0 +1,120 @@
+Cu.import("resource://gre/modules/Preferences.jsm");
+Cu.import("resource://testing-common/services/sync/utils.js");
+let {telemetryHelper} = Cu.import("resource://services-sync/browserid_identity.js", {});
+
+const prefs = new Preferences("services.sync.");
+
+function cleanup() {
+  prefs.resetBranch();
+}
+
+add_test(function test_success() {
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
+  Assert.deepEqual(getLoginTelemetryScalar(), {SUCCESS: 1});
+  // doing it again should not bump the count.
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
+  Assert.deepEqual(getLoginTelemetryScalar(), {});
+
+  // check the prefs.
+  Assert.equal(prefs.get(telemetryHelper.PREFS.REJECTED_AT), undefined);
+  Assert.equal(prefs.get(telemetryHelper.PREFS.APPEARS_PERMANENTLY_REJECTED), undefined);
+  Assert.equal(prefs.get(telemetryHelper.PREFS.LAST_RECORDED_STATE), telemetryHelper.STATES.SUCCESS);
+
+  cleanup();
+  run_next_test();
+});
+
+add_test(function test_success_to_error() {
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
+  Assert.deepEqual(getLoginTelemetryScalar(), {SUCCESS: 1});
+  // transition to an error state
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {NOTVERIFIED: 1});
+
+  cleanup();
+  run_next_test();
+});
+
+add_test(function test_unverified() {
+  telemetryHelper.nowInMinutes = () => 10000;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {NOTVERIFIED: 1});
+
+  Assert.equal(prefs.get(telemetryHelper.PREFS.REJECTED_AT), 10000);
+
+  // doing it again should not bump the count and should not change the
+  // original time it was rejected at.
+  telemetryHelper.nowInMinutes = () => 10010;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {});
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 0);
+
+  Assert.equal(prefs.get(telemetryHelper.PREFS.REJECTED_AT), 10000);
+
+  // record unverified again - should not record telemetry as it's the same state.
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {});
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 0);
+
+  // now record success - should get the new state recorded and the duration.
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
+  Assert.deepEqual(getLoginTelemetryScalar(), {SUCCESS: 1});
+
+  // we are now at 10010 minutes, so should have recorded it took 10.
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 10);
+
+  cleanup();
+  run_next_test();
+});
+
+add_test(function test_unverified_give_up() {
+  telemetryHelper.nowInMinutes = () => 10000;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {NOTVERIFIED: 1});
+
+  Assert.equal(prefs.get(telemetryHelper.PREFS.REJECTED_AT), 10000);
+
+  // bump our timestamp to 50k minutes, which is past our "give up" period.
+  telemetryHelper.nowInMinutes = () => 50000;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+
+  // We've already recorded the unverified state, so shouldn't again.
+  Assert.deepEqual(getLoginTelemetryScalar(), {});
+  // But we *should* have recorded how long it failed for even though it still
+  // fails.
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 40000);
+
+  // Record failure again - we should *not* have any telemetry recorded.
+  telemetryHelper.nowInMinutes = () => 50001;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {});
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 0);
+
+  // Now record success - it also should *not* again record the failure duration.
+  telemetryHelper.nowInMinutes = () => 60000;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
+  Assert.deepEqual(getLoginTelemetryScalar(), {SUCCESS: 1});
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 0);
+
+  // Even though we were permanently rejected, the SUCCESS recording should
+  // have reset that state, so new error states should work as normal.
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.NOTVERIFIED);
+  Assert.deepEqual(getLoginTelemetryScalar(), {NOTVERIFIED: 1});
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 0);
+
+  telemetryHelper.nowInMinutes = () => 60001;
+  telemetryHelper.maybeRecordLoginState(telemetryHelper.STATES.SUCCESS);
+  Assert.deepEqual(getLoginTelemetryScalar(), {SUCCESS: 1});
+  Assert.equal(sumHistogram("WEAVE_LOGIN_FAILED_FOR"), 1);
+
+  cleanup();
+  run_next_test();
+});
+
+add_test(function test_bad_state() {
+  // We call the internal implementation to check it throws as the public
+  // method catches and logs.
+  Assert.throws(() => telemetryHelper._maybeRecordLoginState("foo"), /invalid state/);
+  cleanup();
+  run_next_test();
+});
--- a/services/sync/tests/unit/xpcshell.ini
+++ b/services/sync/tests/unit/xpcshell.ini
@@ -44,16 +44,17 @@ tags = addons
 [test_resource.js]
 [test_resource_async.js]
 [test_resource_header.js]
 [test_resource_ua.js]
 [test_syncstoragerequest.js]
 
 # Generic Sync types.
 [test_browserid_identity.js]
+[test_browserid_identity_telemetry.js]
 [test_collection_getBatched.js]
 [test_collections_recovery.js]
 [test_keys.js]
 [test_records_crypto.js]
 [test_records_wbo.js]
 
 # Engine APIs.
 [test_engine.js]
--- a/toolkit/components/telemetry/Histograms.json
+++ b/toolkit/components/telemetry/Histograms.json
@@ -10810,16 +10810,27 @@
   "WEAVE_COMPLETE_SUCCESS_COUNT": {
     "record_in_processes": ["main", "content"],
     "expires_in_version": "default",
     "kind": "exponential",
     "high": 1000,
     "n_buckets": 10,
     "description": "The number of times a sync successfully completed in this session"
   },
+  "WEAVE_LOGIN_FAILED_FOR": {
+    "record_in_processes": ["main"],
+    "expires_in_version": "65",
+    "alert_emails": ["sync-dev@mozilla.org"],
+    "kind": "exponential",
+    "high": 20000,
+    "n_buckets": 100,
+    "releaseChannelCollection": "opt-out",
+    "bug_numbers": [1375635],
+    "description": "The number of minutes the user was in a bad login state."
+  },
   "WEAVE_WIPE_SERVER_SUCCEEDED": {
     "record_in_processes": ["main", "content"],
     "expires_in_version": "55",
     "alert_emails": ["sync-dev@mozilla.org"],
     "kind": "boolean",
     "bug_numbers": [1241699],
     "description": "Stores 1 if a wipeServer call succeeded, and 0 if it failed."
   },
--- a/toolkit/components/telemetry/Scalars.yaml
+++ b/toolkit/components/telemetry/Scalars.yaml
@@ -423,16 +423,31 @@ services.sync:
     kind: string
     keyed: false
     notification_emails:
       - sync-staff@mozilla.com
     release_channel_collection: opt-out
     record_in_processes:
       - main
 
+  sync_login_state_transitions:
+    bug_numbers:
+      - 1375635
+    description: >
+      Records transitions between login states of "success", "unverified"
+      and "rejected.
+    expires: "65"
+    kind: boolean
+    keyed: true
+    notification_emails:
+      - sync-staff@mozilla.com
+    release_channel_collection: opt-out
+    record_in_processes:
+      - main
+
 security:
   pkcs11_modules_loaded:
     bug_numbers:
       - 1369911
     description: >
       A keyed boolean indicating the library names of the PKCS#11 modules that
       have been loaded by the browser.
     expires: "62"
--- a/tools/lint/eslint/modules.json
+++ b/tools/lint/eslint/modules.json
@@ -220,17 +220,17 @@
   "tokenserverclient.js": ["TokenServerClient", "TokenServerClientError", "TokenServerClientNetworkError", "TokenServerClientServerError"],
   "ToolboxProcess.jsm": ["BrowserToolboxProcess"],
   "tps.jsm": ["ACTIONS", "TPS"],
   "Translation.jsm": ["Translation", "TranslationTelemetry"],
   "Traversal.jsm": ["TraversalRules", "TraversalHelper"],
   "UpdateTelemetry.jsm": ["AUSTLMY"],
   "UpdateTopLevelContentWindowIDHelper.jsm": ["trackBrowserWindow"],
   "util.js": ["getChromeWindow", "Utils", "Svc"],
-  "utils.js": ["applicationName", "assert", "Copy", "getBrowserObject", "getChromeWindow", "getWindows", "getWindowByTitle", "getWindowByType", "getWindowId", "getMethodInWindows", "getPreference", "saveDataURL", "setPreference", "sleep", "startTimer", "stopTimer", "takeScreenshot", "unwrapNode", "waitFor", "btoa", "encryptPayload", "makeIdentityConfig", "makeFxAccountsInternalMock", "configureFxAccountIdentity", "configureIdentity", "SyncTestingInfrastructure", "waitForZeroTimer", "Promise", "MockFxaStorageManager", "AccountState", "sumHistogram", "CommonUtils", "CryptoUtils", "TestingUtils", "promiseZeroTimer", "promiseNamedTimer"],
+  "utils.js": ["applicationName", "assert", "Copy", "getBrowserObject", "getChromeWindow", "getWindows", "getWindowByTitle", "getWindowByType", "getWindowId", "getMethodInWindows", "getPreference", "saveDataURL", "setPreference", "sleep", "startTimer", "stopTimer", "takeScreenshot", "unwrapNode", "waitFor", "btoa", "encryptPayload", "makeIdentityConfig", "makeFxAccountsInternalMock", "configureFxAccountIdentity", "configureIdentity", "SyncTestingInfrastructure", "waitForZeroTimer", "Promise", "MockFxaStorageManager", "AccountState", "sumHistogram", "CommonUtils", "CryptoUtils", "TestingUtils", "promiseZeroTimer", "promiseNamedTimer", "getLoginTelemetryScalar"],
   "Utils.jsm": ["Utils", "Logger", "PivotContext", "PrefCache"],
   "VariablesView.jsm": ["VariablesView", "escapeHTML"],
   "VariablesViewController.jsm": ["VariablesViewController", "StackFrameUtils"],
   "version.jsm": ["VERSION"],
   "vtt.jsm": ["WebVTT"],
   "WebChannel.jsm": ["WebChannel", "WebChannelBroker"],
   "WindowDraggingUtils.jsm": ["WindowDraggingElement"],
   "windows.js": ["init", "map"],