Bug 1323333 - Implement limited same-level bookmark deduping. draft
authorKit Cambridge <kit@yakshaving.ninja>
Thu, 27 Apr 2017 15:22:32 -0700
changeset 569796 73531e63fdae220e3008e5ab5404b139eb0e8001
parent 569795 ec2806ff1649b345e7999a42876592e5dd36ff63
child 569797 49c71c016a876ab1fe126bfe3dc26b6d4ed6edbf
push id56282
push userbmo:kit@mozilla.com
push dateThu, 27 Apr 2017 22:47:44 +0000
bugs1323333
milestone55.0a1
Bug 1323333 - Implement limited same-level bookmark deduping. MozReview-Commit-ID: 3NhKo3B5fkF
services/sync/modules/engines/bookmarks.js
toolkit/components/places/PlacesSyncUtils.jsm
--- a/services/sync/modules/engines/bookmarks.js
+++ b/services/sync/modules/engines/bookmarks.js
@@ -346,19 +346,23 @@ BookmarksEngine.prototype = {
     return false;
   },
 
   _processIncoming(newitems) {
     SyncEngine.prototype._processIncoming.call(this, newitems);
     Async.promiseSpinningly(this._postProcessIncoming());
   },
 
-  // Applies pending tombstones, sets folder child order, and updates the sync
-  // status of all `NEW` bookmarks to `NORMAL`.
+  // Pares duplicates, applies pending tombstones, sets folder child order,
+  // and update the sync status of all `NEW` bookmarks to `NORMAL`.
   async _postProcessIncoming() {
+    let childIndices = this._store.getChildIndices();
+    let newChanges = await PlacesSyncUtils.bookmarks.dedupe(childIndices);
+    this._modified.replace(newChanges);
+
     await this._deletePending();
     await this._orderChildren();
 
     let changes = this._modified.changes;
     await PlacesSyncUtils.bookmarks.markChangesAsSyncing(changes);
   },
 
   async _orderChildren() {
@@ -660,17 +664,29 @@ BookmarksStore.prototype = {
 
   wipe: function BStore_wipe() {
     this.clearPendingDeletions();
     Async.promiseSpinningly(Task.spawn(function* () {
       // Save a backup before clearing out all bookmarks.
       yield PlacesBackups.create(null, true);
       yield PlacesSyncUtils.bookmarks.wipe();
     }));
-  }
+  },
+
+  getChildIndices() {
+    let childIndices = new Map();
+    for (let syncId in this._childrenToOrder) {
+      let childSyncIds = this._childrenToOrder[syncId];
+      for (let index = 0; index < childSyncIds.length; index++) {
+        let syncId = childSyncIds[index];
+        childIndices.set(syncId, index);
+      }
+    }
+    return childIndices;
+  },
 };
 
 // The bookmarks tracker is a special flower. Instead of listening for changes
 // via observer notifications, it queries Places for the set of items that have
 // changed since the last sync. Because it's a "pull-based" tracker, it ignores
 // all concepts of "add a changed ID." However, it still registers an observer
 // to bump the score, so that changed bookmarks are synced immediately.
 function BookmarksTracker(name, engine) {
--- a/toolkit/components/places/PlacesSyncUtils.jsm
+++ b/toolkit/components/places/PlacesSyncUtils.jsm
@@ -572,16 +572,75 @@ const BookmarkSyncUtils = PlacesSyncUtil
           // Drop stale tombstones.
           yield db.executeCached("DELETE FROM moz_bookmarks_deleted");
         });
       }
     );
   }),
 
   /**
+   * Deduping looks for local siblings with the same attributes (title, URL,
+   * separator index, smart bookmark anno), and removes them if there are
+   * matching remote siblings. Only items that haven't been uploaded yet are
+   * candidates for deduping; we'd corrupt the server if we tried to dedupe
+   * items that already exist on other clients.
+   *
+   * In most cases, deduping won't be necessary. Item GUIDs are preserved in
+   * backups, so syncing a device with restored bookmarks should trigger Sync's
+   * `update` logic. While it's possible to create explicit dupes with the same
+   * parent and attributes, most users don't organize their bookmarks this way,
+   * and starring moves existing bookmarks instead of creating dupes. The only
+   * time we expect deduping to be necessary is on the first sync, where other
+   * devices might have similar, manually-added bookmarks with different GUIDs.
+   *
+   * During the download phase, we apply all incoming items, including dupes.
+   * Before uploading, we scan the tree and collapse similar `NEW` and `UNKNOWN`
+   * items.
+   *
+   * How it works:
+   *
+   * 1. Walk all syncable roots to find folders with the same name at the same
+   *    level. We group by the level instead of the parent because items in
+   *    local subtrees are still congruent with remote subtrees, but won't have
+   *    the same parents.
+   * 2. Group the dupe folders by level and title.
+   * 3. Each group may have a different number of local (`NEW` and `UNKNOWN`)
+   *    dupes and remote (`NORMAL`) matches. For example, if we have two local
+   *    folders (A, AA) and two remote folders (B, BB) with the same name and
+   *    level, we'll merge A into B and AA into BB. If we have (A, AA) locally,
+   *    and (B) remotely, we'll merge A into B, and leave AA as a dupe.
+   *    Similarly, if we have (A) locally, and (B, BB) remotely, we'll merge A
+   *    into BB and take BB as a dupe.
+   * 4. Sort the local and remote dupes by position, then zip the two
+   *    lists into a list of (local, remote) tuples. Sorting helps us pick the
+   *    correct remote match if there are explicit dupes.
+   * 5. Move the contents of each local folder into the matching remote folder.
+   *    For local bookmarks that don't exist on the server, this will preserve
+   *    their order relative to other local bookmarks, but *not* any matches
+   *    that exist on the server. This is a similar issue to bug 1352233.
+   * 6. Delete the (now-empty) local folder. If the user added new items to the
+   *    folder in the meantime, this will fail, and we'll keep the old folder
+   *    around so that they can manually move the items.
+   * 7. At this point, we have a single subtree with duplicate non-folder
+   *    descendants: bookmarks, smart bookmarks, livemarks, and separators.
+   * 8. Group all items with similar attributes and the same parent.
+   * 9. For each group, remove as many duplicate local items as we have remote
+   *    matches.
+   *
+   * And that's it. If the wind in my sail on the sea stays behind me, one day
+   * I'll know how far I'll go to dedupe all these bookmarks.
+   */
+  async dedupe(remoteIndices) {
+    let db = await PlacesUtils.promiseDBConnection();
+    await dedupeFolders(db, remoteIndices);
+    await dedupeItems(db, remoteIndices);
+    return pullSyncChanges(db);
+  },
+
+  /**
    * Updates a bookmark with synced properties. Only Sync should call this
    * method; other callers should use `Bookmarks.update`.
    *
    * The following properties are supported:
    *  - kind: Optional.
    *  - guid: Required.
    *  - parentGuid: Optional; reparents the bookmark if specified.
    *  - title: Optional.
@@ -1859,8 +1918,278 @@ function markChangesAsSyncing(db, change
 var removeTombstones = Task.async(function* (db, guids) {
   if (!guids.length) {
     return Promise.resolve();
   }
   return db.execute(`
     DELETE FROM moz_bookmarks_deleted
     WHERE guid IN (${guids.map(guid => JSON.stringify(guid)).join(",")})`);
 });
+
+// Merges local duplicate folders into matching remote folders.
+async function dedupeFolders(db, remoteIndices) {
+  let rows = await findDupeFolders(db);
+
+  let guidsToRemove = [];
+  for (let row of rows) {
+    let dupeGroup = row.getResultByName("dupeGroup");
+    let matches = matchDupeFolders(dupeGroup, remoteIndices);
+    for (let { oldGuid, newGuid } of matches) {
+      let childGuids = await fetchChildGuids(db, oldGuid);
+      for (let childGuid of childGuids) {
+        await PlacesUtils.bookmarks.update({
+          guid: childGuid,
+          parentGuid: newGuid,
+          index: PlacesUtils.bookmarks.DEFAULT_INDEX,
+          source: PlacesUtils.bookmarks.SOURCES.DEFAULT,
+        });
+      }
+      guidsToRemove.push(oldGuid);
+    }
+  }
+
+  BookmarkSyncLog.debug(`dedupeFolders: Removing deduped folders`,
+                        guidsToRemove);
+
+  for (let guid of guidsToRemove) {
+    try {
+      await PlacesUtils.bookmarks.remove(guid, {
+        preventRemovalOfNonEmptyFolders: true,
+        // TODO: Consider a different source. We shouldn't write
+        // tombstones in any case, since the folder we're deleting
+        // is `NEW`.
+        source: PlacesUtils.bookmarks.SOURCES.DEFAULT,
+      });
+    } catch (ex) {
+      BookmarkSyncLog.debug(`dedupeFolders: Error removing deduped folder ` +
+                            guid, ex);
+    }
+  }
+}
+
+// Removes all local duplicate bookmarks, queries, livemarks, and separators
+// that have remote matches.
+async function dedupeItems(db, remoteIndices) {
+  let rows = await findDupeItems(db);
+  for (let row of rows) {
+    let dupeGroup = row.getResultByName("dupeGroup");
+    let oldGuids = matchDupeItemGuids(dupeGroup, remoteIndices);
+    for (let guid of oldGuids) {
+      try {
+        await PlacesUtils.bookmarks.remove(guid, {
+          preventRemovalOfNonEmptyFolders: true,
+          source: PlacesUtils.bookmarks.SOURCES.DEFAULT,
+        });
+      } catch (ex) {
+        BookmarkSyncLog.debug(`dedupeItems: Error removing deduped item ` +
+                              guid, ex);
+      }
+    }
+  }
+}
+
+// Returns an array of rows containing folder dupe group keys. Instead of
+// selecting the duplicate rows and re-grouping in JS, we encode the dupe
+// info into a string "dupe key", and then abuse SQLite's `GROUP_CONCAT` to join
+// the dupe keys into a "dupe group key". The dupe key is `{level} |
+// {position} | {guid} | {syncStatus}`, and the dupe group key is multiple dupe
+// keys joined with a comma.
+async function findDupeFolders(db) {
+  return db.executeCached(`
+    WITH RECURSIVE
+    descendants(id, guid, syncStatus, level, position, title) AS (
+      SELECT b.id, b.guid, b.syncStatus, 0, b.position, b.title
+      FROM moz_bookmarks b
+      WHERE b.guid IN ('menu________', 'toolbar_____', 'unfiled_____',
+                       'mobile______')
+      UNION ALL
+      SELECT b.id, b.guid, b.syncStatus, d.level + 1, b.position, b.title
+      FROM moz_bookmarks b
+      JOIN descendants d ON d.id = b.parent
+      WHERE b.type = :type AND NOT EXISTS (
+        SELECT 1
+        FROM moz_items_annos a
+        JOIN moz_anno_attributes n ON n.id = a.anno_attribute_id
+        WHERE a.item_id = b.id AND
+              n.name = :livemarksAnno
+      )
+    )
+    SELECT GROUP_CONCAT(d.position || '|' ||
+                        d.guid || '|' ||
+                        d.syncStatus, ',') AS dupeGroup
+    FROM descendants d
+    GROUP BY d.level, d.title
+    HAVING COUNT(*) > 1
+  `, { type: PlacesUtils.bookmarks.TYPE_FOLDER,
+       livemarksAnno: PlacesUtils.LMANNO_FEEDURI });
+}
+
+// Returns an array of rows containing smart bookmark, bookmark, livemark, and
+// separator dupe group keys. See `findDupeFolders` for an explanation of how
+// dupe keys and dupe group keys work. The dupe key for these items is just
+// `{guid} | {syncStatus}`; unlike folders, we don't need the level or position.
+async function findDupeItems(db) {
+  return db.executeCached(`
+    WITH
+    smartBookmarks(id, parent, smartBookmarkName, position, guid,
+                   syncStatus) AS (
+      SELECT b.id, b.parent, a.content, b.position, b.guid,
+             b.syncStatus
+      FROM moz_bookmarks b
+      JOIN moz_items_annos a ON a.item_id = b.id
+      JOIN moz_anno_attributes n ON n.id = a.anno_attribute_id
+      WHERE b.type = :bookmarkType AND
+            n.name = :smartBookmarksAnno
+    )
+
+    /* Smart bookmarks: parent, query name. */
+    SELECT GROUP_CONCAT(s.position || '|' ||
+                        s.guid || '|' ||
+                        s.syncStatus, ',') as dupeGroup
+    FROM smartBookmarks s
+    GROUP BY s.parent, s.smartBookmarkName
+    HAVING COUNT(*) > 1
+    UNION ALL
+
+    /* Bookmarks: parent, title, URL. */
+    SELECT GROUP_CONCAT(b.position || '|' ||
+                        b.guid || '|' ||
+                        b.syncStatus, ',') as dupeGroup
+    FROM moz_bookmarks b
+    JOIN moz_places h ON h.id = b.fk
+    LEFT JOIN smartBookmarks s ON s.id = b.id
+    WHERE b.type = :bookmarkType AND
+          s.id IS NULL
+    GROUP BY b.parent, b.title, h.url_hash, h.url
+    HAVING COUNT(*) > 1
+    UNION ALL
+
+    /* Livemarks: parent, title. TODO: Feed URL? */
+    SELECT GROUP_CONCAT(b.position || '|' ||
+                        b.guid || '|' ||
+                        b.syncStatus, ',') as dupeGroup
+    FROM moz_bookmarks b
+    JOIN moz_items_annos a ON a.item_id = b.id
+    JOIN moz_anno_attributes n ON n.id = a.anno_attribute_id
+    WHERE b.type = :folderType AND
+          n.name = :livemarksAnno
+    GROUP BY b.parent, b.title
+    HAVING COUNT(*) > 1
+    UNION ALL
+
+    /* Separators: parent. TODO: Only dupe immediate siblings? */
+    SELECT GROUP_CONCAT(b.position || '|' ||
+                        b.guid || '|' ||
+                        b.syncStatus, ',') as dupeGroup
+    FROM moz_bookmarks b
+    WHERE b.type = :separatorType
+    GROUP BY b.parent
+    HAVING COUNT(*) > 1
+  `, { bookmarkType: PlacesUtils.bookmarks.TYPE_BOOKMARK,
+       smartBookmarksAnno: BookmarkSyncUtils.SMART_BOOKMARKS_ANNO,
+       folderType: PlacesUtils.bookmarks.TYPE_FOLDER,
+       livemarksAnno: PlacesUtils.LMANNO_FEEDURI,
+       separatorType: PlacesUtils.bookmarks.TYPE_SEPARATOR });
+}
+
+function dupeGroupKeyToInfos(dupeGroup, remoteIndices) {
+  let infos = { local: [], remote: [] };
+
+  // Extract the position, GUID, and sync status of every dupe in the group,
+  // and determine whether the item is local or remote.
+  let dupes = dupeGroup.split(",");
+  for (let dupe of dupes) {
+    let [index, guid, syncStatus] = dupe.split("|");
+    let info = {
+      index: +index,
+      guid,
+      syncStatus: +syncStatus,
+    };
+    switch (info.syncStatus) {
+      // `NEW` and `UNKNOWN` items are candidates for deduping. `NEW` items
+      // aren't on the server yet, so we won't write tombstones or update their
+      // change counters if we have matching `NORMAL` bookmarks. `UNKNOWN`
+      // bookmarks may be on the server; we'll still dedupe them, but we'll need
+      // to bump their change counters (TODO: Actually do this).
+      case PlacesUtils.bookmarks.SYNC_STATUS.NEW:
+      case PlacesUtils.bookmarks.SYNC_STATUS.UNKNOWN:
+        infos.local.push(info);
+        break;
+
+      // `NORMAL` bookmarks are on the server, so they're possible matches for
+      // local dupes. We'll always match local to remote; never local to local
+      // or remote to remote.
+      case PlacesUtils.bookmarks.SYNC_STATUS.NORMAL:
+        infos.remote.push(info);
+        break;
+
+      default:
+        BookmarkSyncLog.debug(`dupeGroupKeyToInfos: Item ${info.guid} not ` +
+                              `duped because its sync status is ` +
+                              info.syncStatus);
+    }
+  }
+
+  // Sort the local dupes and remote matches by their indices. This makes it
+  // more likely that we'll correctly match up explicit dupes. We may dedupe
+  // incorrectly if the matches are in a different order, but there's not much
+  // we can do in that situation.
+  infos.local.sort((a, b) => b.index - a.index);
+  infos.remote.sort((a, b) => {
+    let aSyncId = BookmarkSyncUtils.syncIdToGuid(a.guid);
+    let bSyncId = BookmarkSyncUtils.syncIdToGuid(b.guid);
+    if (!remoteIndices.has(aSyncId) && !remoteIndices.has(bSyncId)) {
+      return 0;
+    }
+    if (!remoteIndices.has(aSyncId)) {
+      return 1;
+    }
+    if (!remoteIndices.has(bSyncId)) {
+      return -1;
+    }
+    return remoteIndices.get(b.guid) - remoteIndices.get(a.guid);
+  });
+
+  return infos;
+}
+
+// Returns an array of matching `{ oldGuid, newGuid }` tuples for the dupe
+// group. We'll need to move the contents of `oldGuid` into `newGuid`, then
+// remove `oldGuid`.
+function matchDupeFolders(dupeGroup, remoteIndices) {
+  let infos = dupeGroupKeyToInfos(dupeGroup, remoteIndices);
+  BookmarkSyncLog.debug(`matchDupeFolders: Found candidates for deduping`,
+                        infos);
+
+  // Map local folders to their matching remote folders. We can run out of local
+  // dupes or remote matches if there are explicit dupes. In that case, we'll
+  // leave the remaining candidates alone.
+  let matches = [];
+  let localIndex = 0;
+  let remoteIndex = 0;
+  while (localIndex < infos.local.length &&
+         remoteIndex < infos.remote.length) {
+    let oldGuid = infos.local[localIndex].guid;
+    let newGuid = infos.remote[remoteIndex].guid;
+    matches.push({ oldGuid, newGuid });
+    localIndex++;
+    remoteIndex++;
+  }
+
+  BookmarkSyncLog.debug(`matchDupeFolders: Deduping folders`, matches);
+  return matches;
+}
+
+// Returns an array of local non-folder dupe GUIDs that we should remove.
+function matchDupeItemGuids(dupeGroup, remoteIndices) {
+  let infos = dupeGroupKeyToInfos(dupeGroup, remoteIndices);
+  BookmarkSyncLog.debug(`matchDupeItemGuids: Found candidates for deduping`,
+                        infos);
+
+  // The number of local dupes to remove. We can run out of matches if there
+  // are explicit dupes; in that case, we'll leave the remaining local dupes
+  // alone.
+  let maxLocalItemsToDedupe = Math.min(infos.local.length, infos.remote.length);
+  let guidsToRemove = infos.local.slice(0, maxLocalItemsToDedupe);
+
+  BookmarkSyncLog.debug(`matchDupeItemGuids: Deduping items`, guidsToRemove);
+  return guidsToRemove;
+}