author Evelyn Hung <jj.evelyn@gmail.com>

Mon, 12 Jun 2017 17:16:28 +0800

changeset 592469 601390e03132687befc9205771161b9253d20262

parent 592464 27cad9749cddf68e11fdd4e5d73dad84a8f8cf23

child 592470 846ea2c0b4a05cf1ebcbf5cd593635a65f1b83e4

child 593180 ccf0f0e453819cb4f782ab081a42e10b470a8e91

push id 63409

push user bmo:ehung@mozilla.com

push date Mon, 12 Jun 2017 09:38:27 +0000

reviewers ehsan

bugs 1362858

milestone 55.0a1

extensions/spellcheck/src/mozInlineSpellWordUtil.cpp file | annotate | diff | comparison | revisions
--- a/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp
+++ b/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp
@@ -340,16 +340,257 @@ mozInlineSpellWordUtil::MakeRange(NodeOf
   if (NS_WARN_IF(NS_FAILED(rv))) {
     return rv;
   }
   range.forget(aRange);
 
   return NS_OK;
 }
 
+/*********** Word Splitting ************/
+
+// classifies a given character in the DOM word
+enum CharClass {
+  CHAR_CLASS_WORD,
+  CHAR_CLASS_SEPARATOR,
+  CHAR_CLASS_END_OF_INPUT };
+
+// Encapsulates DOM-word to real-word splitting
+struct MOZ_STACK_CLASS WordSplitState
+{
+  mozInlineSpellWordUtil*    mWordUtil;
+  const nsDependentSubstring mDOMWordText;
+  int32_t                    mDOMWordOffset;
+  CharClass                  mCurCharClass;
+
+  WordSplitState(mozInlineSpellWordUtil* aWordUtil,
+                 const nsString& aString, int32_t aStart, int32_t aLen)
+    : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),
+      mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
+
+  CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
+  void Advance();
+  void AdvanceThroughSeparators();
+  void AdvanceThroughWord();
+
+  // Finds special words like email addresses and URLs that may start at the
+  // current position, and returns their length, or 0 if not found. This allows
+  // arbitrary word breaking rules to be used for these special entities, as
+  // long as they can not contain whitespace.
+  bool IsSpecialWord();
+
+  // Similar to IsSpecialWord except that this takes a split word as
+  // input. This checks for things that do not require special word-breaking
+  // rules.
+  bool ShouldSkipWord(int32_t aStart, int32_t aLength);
+};
+
+// WordSplitState::ClassifyCharacter
+
+CharClass
+WordSplitState::ClassifyCharacter(int32_t aIndex, bool aRecurse) const
+{
+  NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
+               "Index out of range");
+  if (aIndex == int32_t(mDOMWordText.Length()))
+    return CHAR_CLASS_SEPARATOR;
+
+  // this will classify the character, we want to treat "ignorable" characters
+  // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
+  nsUGenCategory charCategory =
+    mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]);
+  if (charCategory == nsUGenCategory::kLetter ||
+      IsIgnorableCharacter(mDOMWordText[aIndex]) ||
+      mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
+      mDOMWordText[aIndex] == 0x200D /* ZWJ */)
+    return CHAR_CLASS_WORD;
+
+  // If conditional punctuation is surrounded immediately on both sides by word
+  // characters it also counts as a word character.
+  if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
+    if (!aRecurse) {
+      // not allowed to look around, this punctuation counts like a separator
+      return CHAR_CLASS_SEPARATOR;
+    }
+
+    // check the left-hand character
+    if (aIndex == 0)
+      return CHAR_CLASS_SEPARATOR;
+    if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
+      return CHAR_CLASS_SEPARATOR;
+    // If the previous charatcer is a word-char, make sure that it's not a
+    // special dot character.
+    if (mDOMWordText[aIndex - 1] == '.')
+      return CHAR_CLASS_SEPARATOR;
+
+    // now we know left char is a word-char, check the right-hand character
+    if (aIndex == int32_t(mDOMWordText.Length()) - 1)
+      return CHAR_CLASS_SEPARATOR;
+    if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
+      return CHAR_CLASS_SEPARATOR;
+    // If the next charatcer is a word-char, make sure that it's not a
+    // special dot character.
+    if (mDOMWordText[aIndex + 1] == '.')
+      return CHAR_CLASS_SEPARATOR;
+
+    // char on either side is a word, this counts as a word
+    return CHAR_CLASS_WORD;
+  }
+
+  // The dot character, if appearing at the end of a word, should
+  // be considered part of that word.  Example: "etc.", or
+  // abbreviations
+  if (aIndex > 0 &&
+      mDOMWordText[aIndex] == '.' &&
+      mDOMWordText[aIndex - 1] != '.' &&
+      ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
+    return CHAR_CLASS_WORD;
+  }
+
+  // all other punctuation
+  if (charCategory == nsUGenCategory::kSeparator ||
+      charCategory == nsUGenCategory::kOther ||
+      charCategory == nsUGenCategory::kPunctuation ||
+      charCategory == nsUGenCategory::kSymbol) {
+    // Don't break on hyphens, as hunspell handles them on its own.
+    if (aIndex > 0 &&
+        mDOMWordText[aIndex] == '-' &&
+        mDOMWordText[aIndex - 1] != '-' &&
+        ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
+      // A hyphen is only meaningful as a separator inside a word
+      // if the previous and next characters are a word character.
+      if (aIndex == int32_t(mDOMWordText.Length()) - 1)
+        return CHAR_CLASS_SEPARATOR;
+      if (mDOMWordText[aIndex + 1] != '.' &&
+          ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
+        return CHAR_CLASS_WORD;
+    }
+    return CHAR_CLASS_SEPARATOR;
+  }
+
+  // any other character counts as a word
+  return CHAR_CLASS_WORD;
+}
+
+
+// WordSplitState::Advance
+
+void
+WordSplitState::Advance()
+{
+  NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
+  NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
+               "Length beyond end");
+
+  mDOMWordOffset ++;
+  if (mDOMWordOffset >= (int32_t)mDOMWordText.Length())
+    mCurCharClass = CHAR_CLASS_END_OF_INPUT;
+  else
+    mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);
+}
+
+
+// WordSplitState::AdvanceThroughSeparators
+
+void
+WordSplitState::AdvanceThroughSeparators()
+{
+  while (mCurCharClass == CHAR_CLASS_SEPARATOR)
+    Advance();
+}
+
+// WordSplitState::AdvanceThroughWord
+
+void
+WordSplitState::AdvanceThroughWord()
+{
+  while (mCurCharClass == CHAR_CLASS_WORD)
+    Advance();
+}
+
+
+// WordSplitState::IsSpecialWord
+
+bool
+WordSplitState::IsSpecialWord()
+{
+  // Search for email addresses. We simply define these as any sequence of
+  // characters with an '@' character in the middle. The DOM word is already
+  // split on whitepace, so we know that everything to the end is the address
+  int32_t firstColon = -1;
+  for (int32_t i = mDOMWordOffset;
+       i < int32_t(mDOMWordText.Length()); i ++) {
+    if (mDOMWordText[i] == '@') {
+      // only accept this if there are unambiguous word characters (don't bother
+      // recursing to disambiguate apostrophes) on each side. This prevents
+      // classifying, e.g. "@home" as an email address
+
+      // Use this condition to only accept words with '@' in the middle of
+      // them. It works, but the inlinespellcker doesn't like this. The problem
+      // is that you type "fhsgfh@" that's a misspelled word followed by a
+      // symbol, but when you type another letter "fhsgfh@g" that first word
+      // need to be unmarked misspelled. It doesn't do this. it only checks the
+      // current position for potentially removing a spelling range.
+      if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
+          i < (int32_t)mDOMWordText.Length() - 1 &&
+          ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
+        return true;
+      }
+    } else if (mDOMWordText[i] == ':' && firstColon < 0) {
+      firstColon = i;
+
+      // If the first colon is followed by a slash, consider it a URL
+      // This will catch things like asdf://foo.com
+      if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&
+          mDOMWordText[firstColon + 1] == '/') {
+        return true;
+      }
+    }
+  }
+
+  // Check the text before the first colon against some known protocols. It
+  // is impossible to check against all protocols, especially since you can
+  // plug in new protocols. We also don't want to waste time here checking
+  // against a lot of obscure protocols.
+  if (firstColon > mDOMWordOffset) {
+    nsString protocol(Substring(mDOMWordText, mDOMWordOffset,
+                      firstColon - mDOMWordOffset));
+    if (protocol.EqualsIgnoreCase("http") ||
+        protocol.EqualsIgnoreCase("https") ||
+        protocol.EqualsIgnoreCase("news") ||
+        protocol.EqualsIgnoreCase("file") ||
+        protocol.EqualsIgnoreCase("javascript") ||
+        protocol.EqualsIgnoreCase("data") ||
+        protocol.EqualsIgnoreCase("ftp")) {
+      return true;
+    }
+  }
+
+  // not anything special
+  return false;
+}
+
+// WordSplitState::ShouldSkipWord
+
+bool
+WordSplitState::ShouldSkipWord(int32_t aStart, int32_t aLength)
+{
+  int32_t last = aStart + aLength;
+
+  // check to see if the word contains a digit
+  for (int32_t i = aStart; i < last; i ++) {
+    if (unicode::GetGenCategory(mDOMWordText[i]) == nsUGenCategory::kNumber) {
+      return true;
+    }
+  }
+
+  // not special
+  return false;
+}
+
 /*********** DOM text extraction ************/
 
 // IsDOMWordSeparator
 //
 //    Determines if the given character should be considered as a DOM Word
 //    separator. Basically, this is whitespace, although it could also have
 //    certain punctuation that we know ALWAYS breaks words. This is important.
 //    For example, we can't have any punctuation that could appear in a URL
@@ -399,21 +640,30 @@ static bool
 TextNodeContainsDOMWordSeparator(nsINode* aNode,
                                  int32_t aBeforeOffset,
                                  int32_t* aSeparatorOffset)
 {
   // aNode is actually an nsIContent, since it's eTEXT
   nsIContent* content = static_cast<nsIContent*>(aNode);
   const nsTextFragment* textFragment = content->GetText();
   NS_ASSERTION(textFragment, "Where is our text?");
-  for (int32_t i = std::min(aBeforeOffset, int32_t(textFragment->GetLength())) - 1; i >= 0; --i) {
-    if (IsDOMWordSeparator(textFragment->CharAt(i))) {
+  nsString text;
+  int32_t end = std::min(aBeforeOffset, int32_t(textFragment->GetLength()));
+  bool ok = textFragment->AppendTo(text, 0, end, mozilla::fallible);
+  if(!ok)
+    return false;
+
+  WordSplitState state(nullptr, text, 0, end);
+  for (int32_t i = end - 1; i >= 0; --i) {
+    if (IsDOMWordSeparator(textFragment->CharAt(i)) ||
+        state.ClassifyCharacter(i, true) == CHAR_CLASS_SEPARATOR) {
       // Be greedy, find as many separators as we can
       for (int32_t j = i - 1; j >= 0; --j) {
-        if (IsDOMWordSeparator(textFragment->CharAt(j))) {
+        if (IsDOMWordSeparator(textFragment->CharAt(j)) ||
+            state.ClassifyCharacter(j, true) == CHAR_CLASS_SEPARATOR) {
           i = j;
         } else {
           break;
         }
       }
       *aSeparatorOffset = i;
       return true;
     }
@@ -797,257 +1047,16 @@ mozInlineSpellWordUtil::FindRealWordCont
     // mSoftTextOffset > aSoftTextOffset.
     if (index + 1 < mRealWords.Length())
       return index + 1;
   }
 
   return -1;
 }
 
-/*********** Word Splitting ************/
-
-// classifies a given character in the DOM word
-enum CharClass {
-  CHAR_CLASS_WORD,
-  CHAR_CLASS_SEPARATOR,
-  CHAR_CLASS_END_OF_INPUT };
-
-// Encapsulates DOM-word to real-word splitting
-struct MOZ_STACK_CLASS WordSplitState
-{
-  mozInlineSpellWordUtil*    mWordUtil;
-  const nsDependentSubstring mDOMWordText;
-  int32_t                    mDOMWordOffset;
-  CharClass                  mCurCharClass;
-
-  WordSplitState(mozInlineSpellWordUtil* aWordUtil,
-                 const nsString& aString, int32_t aStart, int32_t aLen)
-    : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),
-      mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
-
-  CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
-  void Advance();
-  void AdvanceThroughSeparators();
-  void AdvanceThroughWord();
-
-  // Finds special words like email addresses and URLs that may start at the
-  // current position, and returns their length, or 0 if not found. This allows
-  // arbitrary word breaking rules to be used for these special entities, as
-  // long as they can not contain whitespace.
-  bool IsSpecialWord();
-
-  // Similar to IsSpecialWord except that this takes a split word as
-  // input. This checks for things that do not require special word-breaking
-  // rules.
-  bool ShouldSkipWord(int32_t aStart, int32_t aLength);
-};
-
-// WordSplitState::ClassifyCharacter
-
-CharClass
-WordSplitState::ClassifyCharacter(int32_t aIndex, bool aRecurse) const
-{
-  NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
-               "Index out of range");
-  if (aIndex == int32_t(mDOMWordText.Length()))
-    return CHAR_CLASS_SEPARATOR;
-
-  // this will classify the character, we want to treat "ignorable" characters
-  // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
-  nsUGenCategory charCategory =
-    mozilla::unicode::GetGenCategory(mDOMWordText[aIndex]);
-  if (charCategory == nsUGenCategory::kLetter ||
-      IsIgnorableCharacter(mDOMWordText[aIndex]) ||
-      mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
-      mDOMWordText[aIndex] == 0x200D /* ZWJ */)
-    return CHAR_CLASS_WORD;
-
-  // If conditional punctuation is surrounded immediately on both sides by word
-  // characters it also counts as a word character.
-  if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
-    if (!aRecurse) {
-      // not allowed to look around, this punctuation counts like a separator
-      return CHAR_CLASS_SEPARATOR;
-    }
-
-    // check the left-hand character
-    if (aIndex == 0)
-      return CHAR_CLASS_SEPARATOR;
-    if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
-      return CHAR_CLASS_SEPARATOR;
-    // If the previous charatcer is a word-char, make sure that it's not a
-    // special dot character.
-    if (mDOMWordText[aIndex - 1] == '.')
-      return CHAR_CLASS_SEPARATOR;
-
-    // now we know left char is a word-char, check the right-hand character
-    if (aIndex == int32_t(mDOMWordText.Length()) - 1)
-      return CHAR_CLASS_SEPARATOR;
-    if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
-      return CHAR_CLASS_SEPARATOR;
-    // If the next charatcer is a word-char, make sure that it's not a
-    // special dot character.
-    if (mDOMWordText[aIndex + 1] == '.')
-      return CHAR_CLASS_SEPARATOR;
-
-    // char on either side is a word, this counts as a word
-    return CHAR_CLASS_WORD;
-  }
-
-  // The dot character, if appearing at the end of a word, should
-  // be considered part of that word.  Example: "etc.", or
-  // abbreviations
-  if (aIndex > 0 &&
-      mDOMWordText[aIndex] == '.' &&
-      mDOMWordText[aIndex - 1] != '.' &&
-      ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
-    return CHAR_CLASS_WORD;
-  }
-
-  // all other punctuation
-  if (charCategory == nsUGenCategory::kSeparator ||
-      charCategory == nsUGenCategory::kOther ||
-      charCategory == nsUGenCategory::kPunctuation ||
-      charCategory == nsUGenCategory::kSymbol) {
-    // Don't break on hyphens, as hunspell handles them on its own.
-    if (aIndex > 0 &&
-        mDOMWordText[aIndex] == '-' &&
-        mDOMWordText[aIndex - 1] != '-' &&
-        ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
-      // A hyphen is only meaningful as a separator inside a word
-      // if the previous and next characters are a word character.
-      if (aIndex == int32_t(mDOMWordText.Length()) - 1)
-        return CHAR_CLASS_SEPARATOR;
-      if (mDOMWordText[aIndex + 1] != '.' &&
-          ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
-        return CHAR_CLASS_WORD;
-    }
-    return CHAR_CLASS_SEPARATOR;
-  }
-
-  // any other character counts as a word
-  return CHAR_CLASS_WORD;
-}
-
-
-// WordSplitState::Advance
-
-void
-WordSplitState::Advance()
-{
-  NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
-  NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
-               "Length beyond end");
-
-  mDOMWordOffset ++;
-  if (mDOMWordOffset >= (int32_t)mDOMWordText.Length())
-    mCurCharClass = CHAR_CLASS_END_OF_INPUT;
-  else
-    mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);
-}
-
-
-// WordSplitState::AdvanceThroughSeparators
-
-void
-WordSplitState::AdvanceThroughSeparators()
-{
-  while (mCurCharClass == CHAR_CLASS_SEPARATOR)
-    Advance();
-}
-
-// WordSplitState::AdvanceThroughWord
-
-void
-WordSplitState::AdvanceThroughWord()
-{
-  while (mCurCharClass == CHAR_CLASS_WORD)
-    Advance();
-}
-
-
-// WordSplitState::IsSpecialWord
-
-bool
-WordSplitState::IsSpecialWord()
-{
-  // Search for email addresses. We simply define these as any sequence of
-  // characters with an '@' character in the middle. The DOM word is already
-  // split on whitepace, so we know that everything to the end is the address
-  int32_t firstColon = -1;
-  for (int32_t i = mDOMWordOffset;
-       i < int32_t(mDOMWordText.Length()); i ++) {
-    if (mDOMWordText[i] == '@') {
-      // only accept this if there are unambiguous word characters (don't bother
-      // recursing to disambiguate apostrophes) on each side. This prevents
-      // classifying, e.g. "@home" as an email address
-
-      // Use this condition to only accept words with '@' in the middle of
-      // them. It works, but the inlinespellcker doesn't like this. The problem
-      // is that you type "fhsgfh@" that's a misspelled word followed by a
-      // symbol, but when you type another letter "fhsgfh@g" that first word
-      // need to be unmarked misspelled. It doesn't do this. it only checks the
-      // current position for potentially removing a spelling range.
-      if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
-          i < (int32_t)mDOMWordText.Length() - 1 &&
-          ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
-        return true;
-      }
-    } else if (mDOMWordText[i] == ':' && firstColon < 0) {
-      firstColon = i;
-
-      // If the first colon is followed by a slash, consider it a URL
-      // This will catch things like asdf://foo.com
-      if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&
-          mDOMWordText[firstColon + 1] == '/') {
-        return true;
-      }
-    }
-  }
-
-  // Check the text before the first colon against some known protocols. It
-  // is impossible to check against all protocols, especially since you can
-  // plug in new protocols. We also don't want to waste time here checking
-  // against a lot of obscure protocols.
-  if (firstColon > mDOMWordOffset) {
-    nsString protocol(Substring(mDOMWordText, mDOMWordOffset,
-                      firstColon - mDOMWordOffset));
-    if (protocol.EqualsIgnoreCase("http") ||
-        protocol.EqualsIgnoreCase("https") ||
-        protocol.EqualsIgnoreCase("news") ||
-        protocol.EqualsIgnoreCase("file") ||
-        protocol.EqualsIgnoreCase("javascript") ||
-        protocol.EqualsIgnoreCase("data") ||
-        protocol.EqualsIgnoreCase("ftp")) {
-      return true;
-    }
-  }
-
-  // not anything special
-  return false;
-}
-
-// WordSplitState::ShouldSkipWord
-
-bool
-WordSplitState::ShouldSkipWord(int32_t aStart, int32_t aLength)
-{
-  int32_t last = aStart + aLength;
-
-  // check to see if the word contains a digit
-  for (int32_t i = aStart; i < last; i ++) {
-    if (unicode::GetGenCategory(mDOMWordText[i]) == nsUGenCategory::kNumber) {
-      return true;
-    }
-  }
-
-  // not special
-  return false;
-}
-
 // mozInlineSpellWordUtil::SplitDOMWord
 
 nsresult
 mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart, int32_t aEnd)
 {
   WordSplitState state(this, mSoftText, aStart, aEnd - aStart);
   state.mCurCharClass = state.ClassifyCharacter(0, true);
author	Evelyn Hung <jj.evelyn@gmail.com>
	Mon, 12 Jun 2017 17:16:28 +0800
changeset 592469	601390e03132687befc9205771161b9253d20262
parent 592464	27cad9749cddf68e11fdd4e5d73dad84a8f8cf23
child 592470	846ea2c0b4a05cf1ebcbf5cd593635a65f1b83e4
child 593180	ccf0f0e453819cb4f782ab081a42e10b470a8e91
push id	63409
push user	bmo:ehung@mozilla.com
push date	Mon, 12 Jun 2017 09:38:27 +0000
reviewers	ehsan
bugs	1362858
milestone	55.0a1