Bug 425915 - word boundary detection for Thai text draft
author Theppitak Karoonboonyanan <thep@linux.thai.net>
Sun, 23 Apr 2017 22:45:13 +0800
changeset 566776 7861406f8916b00b65a1bd737ca9566f74ac40dd
parent 566736 42c8a716081e6a8aa33c1e61944f36f6f89245fb
child 625418 46654c700300d4e223973d7bf6172358adab37b2
push id 55326
push user axel@mozilla.com
push date Sun, 23 Apr 2017 14:46:20 +0000
bugs 425915
milestone 55.0a1
Bug 425915 - word boundary detection for Thai text
MozReview-Commit-ID: 85aSPXLLBlW
intl/lwbrk/nsIWordBreaker.h
intl/lwbrk/nsSampleWordBreaker.cpp
intl/lwbrk/nsSampleWordBreaker.h
layout/generic/nsTextFrame.cpp
layout/generic/test/mochitest.ini
layout/generic/test/test_bug425915.html
--- a/intl/lwbrk/nsIWordBreaker.h
+++ b/intl/lwbrk/nsIWordBreaker.h
@@ -6,20 +6,20 @@
 #define nsIWordBreaker_h__
 
 #include "nsISupports.h"
 
 #include "nscore.h"
 
 #define NS_WORDBREAKER_NEED_MORE_TEXT -1
 
-// {E86B3379-BF89-11d2-B3AF-00805F8A6670}
+// {CE044592-9B21-4DB8-9AB6-D8F752D543BA}
 #define NS_IWORDBREAKER_IID \
-{ 0xe86b3379, 0xbf89, 0x11d2, \
-   { 0xb3, 0xaf, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
+{ 0xce044592, 0x9b21, 0x4db8, \
+   { 0x9a, 0xb6, 0xd8, 0xf7, 0x52, 0xd5, 0x43, 0xba } }
 
 typedef struct {
   uint32_t mBegin;
   uint32_t mEnd;
 } nsWordRange;
 
 class nsIWordBreaker : public nsISupports
 {
@@ -28,14 +28,21 @@ public:
 
   virtual bool BreakInBetween(const char16_t* aText1 , uint32_t aTextLen1,
                                 const char16_t* aText2 ,
                                 uint32_t aTextLen2) = 0;
   virtual nsWordRange FindWord(const char16_t* aText1 , uint32_t aTextLen1,
                                uint32_t aOffset) = 0;
   virtual int32_t NextWord(const char16_t* aText, uint32_t aLen, 
                            uint32_t aPos) = 0;
-                           
+
+  /* Analyze text in aText and turn on corresponding word break positions
+   * in aBreakBefore[].
+   *
+   * Note that aBreakBefore[] is assumed to be initially filled with false.
+   * Only word break positions are set to true, and the rest are untouched. */
+  virtual void GetWordBreaks(const char16_t* aText, uint32_t aLen,
+                             bool* aBreakBefore) = 0;
 };
 
 NS_DEFINE_STATIC_IID_ACCESSOR(nsIWordBreaker, NS_IWORDBREAKER_IID)
 
 #endif  /* nsIWordBreaker_h__ */
--- a/intl/lwbrk/nsSampleWordBreaker.cpp
+++ b/intl/lwbrk/nsSampleWordBreaker.cpp
@@ -1,15 +1,17 @@
 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 
 #include "nsSampleWordBreaker.h"
+#include "nsComplexBreaker.h"
+#include "nsTArray.h"
 
 nsSampleWordBreaker::nsSampleWordBreaker()
 {
 }
 nsSampleWordBreaker::~nsSampleWordBreaker()
 {
 }
 
@@ -20,17 +22,29 @@ bool nsSampleWordBreaker::BreakInBetween
   const char16_t* aText2 , uint32_t aTextLen2)
 {
   NS_PRECONDITION( nullptr != aText1, "null ptr");
   NS_PRECONDITION( nullptr != aText2, "null ptr");
 
   if(!aText1 || !aText2 || (0 == aTextLen1) || (0 == aTextLen2))
     return false;
 
-  return (this->GetClass(aText1[aTextLen1-1]) != this->GetClass(aText2[0]));
+  uint8_t c1 = this->GetClass(aText1[aTextLen1-1]);
+  uint8_t c2 = this->GetClass(aText2[0]);
+  if (kWbClassThaiLetter == c1 && kWbClassThaiLetter == c2)
+  {
+    nsAutoString text(aText1, aTextLen1);
+    text.Append(aText2, aTextLen2);
+    AutoTArray<uint8_t,100> breakBefore;
+    breakBefore.SetLength(aTextLen1 + aTextLen2);
+    NS_GetComplexLineBreaks(text.get(), text.Length(), breakBefore.Elements());
+    return breakBefore[aTextLen1];
+  }
+
+  return (c1 != c2);
 }
 
 
 #define IS_ASCII(c)            (0 == ( 0xFF80 & (c)))
 #define ASCII_IS_ALPHA(c)         ((( 'a' <= (c)) && ((c) <= 'z')) || (( 'A' <= (c)) && ((c) <= 'Z')))
 #define ASCII_IS_DIGIT(c)         (( '0' <= (c)) && ((c) <= '9'))
 #define ASCII_IS_SPACE(c)         (( ' ' == (c)) || ( '\t' == (c)) || ( '\r' == (c)) || ( '\n' == (c)))
 #define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80) 
@@ -113,18 +127,38 @@ nsWordRange nsSampleWordBreaker::FindWor
      if( c != this->GetClass(aText[i-1]))
      {
        range.mBegin = i;
        break;
      }
   }
   if(kWbClassThaiLetter == c)
   {
-	// need to call Thai word breaker from here
-	// we should pass the whole Thai segment to the thai word breaker to find a shorter answer
+    AutoTArray<uint8_t,100> breakBefore;
+    breakBefore.SetLength(range.mEnd - range.mBegin);
+    NS_GetComplexLineBreaks(aText + range.mBegin, range.mEnd - range.mBegin,
+                            breakBefore.Elements());
+    // Scan forward
+    for (i = aOffset + 1; i < range.mEnd; i++)
+    {
+      if (breakBefore[i - range.mBegin])
+      {
+        range.mEnd = i;
+        break;
+      }
+    }
+    // Scan backward
+    for (i = aOffset; i > range.mBegin; i--)
+    {
+      if (breakBefore[i - range.mBegin])
+      {
+        range.mBegin = i;
+        break;
+      }
+    }
   }
   return range;
 }
 
 int32_t nsSampleWordBreaker::NextWord( 
   const char16_t* aText, uint32_t aLen, uint32_t aPos) 
 {
   int8_t c1, c2;
@@ -136,15 +170,60 @@ int32_t nsSampleWordBreaker::NextWord(
   for(cur++; cur <aLen; cur++)
   {
      c2 = this->GetClass(aText[cur]);
      if(c2 != c1) 
        break;
   }
   if(kWbClassThaiLetter == c1)
   {
-	// need to call Thai word breaker from here
-	// we should pass the whole Thai segment to the thai word breaker to find a shorter answer
+    AutoTArray<uint8_t,100> breakBefore;
+    breakBefore.SetLength(aLen - aPos);
+    NS_GetComplexLineBreaks(aText + aPos, aLen - aPos, breakBefore.Elements());
+    uint32_t i = 0;
+    while (i < cur - aPos && !breakBefore[i])
+    {
+      i++;
+    }
+    if (i < cur - aPos)
+      return aPos + i;
   }
   if (cur == aLen)
     return NS_WORDBREAKER_NEED_MORE_TEXT;
   return cur;
 }
+
+// Marks in aBreakBefore[] every offset of aText[0..aLen) that starts a word:
+// each change of character class, plus the breaks that
+// NS_GetComplexLineBreaks reports inside a run of Thai letters. Entries that
+// are not breaks are left untouched, so callers pre-fill the array with false.
+void nsSampleWordBreaker::GetWordBreaks(
+    const char16_t* aText, uint32_t aLen, bool* aBreakBefore)
+{
+  // Nothing to analyze; also avoids reading aText[0] of an empty buffer.
+  if (!aText || !aBreakBefore || 0 == aLen)
+    return;
+
+  uint32_t curBegin = 0;
+  while (curBegin < aLen)
+  {
+    // Class of the run starting here; recomputed so it is never stale.
+    const uint8_t runClass = this->GetClass(aText[curBegin]);
+    uint32_t curEnd = curBegin + 1;
+    while (curEnd < aLen && this->GetClass(aText[curEnd]) == runClass)
+      curEnd++;
+    if (kWbClassThaiLetter == runClass)
+    {
+      // Same-class Thai run: ask the complex breaker for its inner breaks.
+      AutoTArray<uint8_t,100> breakBefore;
+      breakBefore.SetLength(curEnd - curBegin);
+      NS_GetComplexLineBreaks(aText + curBegin, curEnd - curBegin,
+                              breakBefore.Elements());
+      for (uint32_t i = curBegin + 1; i < curEnd; i++)
+        aBreakBefore[i] |= breakBefore[i - curBegin];
+    }
+    // A class change is a word break unless it is the end of the text.
+    if (curEnd < aLen)
+      aBreakBefore[curEnd] = true;
+    curBegin = curEnd;
+  }
+}
+
--- a/intl/lwbrk/nsSampleWordBreaker.h
+++ b/intl/lwbrk/nsSampleWordBreaker.h
@@ -28,15 +28,19 @@ public:
 
   bool BreakInBetween(const char16_t* aText1 , uint32_t aTextLen1,
                         const char16_t* aText2 , uint32_t aTextLen2) override;
   nsWordRange FindWord(const char16_t* aText1 , uint32_t aTextLen1,
                        uint32_t aOffset) override;
 
   int32_t NextWord(const char16_t* aText, uint32_t aLen, uint32_t aPos) override;
 
+ 
+  void GetWordBreaks(const char16_t* aText, uint32_t aLen,
+                     bool* aBreakBefore) override;
+
 protected:
   uint8_t  GetClass(char16_t aChar);
 
   virtual ~nsSampleWordBreaker();
 };
 
 #endif  /* nsSampleWordBreaker_h__ */
--- a/layout/generic/nsTextFrame.cpp
+++ b/layout/generic/nsTextFrame.cpp
@@ -8009,16 +8009,17 @@ ClusterIterator::NextCluster()
       keepGoing = mIterator.IsOriginalCharSkipped() ||
           mIterator.GetOriginalOffset() >= mTrimmed.GetEnd() ||
           !textRun->IsClusterStart(mIterator.GetSkippedOffset());
       mCharIndex = mIterator.GetOriginalOffset();
     }
 
     if (mWordBreaks[GetBeforeOffset() - mTextFrame->GetContentOffset()]) {
       mHaveWordBreak = true;
+      return true;
     }
     if (!keepGoing)
       return true;
   }
 }
 
 ClusterIterator::ClusterIterator(nsTextFrame* aTextFrame, int32_t aPosition,
                                  int32_t aDirection, nsString& aContext)
@@ -8055,22 +8056,23 @@ ClusterIterator::ClusterIterator(nsTextF
       mWordBreaks[textLen] = true;
     }
     textStart = 0;
     nsAutoString str;
     mFrag->AppendTo(str, textOffset, textLen);
     aContext.Insert(str, 0);
   }
   nsIWordBreaker* wordBreaker = nsContentUtils::WordBreaker();
+  AutoTArray<bool,100> breakBefore;
+  breakBefore.SetLength(aContext.Length() + 1);
+  mozilla::PodZero(breakBefore.Elements(), breakBefore.Length());
+  wordBreaker->GetWordBreaks(aContext.get(), aContext.Length(),
+                             breakBefore.Elements());
   for (int32_t i = 0; i <= textLen; ++i) {
-    int32_t indexInText = i + textStart;
-    mWordBreaks[i] |=
-      wordBreaker->BreakInBetween(aContext.get(), indexInText,
-                                  aContext.get() + indexInText,
-                                  aContext.Length() - indexInText);
+    mWordBreaks[i] |= breakBefore[textStart + i];
   }
 }
 
 nsIFrame::FrameSearchResult
 nsTextFrame::PeekOffsetWord(bool aForward, bool aWordSelectEatSpace, bool aIsKeyboardSelect,
                             int32_t* aOffset, PeekWordState* aState)
 {
   int32_t contentLength = GetContentLength();
--- a/layout/generic/test/mochitest.ini
+++ b/layout/generic/test/mochitest.ini
@@ -40,16 +40,17 @@ support-files = bug344830_testembed.svg
 [test_bug405178.html]
 [test_bug416168.html]
 [test_bug421436.html]
 [test_bug421839-1.html]
 skip-if = true # Disabled for calling finish twice
 [test_bug421839-2.html]
 support-files = bug421839-2-page.html
 [test_bug424627.html]
+[test_bug425915.html]
 [test_bug438840.html]
 [test_bug448860.html]
 [test_bug448987.html]
 skip-if = true # Bug 932296
 support-files = file_bug448987.html file_bug448987_ref.html file_bug448987_notref.html
 [test_bug449653.html]
 support-files = file_bug449653_1.html file_bug449653_1_ref.html
 [test_bug460532.html]
new file mode 100644
--- /dev/null
+++ b/layout/generic/test/test_bug425915.html
@@ -0,0 +1,130 @@
+<!DOCTYPE HTML>
+<html>
+<!--
+https://bugzilla.mozilla.org/show_bug.cgi?id=425915
+-->
+
+<head>
+  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>Test for Bug 425915</title>
+  <script type="application/javascript" src="/MochiKit/MochiKit.js"></script>
+  <script type="application/javascript"
+          src="/tests/SimpleTest/SimpleTest.js"></script>
+  <script type="application/javascript"
+          src="/tests/SimpleTest/WindowSnapshot.js"></script>
+  <link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
+</head>
+
+<body onload="run()">
+<a target="_blank"
+   href="https://bugzilla.mozilla.org/show_bug.cgi?id=425915">
+  Mozilla Bug 425915
+</a>
+<div id="complex" contenteditable="true">
+  <p id="c1">นี่คือคำไทย This is English. ไทยอีกครั้ง</p>
+</div>
+
+<div>
+  <p>Anchor offset: <span id="anchor-offset"></span></p>
+  <p>Focus offset: <span id="focus-offset"></span></p>
+</div>
+
+<script type="application/javascript">
+SimpleTest.waitForExplicitFinish();
+
+// Passes when |actual| is |expected| itself or a direct child of |expected|.
+function isOrIsParent(actual, expected, msg) {
+  var parent = actual.parentNode;
+  var detail = msg + " Expected " + actual + " or " + parent +
+               " to be " + expected + ".";
+  ok(actual == expected || parent == expected, detail);
+}
+
+// Checks the selection's anchor/focus nodes (given as element ids) and offsets.
+function isAt(anchorNode, anchorOffset, focusNode, focusOffset, msg) {
+  var sel = window.getSelection();
+  isOrIsParent(sel.anchorNode, $(anchorNode), msg + ": Wrong anchor node.");
+  is(sel.anchorOffset, anchorOffset, msg + ": Wrong anchor offset.");
+  isOrIsParent(sel.focusNode, $(focusNode), msg + ": Wrong focus node.");
+  is(sel.focusOffset, focusOffset, msg + ": Wrong focus offset.");  // a number, not an id
+}
+
+function run() {
+  var sel = window.getSelection();
+  // Drives the caret word by word across c1, forward then backward, checking each stop.
+  // If nothing is focused, selection.modify() should silently fail.
+  sel.removeAllRanges();
+  sel.modify("move", "forward", "character");
+
+  // Now focus the editable div and put the caret at offset 0 of c1.
+  $("complex").focus();
+  sel.collapse($("c1"), 0);
+
+  // Move forward word-wise through the leading Thai run; stops expected at 3, 6, 8, 11.
+  isAt("c1", 0, "c1", 0, "test 0a");
+  sel.modify("move", "forward", "word");
+  isAt("c1", 3, "c1", 3, "test 0b");
+  sel.modify("move", "forward", "word");
+  isAt("c1", 6, "c1", 6, "test 0c");
+  sel.modify("move", "forward", "word");
+  isAt("c1", 8, "c1", 8, "test 0d");
+  sel.modify("move", "forward", "word");
+  isAt("c1", 11, "c1", 11, "test 0e");
+
+  // Move forward word-wise through the English sentence; stops expected at 16, 19, 28.
+  sel.modify("move", "forward", "word");
+  isAt("c1", 16, "c1", 16, "test 0f");
+  sel.modify("move", "forward", "word");
+  isAt("c1", 19, "c1", 19, "test 0g");
+  sel.modify("move", "forward", "word");
+  isAt("c1", 28, "c1", 28, "test 0h");
+
+  // Move forward word-wise through the trailing Thai run; stops expected at 32, 35, 40.
+  sel.modify("move", "forward", "word");
+  isAt("c1", 32, "c1", 32, "test 0i");
+  sel.modify("move", "forward", "word");
+  isAt("c1", 35, "c1", 35, "test 0j");
+  sel.modify("move", "forward", "word");
+  isAt("c1", 40, "c1", 40, "test 0k");
+
+  // Move backward word-wise through the trailing Thai run; stops expected at 35, 32, 29.
+  sel.modify("move", "backward", "word");
+  isAt("c1", 35, "c1", 35, "test 1a");
+  sel.modify("move", "backward", "word");
+  isAt("c1", 32, "c1", 32, "test 1b");
+  sel.modify("move", "backward", "word");
+  isAt("c1", 29, "c1", 29, "test 1c");
+
+  // Move backward word-wise through the English sentence; stops expected at 20, 17, 12.
+  sel.modify("move", "backward", "word");
+  isAt("c1", 20, "c1", 20, "test 1d");
+  sel.modify("move", "backward", "word");
+  isAt("c1", 17, "c1", 17, "test 1e");
+  sel.modify("move", "backward", "word");
+  isAt("c1", 12, "c1", 12, "test 1f");
+
+  // Move backward word-wise through the leading Thai run; stops expected at 8, 6, 3, 0.
+  sel.modify("move", "backward", "word");
+  isAt("c1", 8, "c1", 8, "test 1g");
+  sel.modify("move", "backward", "word");
+  isAt("c1", 6, "c1", 6, "test 1h");
+  sel.modify("move", "backward", "word");
+  isAt("c1", 3, "c1", 3, "test 1i");
+  sel.modify("move", "backward", "word");
+  isAt("c1", 0, "c1", 0, "test 1j");
+
+  SimpleTest.finish();
+}
+
+function update_debug_info() {  // Debug aid: shows the selection offsets in the page.
+  var sel = window.getSelection();
+  document.getElementById("anchor-offset").innerHTML = sel.anchorOffset;
+  document.getElementById("focus-offset").innerHTML = sel.focusOffset;
+  setTimeout(update_debug_info, 100);  // re-arm so the display refreshes every 100 ms
+}
+
+setTimeout(update_debug_info, 100);
+
+</script>
+</body>
+</html>