--- a/third_party/rust/unicode-segmentation/src/grapheme.rs
+++ b/third_party/rust/unicode-segmentation/src/grapheme.rs
@@ -59,20 +59,18 @@ impl<'a> DoubleEndedIterator for Graphem
}
}
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
#[derive(Clone)]
pub struct Graphemes<'a> {
string: &'a str,
- extended: bool,
- cat: Option<GraphemeCat>,
- catb: Option<GraphemeCat>,
- regional_count_back: Option<usize>,
+ cursor: GraphemeCursor,
+ cursor_back: GraphemeCursor,
}
impl<'a> Graphemes<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
@@ -80,356 +78,627 @@ impl<'a> Graphemes<'a> {
/// assert_eq!(iter.as_str(), "abc");
/// iter.next();
/// assert_eq!(iter.as_str(), "bc");
/// iter.next();
/// iter.next();
/// assert_eq!(iter.as_str(), "");
/// ```
pub fn as_str(&self) -> &'a str {
- self.string
+ &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
}
}
-// state machine for cluster boundary rules
-#[derive(Copy,Clone,PartialEq,Eq)]
-enum GraphemeState {
- Start,
- FindExtend,
- HangulL,
- HangulLV,
- HangulLVT,
- Prepend,
- Regional,
- Emoji,
- Zwj,
-}
-
impl<'a> Iterator for Graphemes<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
- let slen = self.string.len();
+ let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
(cmp::min(slen, 1), Some(slen))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
- use self::GraphemeState::*;
- use tables::grapheme as gr;
- if self.string.len() == 0 {
+ let start = self.cursor.cur_cursor();
+ if start == self.cursor_back.cur_cursor() {
return None;
}
-
- let mut take_curr = true;
- let mut idx = 0;
- let mut state = Start;
- let mut cat = gr::GC_Any;
-
- // caching used by next_back() should be invalidated
- self.regional_count_back = None;
- self.catb = None;
-
- for (curr, ch) in self.string.char_indices() {
- idx = curr;
-
- // retrieve cached category, if any
- // We do this because most of the time we would end up
- // looking up each character twice.
- cat = match self.cat {
- None => gr::grapheme_category(ch),
- _ => self.cat.take().unwrap()
- };
-
- if (state, cat) == (Emoji, gr::GC_Extend) {
- continue; // rule GB10
- }
-
- if let Some(new_state) = match cat {
- gr::GC_Extend => Some(FindExtend), // rule GB9
- gr::GC_SpacingMark if self.extended => Some(FindExtend), // rule GB9a
- gr::GC_ZWJ => Some(Zwj), // rule GB9/GB11
- _ => None
- } {
- state = new_state;
- continue;
- }
-
- state = match state {
- Start if '\r' == ch => {
- let slen = self.string.len();
- let nidx = idx + 1;
- if nidx != slen && self.string[nidx..].chars().next().unwrap() == '\n' {
- idx = nidx; // rule GB3
- }
- break; // rule GB4
- }
- Start | Prepend => match cat {
- gr::GC_Control => { // rule GB5
- take_curr = state == Start;
- break;
- }
- gr::GC_L => HangulL,
- gr::GC_LV | gr::GC_V => HangulLV,
- gr::GC_LVT | gr::GC_T => HangulLVT,
- gr::GC_Prepend if self.extended => Prepend,
- gr::GC_Regional_Indicator => Regional,
- gr::GC_E_Base | gr::GC_E_Base_GAZ => Emoji,
- _ => FindExtend
- },
- FindExtend => { // found non-extending when looking for extending
- take_curr = false;
- break;
- },
- HangulL => match cat { // rule GB6: L x (L|V|LV|LVT)
- gr::GC_L => continue,
- gr::GC_LV | gr::GC_V => HangulLV,
- gr::GC_LVT => HangulLVT,
- _ => {
- take_curr = false;
- break;
- }
- },
- HangulLV => match cat { // rule GB7: (LV|V) x (V|T)
- gr::GC_V => continue,
- gr::GC_T => HangulLVT,
- _ => {
- take_curr = false;
- break;
- }
- },
- HangulLVT => match cat { // rule GB8: (LVT|T) x T
- gr::GC_T => continue,
- _ => {
- take_curr = false;
- break;
- }
- },
- Regional => match cat { // rule GB12/GB13
- gr::GC_Regional_Indicator => FindExtend,
- _ => {
- take_curr = false;
- break;
- }
- },
- Emoji => match cat { // rule GB10: (E_Base|EBG) Extend* x E_Modifier
- gr::GC_E_Modifier => continue,
- _ => {
- take_curr = false;
- break;
- }
- },
- Zwj => match cat { // rule GB11: ZWJ x (GAZ|EBG)
- gr::GC_Glue_After_Zwj => continue,
- gr::GC_E_Base_GAZ => Emoji,
- _ => {
- take_curr = false;
- break;
- }
- },
- }
- }
-
- self.cat = if take_curr {
- idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
- None
- } else {
- Some(cat)
- };
-
- let retstr = &self.string[..idx];
- self.string = &self.string[idx..];
- Some(retstr)
+ let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
+ Some(&self.string[start..next])
}
}
impl<'a> DoubleEndedIterator for Graphemes<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
- use self::GraphemeState::*;
- use tables::grapheme as gr;
- if self.string.len() == 0 {
+ let end = self.cursor_back.cur_cursor();
+ if end == self.cursor.cur_cursor() {
return None;
}
-
- let mut take_curr = true;
- let mut idx = self.string.len();
- let mut previdx = idx;
- let mut state = Start;
- let mut cat = gr::GC_Any;
-
- // caching used by next() should be invalidated
- self.cat = None;
-
- 'outer: for (curr, ch) in self.string.char_indices().rev() {
- previdx = idx;
- idx = curr;
-
- // cached category, if any
- cat = match self.catb {
- None => gr::grapheme_category(ch),
- _ => self.catb.take().unwrap()
- };
-
- // a matching state machine that runs *backwards* across an input string
- // note that this has some implications for the Hangul matching, since
- // we now need to know what the rightward letter is:
- //
- // Right to left, we have:
- // L x L
- // V x (L|V|LV)
- // T x (V|T|LV|LVT)
- // HangulL means the letter to the right is L
- // HangulLV means the letter to the right is V
- // HangulLVT means the letter to the right is T
- state = match state {
- Start if '\n' == ch => {
- if idx > 0 && '\r' == self.string[..idx].chars().next_back().unwrap() {
- idx -= 1; // rule GB3
- }
- break; // rule GB4
- },
- Start | FindExtend => match cat {
- gr::GC_Extend => FindExtend,
- gr::GC_SpacingMark if self.extended => FindExtend,
- gr::GC_ZWJ => FindExtend,
- gr::GC_E_Modifier => Emoji,
- gr::GC_Glue_After_Zwj | gr::GC_E_Base_GAZ => Zwj,
- gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
- gr::GC_V => HangulLV,
- gr::GC_T => HangulLVT,
- gr::GC_Regional_Indicator => Regional,
- gr::GC_Control => {
- take_curr = Start == state;
- break;
- },
- _ => break
- },
- HangulL => match cat { // char to right is an L
- gr::GC_L => continue, // L x L is the only legal match
- _ => {
- take_curr = false;
- break;
- }
- },
- HangulLV => match cat { // char to right is a V
- gr::GC_V => continue, // V x V, right char is still V
- gr::GC_L | gr::GC_LV => HangulL, // (L|V) x V, right char is now L
- _ => {
- take_curr = false;
- break;
- }
- },
- HangulLVT => match cat { // char to right is a T
- gr::GC_T => continue, // T x T, right char is still T
- gr::GC_V => HangulLV, // V x T, right char is now V
- gr::GC_LV | gr::GC_LVT => HangulL, // (LV|LVT) x T, right char is now L
- _ => {
- take_curr = false;
- break;
- }
- },
- Prepend => {
- // not used in reverse iteration
- unreachable!()
- },
- Regional => { // rule GB12/GB13
- // Need to scan backward to find if this is preceded by an odd or even number
- // of Regional_Indicator characters.
- let count = match self.regional_count_back {
- Some(count) => count,
- None => self.string[..previdx].chars().rev().take_while(|c| {
- gr::grapheme_category(*c) == gr::GC_Regional_Indicator
- }).count()
- };
- // Cache the count to avoid re-scanning the same chars on the next iteration.
- self.regional_count_back = count.checked_sub(1);
-
- if count % 2 == 0 {
- take_curr = false;
- break;
- }
- continue;
- },
- Emoji => { // char to right is E_Modifier
- // In order to decide whether to break before this E_Modifier char, we need to
- // scan backward past any Extend chars to look for (E_Base|(ZWJ? EBG)).
- let mut ebg_idx = None;
- for (startidx, prev) in self.string[..previdx].char_indices().rev() {
- match (ebg_idx, gr::grapheme_category(prev)) {
- (None, gr::GC_Extend) => continue,
- (None, gr::GC_E_Base) => { // rule GB10
- // Found an Emoji modifier sequence. Return the whole sequence.
- idx = startidx;
- break 'outer;
- }
- (None, gr::GC_E_Base_GAZ) => { // rule GB10
- // Keep scanning in case this is part of an ZWJ x EBJ pair.
- ebg_idx = Some(startidx);
- }
- (Some(_), gr::GC_ZWJ) => { // rule GB11
- idx = startidx;
- break 'outer;
- }
- _ => break
- }
- }
- if let Some(ebg_idx) = ebg_idx {
- // Found an EBG without a ZWJ before it.
- idx = ebg_idx;
- break;
- }
- // Not part of an Emoji modifier sequence. Break here.
- take_curr = false;
- break;
- },
- Zwj => match cat { // char to right is (GAZ|EBG)
- gr::GC_ZWJ => FindExtend, // rule GB11: ZWJ x (GAZ|EBG)
- _ => {
- take_curr = false;
- break;
- }
- }
- }
- }
-
- self.catb = if take_curr {
- None
- } else {
- idx = previdx;
- Some(cat)
- };
-
- if self.extended && cat != gr::GC_Control {
- // rule GB9b: include any preceding Prepend characters
- for (i, c) in self.string[..idx].char_indices().rev() {
- match gr::grapheme_category(c) {
- gr::GC_Prepend => idx = i,
- cat => {
- self.catb = Some(cat);
- break;
- }
- }
- }
- }
-
- let retstr = &self.string[idx..];
- self.string = &self.string[..idx];
- Some(retstr)
+ let prev = self.cursor_back.prev_boundary(self.string, 0).unwrap().unwrap();
+ Some(&self.string[prev..end])
}
}
#[inline]
pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
+ let len = s.len();
Graphemes {
string: s,
- extended: is_extended,
- cat: None,
- catb: None,
- regional_count_back: None
+ cursor: GraphemeCursor::new(0, len, is_extended),
+ cursor_back: GraphemeCursor::new(len, len, is_extended),
}
}
#[inline]
pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) }
}
+
+// maybe unify with PairResult?
+// An enum describing information about a potential boundary.
+#[derive(PartialEq, Eq, Clone)]
+enum GraphemeState {
+ // No information is known.
+ Unknown,
+ // It is known to not be a boundary.
+ NotBreak,
+ // It is known to be a boundary.
+ Break,
+ // The codepoint after is a Regional Indicator Symbol, so a boundary iff
+ // it is preceded by an even number of RIS codepoints. (GB12, GB13)
+ Regional,
+ // The codepoint after is in the E_Modifier category, so whether it's a boundary
+ // depends on pre-context according to GB10.
+ Emoji,
+}
+
+/// Cursor-based segmenter for grapheme clusters.
+#[derive(Clone)]
+pub struct GraphemeCursor {
+ // Current cursor position.
+ offset: usize,
+ // Total length of the string.
+ len: usize,
+ // A config flag indicating whether this cursor computes legacy or extended
+ // grapheme cluster boundaries (enables GB9a and GB9b if set).
+ is_extended: bool,
+ // Information about the potential boundary at `offset`
+ state: GraphemeState,
+ // Category of codepoint immediately preceding cursor, if known.
+ cat_before: Option<GraphemeCat>,
+ // Category of codepoint immediately after cursor, if known.
+ cat_after: Option<GraphemeCat>,
+ // If set, at least one more codepoint immediately preceding this offset
+ // is needed to resolve whether there's a boundary at `offset`.
+ pre_context_offset: Option<usize>,
+ // The number of RIS codepoints preceding `offset`. If `pre_context_offset`
+ // is set, then counts the number of RIS between that and `offset`, otherwise
+ // is an accurate count relative to the string.
+ ris_count: Option<usize>,
+ // Set if a call to `prev_boundary` or `next_boundary` was suspended due
+ // to needing more input.
+ resuming: bool,
+}
+
+/// An error return indicating that not enough content was available in the
+/// provided chunk to satisfy the query, and that more content must be provided.
+#[derive(PartialEq, Eq, Debug)]
+pub enum GraphemeIncomplete {
+ /// More pre-context is needed. The caller should call `provide_context`
+ /// with a chunk ending at the offset given, then retry the query. This
+ /// will only be returned if the `chunk_start` parameter is nonzero.
+ PreContext(usize),
+
+ /// When requesting `prev_boundary`, the cursor is moving past the beginning
+ /// of the current chunk, so the chunk before that is requested. This will
+ /// only be returned if the `chunk_start` parameter is nonzero.
+ PrevChunk,
+
+ /// When requesting `next_boundary`, the cursor is moving past the end of the
+ /// current chunk, so the chunk after that is requested. This will only be
+ /// returned if the chunk ends before the `len` parameter provided on
+ /// creation of the cursor.
+ NextChunk, // requesting chunk following the one given
+
+ /// An error returned when the chunk given does not contain the cursor position.
+ InvalidOffset,
+}
+
+// An enum describing the result from lookup of a pair of categories.
+#[derive(PartialEq, Eq)]
+enum PairResult {
+ NotBreak, // definitely not a break
+ Break, // definitely a break
+ Extended, // a break iff not in extended mode
+ Regional, // a break if preceded by an even number of RIS
+ Emoji, // a break unless preceded by emoji base and (Extend)*
+}
+
+fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
+ use tables::grapheme::GraphemeCat::*;
+ use self::PairResult::*;
+ match (before, after) {
+ (GC_CR, GC_LF) => NotBreak, // GB3
+ (GC_Control, _) => Break, // GB4
+ (GC_CR, _) => Break, // GB4
+ (GC_LF, _) => Break, // GB4
+ (_, GC_Control) => Break, // GB5
+ (_, GC_CR) => Break, // GB5
+ (_, GC_LF) => Break, // GB5
+ (GC_L, GC_L) => NotBreak, // GB6
+ (GC_L, GC_V) => NotBreak, // GB6
+ (GC_L, GC_LV) => NotBreak, // GB6
+ (GC_L, GC_LVT) => NotBreak, // GB6
+ (GC_LV, GC_V) => NotBreak, // GB7
+ (GC_LV, GC_T) => NotBreak, // GB7
+ (GC_V, GC_V) => NotBreak, // GB7
+ (GC_V, GC_T) => NotBreak, // GB7
+ (GC_LVT, GC_T) => NotBreak, // GB8
+ (GC_T, GC_T) => NotBreak, // GB8
+ (_, GC_Extend) => NotBreak, // GB9
+ (_, GC_ZWJ) => NotBreak, // GB9
+ (_, GC_SpacingMark) => Extended, // GB9a
+ (GC_Prepend, _) => Extended, // GB9b
+ (GC_E_Base, GC_E_Modifier) => NotBreak, // GB10
+ (GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10
+ (GC_Extend, GC_E_Modifier) => Emoji, // GB10
+ (GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11
+ (GC_ZWJ, GC_E_Base_GAZ) => NotBreak, // GB11
+ (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
+ (_, _) => Break, // GB999
+ }
+}
+
+impl GraphemeCursor {
+ /// Create a new cursor. The string length and initial offset are given at creation
+ /// time, but the contents of the string are not. The `is_extended` parameter
+ /// controls whether extended grapheme clusters are selected.
+ ///
+ /// The `offset` parameter must be on a codepoint boundary.
+ ///
+ /// ```rust
+ /// # use unicode_segmentation::GraphemeCursor;
+ /// let s = "हिन्दी";
+ /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
+ /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
+ /// let mut extended = GraphemeCursor::new(0, s.len(), true);
+ /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
+ /// ```
+ pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
+ let state = if offset == 0 || offset == len {
+ GraphemeState::Break
+ } else {
+ GraphemeState::Unknown
+ };
+ GraphemeCursor {
+ offset: offset,
+ len: len,
+ state: state,
+ is_extended: is_extended,
+ cat_before: None,
+ cat_after: None,
+ pre_context_offset: None,
+ ris_count: None,
+ resuming: false,
+ }
+ }
+
+ // NOTE: `set_cursor` may be removed in the future; its advantage over `new()` seems thin.
+
+ /// Set the cursor to a new location in the same string.
+ ///
+ /// ```rust
+ /// # use unicode_segmentation::GraphemeCursor;
+ /// let s = "abcd";
+ /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
+ /// assert_eq!(cursor.cur_cursor(), 0);
+ /// cursor.set_cursor(2);
+ /// assert_eq!(cursor.cur_cursor(), 2);
+ /// ```
+ pub fn set_cursor(&mut self, offset: usize) {
+ if offset != self.offset {
+ self.offset = offset;
+ self.state = if offset == 0 || offset == self.len {
+ GraphemeState::Break
+ } else {
+ GraphemeState::Unknown
+ };
+ // reset state derived from text around cursor
+ self.cat_before = None;
+ self.cat_after = None;
+ self.ris_count = None;
+ }
+ }
+
+ /// The current offset of the cursor. Equal to the last value provided to
+ /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
+ /// `prev_boundary()`.
+ ///
+ /// ```rust
+ /// # use unicode_segmentation::GraphemeCursor;
+ /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
+ /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
+ /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
+ /// assert_eq!(cursor.cur_cursor(), 4);
+ /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
+ /// assert_eq!(cursor.cur_cursor(), 8);
+ /// ```
+ pub fn cur_cursor(&self) -> usize {
+ self.offset
+ }
+
+ /// Provide additional pre-context when it is needed to decide a boundary.
+ /// The end of the chunk must coincide with the value given in the
+ /// `GraphemeIncomplete::PreContext` request.
+ ///
+ /// ```rust
+ /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
+ /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
+ /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
+ /// // Not enough pre-context to decide if there's a boundary between the two flags.
+ /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
+ /// // Provide one more Regional Indicator Symbol of pre-context
+ /// cursor.provide_context(&flags[4..8], 4);
+ /// // Still not enough context to decide.
+ /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
+ /// // Provide additional requested context.
+ /// cursor.provide_context(&flags[0..4], 0);
+ /// // That's enough to decide (it always is when context goes to the start of the string)
+ /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
+ /// ```
+ pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
+ use tables::grapheme as gr;
+ assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
+ self.pre_context_offset = None;
+ if self.is_extended && chunk_start + chunk.len() == self.offset {
+ let ch = chunk.chars().rev().next().unwrap();
+ if gr::grapheme_category(ch) == gr::GC_Prepend {
+ self.decide(false); // GB9b
+ return;
+ }
+ }
+ match self.state {
+ GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
+ GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
+ _ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
+ let ch = chunk.chars().rev().next().unwrap();
+ self.cat_before = Some(gr::grapheme_category(ch));
+ },
+ }
+ }
+
+ fn decide(&mut self, is_break: bool) {
+ self.state = if is_break {
+ GraphemeState::Break
+ } else {
+ GraphemeState::NotBreak
+ };
+ }
+
+ fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
+ self.decide(is_break);
+ Ok(is_break)
+ }
+
+ fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
+ if self.state == GraphemeState::Break {
+ Ok(true)
+ } else if self.state == GraphemeState::NotBreak {
+ Ok(false)
+ } else if let Some(pre_context_offset) = self.pre_context_offset {
+ Err(GraphemeIncomplete::PreContext(pre_context_offset))
+ } else {
+ unreachable!("inconsistent state");
+ }
+ }
+
+ fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
+ use tables::grapheme as gr;
+ let mut ris_count = self.ris_count.unwrap_or(0);
+ for ch in chunk.chars().rev() {
+ if gr::grapheme_category(ch) != gr::GC_Regional_Indicator {
+ self.ris_count = Some(ris_count);
+ self.decide((ris_count % 2) == 0);
+ return;
+ }
+ ris_count += 1;
+ }
+ self.ris_count = Some(ris_count);
+ if chunk_start == 0 {
+ self.decide((ris_count % 2) == 0);
+ return;
+ }
+ self.pre_context_offset = Some(chunk_start);
+ self.state = GraphemeState::Regional;
+ }
+
+ fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
+ use tables::grapheme as gr;
+ for ch in chunk.chars().rev() {
+ match gr::grapheme_category(ch) {
+ gr::GC_Extend => (),
+ gr::GC_E_Base | gr::GC_E_Base_GAZ => {
+ self.decide(false);
+ return;
+ }
+ _ => {
+ self.decide(true);
+ return;
+ }
+ }
+ }
+ if chunk_start == 0 {
+ self.decide(true);
+ return;
+ }
+ self.pre_context_offset = Some(chunk_start);
+ self.state = GraphemeState::Emoji;
+ }
+
+ /// Determine whether the current cursor location is a grapheme cluster boundary.
+ /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
+ /// the length of `chunk` is not equal to `len` on creation, then this method
+ /// may return `GraphemeIncomplete::PreContext`. The caller should then
+ /// call `provide_context` with the requested chunk, then retry calling this
+ /// method.
+ ///
+ /// For partial chunks, if the cursor is not at the beginning or end of the
+ /// string, the chunk should contain at least the codepoint following the cursor.
+ /// If the string is nonempty, the chunk must be nonempty.
+ ///
+ /// All calls should have consistent chunk contents (ie, if a chunk provides
+ /// content for a given slice, all further chunks covering that slice must have
+ /// the same content for it).
+ ///
+ /// ```rust
+ /// # use unicode_segmentation::GraphemeCursor;
+ /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
+ /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
+ /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
+ /// cursor.set_cursor(12);
+ /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
+ /// ```
+ pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete> {
+ use tables::grapheme as gr;
+ if self.state == GraphemeState::Break {
+ return Ok(true)
+ }
+ if self.state == GraphemeState::NotBreak {
+ return Ok(false)
+ }
+ if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
+ if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
+ return Err(GraphemeIncomplete::InvalidOffset)
+ }
+ }
+ if let Some(pre_context_offset) = self.pre_context_offset {
+ return Err(GraphemeIncomplete::PreContext(pre_context_offset));
+ }
+ let offset_in_chunk = self.offset - chunk_start;
+ if self.cat_after.is_none() {
+ let ch = chunk[offset_in_chunk..].chars().next().unwrap();
+ self.cat_after = Some(gr::grapheme_category(ch));
+ }
+ if self.offset == chunk_start {
+ let mut need_pre_context = true;
+ match self.cat_after.unwrap() {
+ gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
+ gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
+ _ => need_pre_context = self.cat_before.is_none(),
+ }
+ if need_pre_context {
+ self.pre_context_offset = Some(chunk_start);
+ return Err(GraphemeIncomplete::PreContext(chunk_start));
+ }
+ }
+ if self.cat_before.is_none() {
+ let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
+ self.cat_before = Some(gr::grapheme_category(ch));
+ }
+ match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
+ PairResult::NotBreak => return self.decision(false),
+ PairResult::Break => return self.decision(true),
+ PairResult::Extended => {
+ let is_extended = self.is_extended;
+ return self.decision(!is_extended);
+ }
+ PairResult::Regional => {
+ if let Some(ris_count) = self.ris_count {
+ return self.decision((ris_count % 2) == 0);
+ }
+ self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
+ self.is_boundary_result()
+ }
+ PairResult::Emoji => {
+ self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
+ self.is_boundary_result()
+ }
+ }
+ }
+
+ /// Find the next boundary after the current cursor position. Only a part of
+ /// the string need be supplied. If the chunk is incomplete, then this
+ /// method might return `GraphemeIncomplete::PreContext` or
+ /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
+ /// call `provide_context` with the requested chunk, then retry. In the
+ /// latter case, the caller should provide the chunk following the one
+ /// given, then retry.
+ ///
+ /// See `is_boundary` for expectations on the provided chunk.
+ ///
+ /// ```rust
+ /// # use unicode_segmentation::GraphemeCursor;
+ /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
+ /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
+ /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
+ /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
+ /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
+ /// ```
+ ///
+ /// And an example that uses partial strings:
+ ///
+ /// ```rust
+ /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
+ /// let s = "abcd";
+ /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
+ /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
+ /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
+ /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
+ /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
+ /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
+ /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
+ /// ```
+ pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
+ use tables::grapheme as gr;
+ if self.offset == self.len {
+ return Ok(None);
+ }
+ let mut iter = chunk[self.offset - chunk_start..].chars();
+ let mut ch = iter.next().unwrap();
+ loop {
+ if self.resuming {
+ if self.cat_after.is_none() {
+ self.cat_after = Some(gr::grapheme_category(ch));
+ }
+ } else {
+ self.offset += ch.len_utf8();
+ self.state = GraphemeState::Unknown;
+ self.cat_before = self.cat_after.take();
+ if self.cat_before.is_none() {
+ self.cat_before = Some(gr::grapheme_category(ch));
+ }
+ if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
+ self.ris_count = self.ris_count.map(|c| c + 1);
+ } else {
+ self.ris_count = Some(0);
+ }
+ if let Some(next_ch) = iter.next() {
+ ch = next_ch;
+ self.cat_after = Some(gr::grapheme_category(ch));
+ } else if self.offset == self.len {
+ self.decide(true);
+ } else {
+ self.resuming = true;
+ return Err(GraphemeIncomplete::NextChunk);
+ }
+ }
+ self.resuming = true;
+ if self.is_boundary(chunk, chunk_start)? {
+ self.resuming = false;
+ return Ok(Some(self.offset));
+ }
+ self.resuming = false;
+ }
+ }
+
+ /// Find the previous boundary before the current cursor position. Only a part
+ /// of the string need be supplied. If the chunk is incomplete, then this
+ /// method might return `GraphemeIncomplete::PreContext` or
+ /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
+ /// call `provide_context` with the requested chunk, then retry. In the
+ /// latter case, the caller should provide the chunk preceding the one
+ /// given, then retry.
+ ///
+ /// See `is_boundary` for expectations on the provided chunk.
+ ///
+ /// ```rust
+ /// # use unicode_segmentation::GraphemeCursor;
+ /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
+ /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
+ /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
+ /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
+ /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
+ /// ```
+ ///
+ /// And an example that uses partial strings (note the exact return is not
+ /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
+ ///
+ /// ```rust
+ /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
+ /// let s = "abcd";
+ /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
+ /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
+ /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
+ /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
+ /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
+ /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
+ /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
+ /// ```
+ pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
+ use tables::grapheme as gr;
+ if self.offset == 0 {
+ return Ok(None);
+ }
+ if self.offset == chunk_start {
+ return Err(GraphemeIncomplete::PrevChunk);
+ }
+ let mut iter = chunk[..self.offset - chunk_start].chars().rev();
+ let mut ch = iter.next().unwrap();
+ loop {
+ if self.offset == chunk_start {
+ self.resuming = true;
+ return Err(GraphemeIncomplete::PrevChunk);
+ }
+ if self.resuming {
+ self.cat_before = Some(gr::grapheme_category(ch));
+ } else {
+ self.offset -= ch.len_utf8();
+ self.cat_after = self.cat_before.take();
+ self.state = GraphemeState::Unknown;
+ if let Some(ris_count) = self.ris_count {
+ self.ris_count = if ris_count > 0 { Some(ris_count - 1) } else { None };
+ }
+ if let Some(prev_ch) = iter.next() {
+ ch = prev_ch;
+ self.cat_before = Some(gr::grapheme_category(ch));
+ } else if self.offset == 0 {
+ self.decide(true);
+ } else {
+ self.resuming = true;
+ self.cat_after = Some(gr::grapheme_category(ch));
+ return Err(GraphemeIncomplete::PrevChunk);
+ }
+ }
+ self.resuming = true;
+ if self.is_boundary(chunk, chunk_start)? {
+ self.resuming = false;
+ return Ok(Some(self.offset));
+ }
+ self.resuming = false;
+ }
+ }
+}
+
+#[test]
+fn test_grapheme_cursor_ris_precontext() {
+ let s = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
+ let mut c = GraphemeCursor::new(8, s.len(), true);
+ assert_eq!(c.is_boundary(&s[4..], 4), Err(GraphemeIncomplete::PreContext(4)));
+ c.provide_context(&s[..4], 0);
+ assert_eq!(c.is_boundary(&s[4..], 4), Ok(true));
+}
+
+#[test]
+fn test_grapheme_cursor_chunk_start_require_precontext() {
+ let s = "\r\n";
+ let mut c = GraphemeCursor::new(1, s.len(), true);
+ assert_eq!(c.is_boundary(&s[1..], 1), Err(GraphemeIncomplete::PreContext(1)));
+ c.provide_context(&s[..1], 0);
+ assert_eq!(c.is_boundary(&s[1..], 1), Ok(false));
+}
+
+#[test]
+fn test_grapheme_cursor_prev_boundary() {
+ let s = "abcd";
+ let mut c = GraphemeCursor::new(3, s.len(), true);
+ assert_eq!(c.prev_boundary(&s[2..], 2), Err(GraphemeIncomplete::PrevChunk));
+ assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(2)));
+}
+
+#[test]
+fn test_grapheme_cursor_prev_boundary_chunk_start() {
+ let s = "abcd";
+ let mut c = GraphemeCursor::new(2, s.len(), true);
+ assert_eq!(c.prev_boundary(&s[2..], 2), Err(GraphemeIncomplete::PrevChunk));
+ assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
+}
--- a/third_party/rust/unicode-segmentation/src/tables.rs
+++ b/third_party/rust/unicode-segmentation/src/tables.rs
@@ -291,23 +291,25 @@ pub mod grapheme {
use core::result::Result::{Ok, Err};
pub use self::GraphemeCat::*;
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, PartialEq, Eq)]
pub enum GraphemeCat {
GC_Any,
+ GC_CR,
GC_Control,
GC_E_Base,
GC_E_Base_GAZ,
GC_E_Modifier,
GC_Extend,
GC_Glue_After_Zwj,
GC_L,
+ GC_LF,
GC_LV,
GC_LVT,
GC_Prepend,
GC_Regional_Indicator,
GC_SpacingMark,
GC_T,
GC_V,
GC_ZWJ,
@@ -328,81 +330,83 @@ pub mod grapheme {
}
}
pub fn grapheme_category(c: char) -> GraphemeCat {
bsearch_range_value_table(c, grapheme_cat_table)
}
const grapheme_cat_table: &'static [(char, char, GraphemeCat)] = &[
- ('\u{0}', '\u{1f}', GC_Control), ('\u{7f}', '\u{9f}', GC_Control), ('\u{ad}', '\u{ad}',
- GC_Control), ('\u{300}', '\u{36f}', GC_Extend), ('\u{483}', '\u{489}', GC_Extend),
- ('\u{591}', '\u{5bd}', GC_Extend), ('\u{5bf}', '\u{5bf}', GC_Extend), ('\u{5c1}', '\u{5c2}',
- GC_Extend), ('\u{5c4}', '\u{5c5}', GC_Extend), ('\u{5c7}', '\u{5c7}', GC_Extend),
- ('\u{600}', '\u{605}', GC_Prepend), ('\u{610}', '\u{61a}', GC_Extend), ('\u{61c}',
- '\u{61c}', GC_Control), ('\u{64b}', '\u{65f}', GC_Extend), ('\u{670}', '\u{670}',
- GC_Extend), ('\u{6d6}', '\u{6dc}', GC_Extend), ('\u{6dd}', '\u{6dd}', GC_Prepend),
- ('\u{6df}', '\u{6e4}', GC_Extend), ('\u{6e7}', '\u{6e8}', GC_Extend), ('\u{6ea}', '\u{6ed}',
- GC_Extend), ('\u{70f}', '\u{70f}', GC_Prepend), ('\u{711}', '\u{711}', GC_Extend),
- ('\u{730}', '\u{74a}', GC_Extend), ('\u{7a6}', '\u{7b0}', GC_Extend), ('\u{7eb}', '\u{7f3}',
- GC_Extend), ('\u{816}', '\u{819}', GC_Extend), ('\u{81b}', '\u{823}', GC_Extend),
- ('\u{825}', '\u{827}', GC_Extend), ('\u{829}', '\u{82d}', GC_Extend), ('\u{859}', '\u{85b}',
- GC_Extend), ('\u{8d4}', '\u{8e1}', GC_Extend), ('\u{8e2}', '\u{8e2}', GC_Prepend),
- ('\u{8e3}', '\u{902}', GC_Extend), ('\u{903}', '\u{903}', GC_SpacingMark), ('\u{93a}',
- '\u{93a}', GC_Extend), ('\u{93b}', '\u{93b}', GC_SpacingMark), ('\u{93c}', '\u{93c}',
- GC_Extend), ('\u{93e}', '\u{940}', GC_SpacingMark), ('\u{941}', '\u{948}', GC_Extend),
- ('\u{949}', '\u{94c}', GC_SpacingMark), ('\u{94d}', '\u{94d}', GC_Extend), ('\u{94e}',
- '\u{94f}', GC_SpacingMark), ('\u{951}', '\u{957}', GC_Extend), ('\u{962}', '\u{963}',
- GC_Extend), ('\u{981}', '\u{981}', GC_Extend), ('\u{982}', '\u{983}', GC_SpacingMark),
- ('\u{9bc}', '\u{9bc}', GC_Extend), ('\u{9be}', '\u{9be}', GC_Extend), ('\u{9bf}', '\u{9c0}',
- GC_SpacingMark), ('\u{9c1}', '\u{9c4}', GC_Extend), ('\u{9c7}', '\u{9c8}', GC_SpacingMark),
- ('\u{9cb}', '\u{9cc}', GC_SpacingMark), ('\u{9cd}', '\u{9cd}', GC_Extend), ('\u{9d7}',
- '\u{9d7}', GC_Extend), ('\u{9e2}', '\u{9e3}', GC_Extend), ('\u{a01}', '\u{a02}', GC_Extend),
- ('\u{a03}', '\u{a03}', GC_SpacingMark), ('\u{a3c}', '\u{a3c}', GC_Extend), ('\u{a3e}',
- '\u{a40}', GC_SpacingMark), ('\u{a41}', '\u{a42}', GC_Extend), ('\u{a47}', '\u{a48}',
- GC_Extend), ('\u{a4b}', '\u{a4d}', GC_Extend), ('\u{a51}', '\u{a51}', GC_Extend),
- ('\u{a70}', '\u{a71}', GC_Extend), ('\u{a75}', '\u{a75}', GC_Extend), ('\u{a81}', '\u{a82}',
- GC_Extend), ('\u{a83}', '\u{a83}', GC_SpacingMark), ('\u{abc}', '\u{abc}', GC_Extend),
- ('\u{abe}', '\u{ac0}', GC_SpacingMark), ('\u{ac1}', '\u{ac5}', GC_Extend), ('\u{ac7}',
- '\u{ac8}', GC_Extend), ('\u{ac9}', '\u{ac9}', GC_SpacingMark), ('\u{acb}', '\u{acc}',
- GC_SpacingMark), ('\u{acd}', '\u{acd}', GC_Extend), ('\u{ae2}', '\u{ae3}', GC_Extend),
- ('\u{b01}', '\u{b01}', GC_Extend), ('\u{b02}', '\u{b03}', GC_SpacingMark), ('\u{b3c}',
- '\u{b3c}', GC_Extend), ('\u{b3e}', '\u{b3f}', GC_Extend), ('\u{b40}', '\u{b40}',
- GC_SpacingMark), ('\u{b41}', '\u{b44}', GC_Extend), ('\u{b47}', '\u{b48}', GC_SpacingMark),
- ('\u{b4b}', '\u{b4c}', GC_SpacingMark), ('\u{b4d}', '\u{b4d}', GC_Extend), ('\u{b56}',
- '\u{b57}', GC_Extend), ('\u{b62}', '\u{b63}', GC_Extend), ('\u{b82}', '\u{b82}', GC_Extend),
- ('\u{bbe}', '\u{bbe}', GC_Extend), ('\u{bbf}', '\u{bbf}', GC_SpacingMark), ('\u{bc0}',
- '\u{bc0}', GC_Extend), ('\u{bc1}', '\u{bc2}', GC_SpacingMark), ('\u{bc6}', '\u{bc8}',
- GC_SpacingMark), ('\u{bca}', '\u{bcc}', GC_SpacingMark), ('\u{bcd}', '\u{bcd}', GC_Extend),
- ('\u{bd7}', '\u{bd7}', GC_Extend), ('\u{c00}', '\u{c00}', GC_Extend), ('\u{c01}', '\u{c03}',
- GC_SpacingMark), ('\u{c3e}', '\u{c40}', GC_Extend), ('\u{c41}', '\u{c44}', GC_SpacingMark),
- ('\u{c46}', '\u{c48}', GC_Extend), ('\u{c4a}', '\u{c4d}', GC_Extend), ('\u{c55}', '\u{c56}',
- GC_Extend), ('\u{c62}', '\u{c63}', GC_Extend), ('\u{c81}', '\u{c81}', GC_Extend),
- ('\u{c82}', '\u{c83}', GC_SpacingMark), ('\u{cbc}', '\u{cbc}', GC_Extend), ('\u{cbe}',
- '\u{cbe}', GC_SpacingMark), ('\u{cbf}', '\u{cbf}', GC_Extend), ('\u{cc0}', '\u{cc1}',
- GC_SpacingMark), ('\u{cc2}', '\u{cc2}', GC_Extend), ('\u{cc3}', '\u{cc4}', GC_SpacingMark),
- ('\u{cc6}', '\u{cc6}', GC_Extend), ('\u{cc7}', '\u{cc8}', GC_SpacingMark), ('\u{cca}',
- '\u{ccb}', GC_SpacingMark), ('\u{ccc}', '\u{ccd}', GC_Extend), ('\u{cd5}', '\u{cd6}',
- GC_Extend), ('\u{ce2}', '\u{ce3}', GC_Extend), ('\u{d01}', '\u{d01}', GC_Extend),
- ('\u{d02}', '\u{d03}', GC_SpacingMark), ('\u{d3e}', '\u{d3e}', GC_Extend), ('\u{d3f}',
- '\u{d40}', GC_SpacingMark), ('\u{d41}', '\u{d44}', GC_Extend), ('\u{d46}', '\u{d48}',
- GC_SpacingMark), ('\u{d4a}', '\u{d4c}', GC_SpacingMark), ('\u{d4d}', '\u{d4d}', GC_Extend),
- ('\u{d4e}', '\u{d4e}', GC_Prepend), ('\u{d57}', '\u{d57}', GC_Extend), ('\u{d62}',
- '\u{d63}', GC_Extend), ('\u{d82}', '\u{d83}', GC_SpacingMark), ('\u{dca}', '\u{dca}',
- GC_Extend), ('\u{dcf}', '\u{dcf}', GC_Extend), ('\u{dd0}', '\u{dd1}', GC_SpacingMark),
- ('\u{dd2}', '\u{dd4}', GC_Extend), ('\u{dd6}', '\u{dd6}', GC_Extend), ('\u{dd8}', '\u{dde}',
- GC_SpacingMark), ('\u{ddf}', '\u{ddf}', GC_Extend), ('\u{df2}', '\u{df3}', GC_SpacingMark),
- ('\u{e31}', '\u{e31}', GC_Extend), ('\u{e33}', '\u{e33}', GC_SpacingMark), ('\u{e34}',
- '\u{e3a}', GC_Extend), ('\u{e47}', '\u{e4e}', GC_Extend), ('\u{eb1}', '\u{eb1}', GC_Extend),
- ('\u{eb3}', '\u{eb3}', GC_SpacingMark), ('\u{eb4}', '\u{eb9}', GC_Extend), ('\u{ebb}',
- '\u{ebc}', GC_Extend), ('\u{ec8}', '\u{ecd}', GC_Extend), ('\u{f18}', '\u{f19}', GC_Extend),
- ('\u{f35}', '\u{f35}', GC_Extend), ('\u{f37}', '\u{f37}', GC_Extend), ('\u{f39}', '\u{f39}',
- GC_Extend), ('\u{f3e}', '\u{f3f}', GC_SpacingMark), ('\u{f71}', '\u{f7e}', GC_Extend),
- ('\u{f7f}', '\u{f7f}', GC_SpacingMark), ('\u{f80}', '\u{f84}', GC_Extend), ('\u{f86}',
- '\u{f87}', GC_Extend), ('\u{f8d}', '\u{f97}', GC_Extend), ('\u{f99}', '\u{fbc}', GC_Extend),
+ ('\u{0}', '\u{9}', GC_Control), ('\u{a}', '\u{a}', GC_LF), ('\u{b}', '\u{c}', GC_Control),
+ ('\u{d}', '\u{d}', GC_CR), ('\u{e}', '\u{1f}', GC_Control), ('\u{7f}', '\u{9f}',
+ GC_Control), ('\u{ad}', '\u{ad}', GC_Control), ('\u{300}', '\u{36f}', GC_Extend),
+ ('\u{483}', '\u{489}', GC_Extend), ('\u{591}', '\u{5bd}', GC_Extend), ('\u{5bf}', '\u{5bf}',
+ GC_Extend), ('\u{5c1}', '\u{5c2}', GC_Extend), ('\u{5c4}', '\u{5c5}', GC_Extend),
+ ('\u{5c7}', '\u{5c7}', GC_Extend), ('\u{600}', '\u{605}', GC_Prepend), ('\u{610}',
+ '\u{61a}', GC_Extend), ('\u{61c}', '\u{61c}', GC_Control), ('\u{64b}', '\u{65f}',
+ GC_Extend), ('\u{670}', '\u{670}', GC_Extend), ('\u{6d6}', '\u{6dc}', GC_Extend),
+ ('\u{6dd}', '\u{6dd}', GC_Prepend), ('\u{6df}', '\u{6e4}', GC_Extend), ('\u{6e7}',
+ '\u{6e8}', GC_Extend), ('\u{6ea}', '\u{6ed}', GC_Extend), ('\u{70f}', '\u{70f}',
+ GC_Prepend), ('\u{711}', '\u{711}', GC_Extend), ('\u{730}', '\u{74a}', GC_Extend),
+ ('\u{7a6}', '\u{7b0}', GC_Extend), ('\u{7eb}', '\u{7f3}', GC_Extend), ('\u{816}', '\u{819}',
+ GC_Extend), ('\u{81b}', '\u{823}', GC_Extend), ('\u{825}', '\u{827}', GC_Extend),
+ ('\u{829}', '\u{82d}', GC_Extend), ('\u{859}', '\u{85b}', GC_Extend), ('\u{8d4}', '\u{8e1}',
+ GC_Extend), ('\u{8e2}', '\u{8e2}', GC_Prepend), ('\u{8e3}', '\u{902}', GC_Extend),
+ ('\u{903}', '\u{903}', GC_SpacingMark), ('\u{93a}', '\u{93a}', GC_Extend), ('\u{93b}',
+ '\u{93b}', GC_SpacingMark), ('\u{93c}', '\u{93c}', GC_Extend), ('\u{93e}', '\u{940}',
+ GC_SpacingMark), ('\u{941}', '\u{948}', GC_Extend), ('\u{949}', '\u{94c}', GC_SpacingMark),
+ ('\u{94d}', '\u{94d}', GC_Extend), ('\u{94e}', '\u{94f}', GC_SpacingMark), ('\u{951}',
+ '\u{957}', GC_Extend), ('\u{962}', '\u{963}', GC_Extend), ('\u{981}', '\u{981}', GC_Extend),
+ ('\u{982}', '\u{983}', GC_SpacingMark), ('\u{9bc}', '\u{9bc}', GC_Extend), ('\u{9be}',
+ '\u{9be}', GC_Extend), ('\u{9bf}', '\u{9c0}', GC_SpacingMark), ('\u{9c1}', '\u{9c4}',
+ GC_Extend), ('\u{9c7}', '\u{9c8}', GC_SpacingMark), ('\u{9cb}', '\u{9cc}', GC_SpacingMark),
+ ('\u{9cd}', '\u{9cd}', GC_Extend), ('\u{9d7}', '\u{9d7}', GC_Extend), ('\u{9e2}', '\u{9e3}',
+ GC_Extend), ('\u{a01}', '\u{a02}', GC_Extend), ('\u{a03}', '\u{a03}', GC_SpacingMark),
+ ('\u{a3c}', '\u{a3c}', GC_Extend), ('\u{a3e}', '\u{a40}', GC_SpacingMark), ('\u{a41}',
+ '\u{a42}', GC_Extend), ('\u{a47}', '\u{a48}', GC_Extend), ('\u{a4b}', '\u{a4d}', GC_Extend),
+ ('\u{a51}', '\u{a51}', GC_Extend), ('\u{a70}', '\u{a71}', GC_Extend), ('\u{a75}', '\u{a75}',
+ GC_Extend), ('\u{a81}', '\u{a82}', GC_Extend), ('\u{a83}', '\u{a83}', GC_SpacingMark),
+ ('\u{abc}', '\u{abc}', GC_Extend), ('\u{abe}', '\u{ac0}', GC_SpacingMark), ('\u{ac1}',
+ '\u{ac5}', GC_Extend), ('\u{ac7}', '\u{ac8}', GC_Extend), ('\u{ac9}', '\u{ac9}',
+ GC_SpacingMark), ('\u{acb}', '\u{acc}', GC_SpacingMark), ('\u{acd}', '\u{acd}', GC_Extend),
+ ('\u{ae2}', '\u{ae3}', GC_Extend), ('\u{b01}', '\u{b01}', GC_Extend), ('\u{b02}', '\u{b03}',
+ GC_SpacingMark), ('\u{b3c}', '\u{b3c}', GC_Extend), ('\u{b3e}', '\u{b3f}', GC_Extend),
+ ('\u{b40}', '\u{b40}', GC_SpacingMark), ('\u{b41}', '\u{b44}', GC_Extend), ('\u{b47}',
+ '\u{b48}', GC_SpacingMark), ('\u{b4b}', '\u{b4c}', GC_SpacingMark), ('\u{b4d}', '\u{b4d}',
+ GC_Extend), ('\u{b56}', '\u{b57}', GC_Extend), ('\u{b62}', '\u{b63}', GC_Extend),
+ ('\u{b82}', '\u{b82}', GC_Extend), ('\u{bbe}', '\u{bbe}', GC_Extend), ('\u{bbf}', '\u{bbf}',
+ GC_SpacingMark), ('\u{bc0}', '\u{bc0}', GC_Extend), ('\u{bc1}', '\u{bc2}', GC_SpacingMark),
+ ('\u{bc6}', '\u{bc8}', GC_SpacingMark), ('\u{bca}', '\u{bcc}', GC_SpacingMark), ('\u{bcd}',
+ '\u{bcd}', GC_Extend), ('\u{bd7}', '\u{bd7}', GC_Extend), ('\u{c00}', '\u{c00}', GC_Extend),
+ ('\u{c01}', '\u{c03}', GC_SpacingMark), ('\u{c3e}', '\u{c40}', GC_Extend), ('\u{c41}',
+ '\u{c44}', GC_SpacingMark), ('\u{c46}', '\u{c48}', GC_Extend), ('\u{c4a}', '\u{c4d}',
+ GC_Extend), ('\u{c55}', '\u{c56}', GC_Extend), ('\u{c62}', '\u{c63}', GC_Extend),
+ ('\u{c81}', '\u{c81}', GC_Extend), ('\u{c82}', '\u{c83}', GC_SpacingMark), ('\u{cbc}',
+ '\u{cbc}', GC_Extend), ('\u{cbe}', '\u{cbe}', GC_SpacingMark), ('\u{cbf}', '\u{cbf}',
+ GC_Extend), ('\u{cc0}', '\u{cc1}', GC_SpacingMark), ('\u{cc2}', '\u{cc2}', GC_Extend),
+ ('\u{cc3}', '\u{cc4}', GC_SpacingMark), ('\u{cc6}', '\u{cc6}', GC_Extend), ('\u{cc7}',
+ '\u{cc8}', GC_SpacingMark), ('\u{cca}', '\u{ccb}', GC_SpacingMark), ('\u{ccc}', '\u{ccd}',
+ GC_Extend), ('\u{cd5}', '\u{cd6}', GC_Extend), ('\u{ce2}', '\u{ce3}', GC_Extend),
+ ('\u{d01}', '\u{d01}', GC_Extend), ('\u{d02}', '\u{d03}', GC_SpacingMark), ('\u{d3e}',
+ '\u{d3e}', GC_Extend), ('\u{d3f}', '\u{d40}', GC_SpacingMark), ('\u{d41}', '\u{d44}',
+ GC_Extend), ('\u{d46}', '\u{d48}', GC_SpacingMark), ('\u{d4a}', '\u{d4c}', GC_SpacingMark),
+ ('\u{d4d}', '\u{d4d}', GC_Extend), ('\u{d4e}', '\u{d4e}', GC_Prepend), ('\u{d57}',
+ '\u{d57}', GC_Extend), ('\u{d62}', '\u{d63}', GC_Extend), ('\u{d82}', '\u{d83}',
+ GC_SpacingMark), ('\u{dca}', '\u{dca}', GC_Extend), ('\u{dcf}', '\u{dcf}', GC_Extend),
+ ('\u{dd0}', '\u{dd1}', GC_SpacingMark), ('\u{dd2}', '\u{dd4}', GC_Extend), ('\u{dd6}',
+ '\u{dd6}', GC_Extend), ('\u{dd8}', '\u{dde}', GC_SpacingMark), ('\u{ddf}', '\u{ddf}',
+ GC_Extend), ('\u{df2}', '\u{df3}', GC_SpacingMark), ('\u{e31}', '\u{e31}', GC_Extend),
+ ('\u{e33}', '\u{e33}', GC_SpacingMark), ('\u{e34}', '\u{e3a}', GC_Extend), ('\u{e47}',
+ '\u{e4e}', GC_Extend), ('\u{eb1}', '\u{eb1}', GC_Extend), ('\u{eb3}', '\u{eb3}',
+ GC_SpacingMark), ('\u{eb4}', '\u{eb9}', GC_Extend), ('\u{ebb}', '\u{ebc}', GC_Extend),
+ ('\u{ec8}', '\u{ecd}', GC_Extend), ('\u{f18}', '\u{f19}', GC_Extend), ('\u{f35}', '\u{f35}',
+ GC_Extend), ('\u{f37}', '\u{f37}', GC_Extend), ('\u{f39}', '\u{f39}', GC_Extend),
+ ('\u{f3e}', '\u{f3f}', GC_SpacingMark), ('\u{f71}', '\u{f7e}', GC_Extend), ('\u{f7f}',
+ '\u{f7f}', GC_SpacingMark), ('\u{f80}', '\u{f84}', GC_Extend), ('\u{f86}', '\u{f87}',
+ GC_Extend), ('\u{f8d}', '\u{f97}', GC_Extend), ('\u{f99}', '\u{fbc}', GC_Extend),
('\u{fc6}', '\u{fc6}', GC_Extend), ('\u{102d}', '\u{1030}', GC_Extend), ('\u{1031}',
'\u{1031}', GC_SpacingMark), ('\u{1032}', '\u{1037}', GC_Extend), ('\u{1039}', '\u{103a}',
GC_Extend), ('\u{103b}', '\u{103c}', GC_SpacingMark), ('\u{103d}', '\u{103e}', GC_Extend),
('\u{1056}', '\u{1057}', GC_SpacingMark), ('\u{1058}', '\u{1059}', GC_Extend), ('\u{105e}',
'\u{1060}', GC_Extend), ('\u{1071}', '\u{1074}', GC_Extend), ('\u{1082}', '\u{1082}',
GC_Extend), ('\u{1084}', '\u{1084}', GC_SpacingMark), ('\u{1085}', '\u{1086}', GC_Extend),
('\u{108d}', '\u{108d}', GC_Extend), ('\u{109d}', '\u{109d}', GC_Extend), ('\u{1100}',
'\u{115f}', GC_L), ('\u{1160}', '\u{11a7}', GC_V), ('\u{11a8}', '\u{11ff}', GC_T),
@@ -863,17 +867,17 @@ pub mod grapheme {
}
pub mod word {
use core::result::Result::{Ok, Err};
pub use self::WordCat::*;
#[allow(non_camel_case_types)]
- #[derive(Clone, Copy, PartialEq, Eq, Debug)]
+ #[derive(Clone, Copy, PartialEq, Eq)]
pub enum WordCat {
WC_ALetter,
WC_Any,
WC_CR,
WC_Double_Quote,
WC_E_Base,
WC_E_Base_GAZ,
WC_E_Modifier,