--- a/compare_locales/parser.py
+++ b/compare_locales/parser.py
@@ -27,36 +27,30 @@ CAN_SKIP = 2
CAN_MERGE = 4
class EntityBase(object):
'''
Abstraction layer for a localizable entity.
Currently supported are grammars of the form:
- 1: pre white space
- 2: entity definition
- 3: entity key (name)
- 4: entity value
- 5: post white space
- <--[1]
+ 1: entity definition
+ 2: entity key (name)
+ 3: entity value
+
<!ENTITY key "value">
- <-------[2]--------->
+ <--- definition ---->
'''
- def __init__(self, ctx, pre_comment,
- span, pre_ws_span, def_span,
- key_span, val_span, post_span):
+ def __init__(self, ctx, pre_comment, span, def_span, key_span, val_span):
self.ctx = ctx
self.span = span
- self.pre_ws_span = pre_ws_span
self.def_span = def_span
self.key_span = key_span
self.val_span = val_span
- self.post_span = post_span
self.pre_comment = pre_comment
pass
def position(self, offset=0):
"""Get the 1-based line and column of the character
with given offset into the Entity.
If offset is negative, return the end of the Entity.
@@ -79,40 +73,32 @@ class EntityBase(object):
pos = self.val_span[0] + offset
return self.ctx.lines(pos)[0]
# getter helpers
def get_all(self):
return self.ctx.contents[self.span[0]:self.span[1]]
- def get_pre_ws(self):
- return self.ctx.contents[self.pre_ws_span[0]:self.pre_ws_span[1]]
-
def get_def(self):
return self.ctx.contents[self.def_span[0]:self.def_span[1]]
def get_key(self):
return self.ctx.contents[self.key_span[0]:self.key_span[1]]
def get_raw_val(self):
return self.ctx.contents[self.val_span[0]:self.val_span[1]]
- def get_post(self):
- return self.ctx.contents[self.post_span[0]:self.post_span[1]]
-
# getters
all = property(get_all)
- pre_ws = property(get_pre_ws)
definition = property(get_def)
key = property(get_key)
val = property(get_raw_val)
raw_val = property(get_raw_val)
- post = property(get_post)
def __repr__(self):
return self.key
re_br = re.compile('<br\s*/?>', re.U)
re_sgml = re.compile('</?\w+.*?>', re.U | re.M)
def count_words(self):
@@ -127,23 +113,20 @@ class EntityBase(object):
return self.key == other.key and self.val == other.val
class Entity(EntityBase):
pass
class Comment(EntityBase):
- def __init__(self, ctx, span, pre_ws_span, def_span,
- post_span):
+ def __init__(self, ctx, span, def_span):
self.ctx = ctx
self.span = span
- self.pre_ws_span = pre_ws_span
self.def_span = def_span
- self.post_span = post_span
@property
def key(self):
return None
@property
def val(self):
return None
@@ -158,17 +141,17 @@ class Junk(object):
This way, we can signal bad content as stuff we don't understand.
And the either fix that, or report real bugs in localizations.
'''
junkid = 0
def __init__(self, ctx, span):
self.ctx = ctx
self.span = span
- self.pre_ws = self.definition = self.post = ''
+ self.definition = ''
self.__class__.junkid += 1
self.key = '_junk_%d_%d-%d' % (self.__class__.junkid, span[0], span[1])
def position(self, offset=0):
"""Get the 1-based line and column of the character
with given offset into the Entity.
If offset is negative, return the end of the Entity.
@@ -193,18 +176,17 @@ class Junk(object):
class Whitespace(EntityBase):
'''Entity-like object representing an empty file with whitespace,
if allowed
'''
def __init__(self, ctx, span):
self.ctx = ctx
self.key_span = self.val_span = self.span = span
- self.def_span = self.pre_ws_span = (span[0], span[0])
- self.post_span = (span[1], span[1])
+ self.def_span = (span[0], span[0])
def __repr__(self):
return self.raw_val
class Parser(object):
capabilities = CAN_SKIP | CAN_MERGE
tail = re.compile('\s+\Z')
@@ -282,17 +264,17 @@ class Parser(object):
m = self.reKey.match(ctx.contents, offset)
if m:
offset = m.end()
entity = self.createEntity(ctx, m)
return (entity, offset)
m = self.reComment.match(ctx.contents, offset)
if m:
offset = m.end()
- self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(4)])
+ self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(2)])
return (self.last_comment, offset)
return self.getTrailing(ctx, offset, self.reKey, self.reComment)
def getTrailing(self, ctx, offset, *expressions):
junkend = None
for exp in expressions:
m = exp.search(ctx.contents, offset)
if m:
@@ -304,17 +286,17 @@ class Parser(object):
else:
return (None, offset)
return (Junk(ctx, (offset, junkend)), junkend)
def createEntity(self, ctx, m):
pre_comment = self.last_comment
self.last_comment = None
return Entity(ctx, pre_comment,
- *[m.span(i) for i in xrange(6)])
+ *[m.span(i) for i in xrange(4)])
@classmethod
def findDuplicates(cls, entities):
found = Counter(entity.key for entity in entities)
for entity_id, cnt in found.items():
if cnt > 1:
yield '{} occurs {} times'.format(entity_id, cnt)
@@ -322,27 +304,25 @@ class Parser(object):
def getParser(path):
for item in __constructors:
if re.search(item[0], path):
return item[1]
raise UserWarning("Cannot find Parser")
# Subgroups of the match will:
-# 1: pre white space
-# 2: pre comments
-# 3: entity definition
-# 4: entity key (name)
-# 5: entity value
-# 6: post comment (and white space) in the same line (dtd only)
-# <--[1]
-# <!-- pre comments --> <--[2]
-# <!ENTITY key "value"> <!-- comment -->
+# 1: pre comments
+# 2: entity definition
+# 3: entity key (name)
+# 4: entity value
+
+# <!-- pre comments --> <--[1]
+# <!ENTITY key "value">
#
-# <-------[3]---------><------[6]------>
+# <-------[2]--------->
class DTDEntity(Entity):
def value_position(self, offset=0):
# DTDChecker already returns tuples of (line, col) positions
if isinstance(offset, tuple):
line_pos, col_pos = offset
line, col = super(DTDEntity, self).value_position()
@@ -369,28 +349,27 @@ class DTDParser(Parser):
u'\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \
u'\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
# + \U00010000-\U000EFFFF seems to be unsupported in python
# NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
# [#x0300-#x036F] | [#x203F-#x2040]
NameChar = NameStartChar + ur'\-\.0-9' + u'\xB7\u0300-\u036F\u203F-\u2040'
Name = '[' + NameStartChar + '][' + NameChar + ']*'
- reKey = re.compile('(?:(?P<pre>\s*)(?P<entity><!ENTITY\s+(?P<key>' + Name +
- ')\s+(?P<val>\"[^\"]*\"|\'[^\']*\'?)\s*>)'
- '(?P<post>\s+)?)',
+ reKey = re.compile('(?:\s*(?P<entity><!ENTITY\s+(?P<key>' + Name +
+ ')\s+(?P<val>\"[^\"]*\"|\'[^\']*\'?)\s*>)\s*)',
re.DOTALL | re.M)
# add BOM to DTDs, details in bug 435002
reHeader = re.compile(u'^\ufeff')
- reComment = re.compile('(\s*)(<!--(-?[%s])*?-->)(\s*)' % CharMinusDash,
+ reComment = re.compile('\s*(<!--(-?[%s])*?-->)\s*' % CharMinusDash,
re.S)
- rePE = re.compile(u'(?:(\s*)'
+ rePE = re.compile(u'(?:\s*'
u'(<!ENTITY\s+%\s+(' + Name +
u')\s+SYSTEM\s+(\"[^\"]*\"|\'[^\']*\')\s*>\s*%' + Name +
- u';)([ \t]*(?:' + XmlComment + u'\s*)*\n?)?)')
+ u';)(?:[ \t]*(?:' + XmlComment + u'\s*)*\n?)?)')
def getEntity(self, ctx, offset):
'''
Overload Parser.getEntity to special-case ParsedEntities.
Just check for a parsed entity if that method claims junk.
<!ENTITY % foo SYSTEM "url">
%foo;
@@ -398,29 +377,27 @@ class DTDParser(Parser):
if offset is 0 and self.reHeader.match(ctx.contents):
offset += 1
entity, inneroffset = Parser.getEntity(self, ctx, offset)
if (entity and isinstance(entity, Junk)) or entity is None:
m = self.rePE.match(ctx.contents, offset)
if m:
inneroffset = m.end()
self.last_comment = None
- entity = DTDEntity(ctx, '', *[m.span(i) for i in xrange(6)])
+ entity = DTDEntity(ctx, '', *[m.span(i) for i in xrange(4)])
return (entity, inneroffset)
def createEntity(self, ctx, m):
valspan = m.span('val')
valspan = (valspan[0]+1, valspan[1]-1)
pre_comment = self.last_comment
self.last_comment = None
return DTDEntity(ctx, pre_comment,
m.span(),
- m.span('pre'),
- m.span('entity'), m.span('key'), valspan,
- m.span('post'))
+ m.span('entity'), m.span('key'), valspan)
class PropertiesEntity(Entity):
escape = re.compile(r'\\((?P<uni>u[0-9a-fA-F]{1,4})|'
'(?P<nl>\n\s*)|(?P<single>.))', re.M)
known_escapes = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\'}
@property
@@ -433,36 +410,38 @@ class PropertiesEntity(Entity):
return ''
return self.known_escapes.get(found['single'], found['single'])
return self.escape.sub(unescape, self.raw_val)
class PropertiesParser(Parser):
def __init__(self):
- self.reKey = re.compile('^(\s*)'
+ self.reKey = re.compile('^\s*'
'([^#!\s\n][^=:\n]*?)\s*[:=][ \t]*', re.M)
- self.reComment = re.compile('(\s*)(((?:[#!][^\n]*\n?)+))', re.M)
+ self.reComment = re.compile('\s*(((?:[#!][^\n]*\n?)+))', re.M)
self._escapedEnd = re.compile(r'\\+$')
self._trailingWS = re.compile(r'\s*(?:\n|\Z)', re.M)
Parser.__init__(self)
def getEntity(self, ctx, offset):
# overwritten to parse values line by line
contents = ctx.contents
m = self.reComment.match(contents, offset)
if m:
- spans = [m.span(i) for i in xrange(3)]
- start_trailing = offset = m.end()
+ spans = [m.span(0), m.span(1)]
+ offset = m.end()
while offset < len(contents):
m = self._trailingWS.match(contents, offset)
if not m:
break
offset = m.end()
- spans.append((start_trailing, offset))
+ # Extend the 0th span (the whole match) to the end of the trailing
+ # whitespace.
+ spans[0] = (spans[0][0], offset)
self.last_comment = Comment(ctx, *spans)
return (self.last_comment, offset)
m = self.reKey.match(contents, offset)
if m:
startline = offset = m.end()
while True:
endval = nextline = contents.find('\n', offset)
if nextline == -1:
@@ -482,90 +461,84 @@ class PropertiesParser(Parser):
if ws:
endval = ws.start()
offset = ws.end()
pre_comment = self.last_comment
self.last_comment = None
entity = PropertiesEntity(
ctx, pre_comment,
(m.start(), offset), # full span
- m.span(1), # leading whitespan
- (m.start(2), offset), # entity def span
- m.span(2), # key span
- (m.end(), endval), # value span
- (offset, offset)) # post comment span, empty
+ (m.start(1), offset), # entity def span
+ m.span(1), # key span
+ (m.end(), endval)) # value span
return (entity, offset)
return self.getTrailing(ctx, offset, self.reKey, self.reComment)
class DefinesInstruction(EntityBase):
'''Entity-like object representing processing instructions in inc files
'''
- def __init__(self, ctx, span, pre_ws_span, def_span, val_span, post_span):
+ def __init__(self, ctx, span, def_span, val_span):
self.ctx = ctx
self.span = span
- self.pre_ws_span = pre_ws_span
self.def_span = def_span
self.key_span = self.val_span = val_span
- self.post_span = post_span
def __repr__(self):
return self.raw_val
class DefinesParser(Parser):
# can't merge, #unfilter needs to be the last item, which we don't support
capabilities = CAN_COPY
tail = re.compile(r'(?!)') # never match
def __init__(self):
self.reComment = re.compile(
- '((?:[ \t]*\n)*)'
+ '(?:[ \t]*\n)*'
'((?:^# .*?(?:\n|\Z))+)'
- '((?:[ \t]*(?:\n|\Z))*)', re.M)
- self.reKey = re.compile('((?:[ \t]*\n)*)'
+ '(?:[ \t]*(?:\n|\Z))*', re.M)
+ self.reKey = re.compile('(?:[ \t]*\n)*'
'(#define[ \t]+(\w+)(?:[ \t](.*?))?(?:\n|\Z))'
- '((?:[ \t]*(?:\n|\Z))*)',
+ '(?:[ \t]*(?:\n|\Z))*',
re.M)
- self.rePI = re.compile('((?:[ \t]*\n)*)'
+ self.rePI = re.compile('(?:[ \t]*\n)*'
'(#(\w+)[ \t]+(.*?)(?:\n|\Z))'
- '((?:[ \t]*(?:\n|\Z))*)',
+ '(?:[ \t]*(?:\n|\Z))*',
re.M)
Parser.__init__(self)
def getEntity(self, ctx, offset):
contents = ctx.contents
m = self.reComment.match(contents, offset)
if m:
offset = m.end()
- self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(4)])
+ self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(2)])
return (self.last_comment, offset)
m = self.reKey.match(contents, offset)
if m:
offset = m.end()
return (self.createEntity(ctx, m), offset)
m = self.rePI.match(contents, offset)
if m:
offset = m.end()
- return (DefinesInstruction(ctx, *[m.span(i) for i in xrange(5)]),
+ return (DefinesInstruction(ctx, *[m.span(i) for i in xrange(3)]),
offset)
return self.getTrailing(ctx, offset,
self.reComment, self.reKey, self.rePI)
class IniSection(EntityBase):
'''Entity-like object representing sections in ini files
'''
- def __init__(self, ctx, span, pre_ws_span, def_span, val_span, post_span):
+ def __init__(self, ctx, span, def_span, val_span):
self.ctx = ctx
self.span = span
- self.pre_ws_span = pre_ws_span
self.def_span = def_span
self.key_span = self.val_span = val_span
- self.post_span = post_span
def __repr__(self):
return self.raw_val
class IniParser(Parser):
'''
Parse files of the form:
@@ -573,40 +546,40 @@ class IniParser(Parser):
[cat]
whitespace*
#comment
string=value
...
'''
def __init__(self):
self.reComment = re.compile(
- '((?:[ \t]*\n)*)'
+ '(?:[ \t]*\n)*'
'((?:^[;#].*?(?:\n|\Z))+)'
- '((?:[ \t]*(?:\n|\Z))*)', re.M)
+ '(?:[ \t]*(?:\n|\Z))*', re.M)
self.reSection = re.compile(
- '((?:[ \t]*\n)*)'
+ '(?:[ \t]*\n)*'
'(\[(.*?)\])'
- '((?:[ \t]*(?:\n|\Z))*)', re.M)
+ '(?:[ \t]*(?:\n|\Z))*', re.M)
self.reKey = re.compile(
- '((?:[ \t]*\n)*)'
+ '(?:[ \t]*\n)*'
'((.+?)=(.*))'
- '((?:[ \t]*(?:\n|\Z))*)', re.M)
+ '(?:[ \t]*(?:\n|\Z))*', re.M)
Parser.__init__(self)
def getEntity(self, ctx, offset):
contents = ctx.contents
m = self.reComment.match(contents, offset)
if m:
offset = m.end()
- self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(4)])
+ self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(2)])
return (self.last_comment, offset)
m = self.reSection.match(contents, offset)
if m:
offset = m.end()
- return (IniSection(ctx, *[m.span(i) for i in xrange(5)]), offset)
+ return (IniSection(ctx, *[m.span(i) for i in xrange(3)]), offset)
m = self.reKey.match(contents, offset)
if m:
offset = m.end()
return (self.createEntity(ctx, m), offset)
return self.getTrailing(ctx, offset,
self.reComment, self.reSection, self.reKey)
--- a/compare_locales/tests/test_checks.py
+++ b/compare_locales/tests/test_checks.py
@@ -226,26 +226,25 @@ class TestAndroid(unittest.TestCase):
apos_msg = u"Apostrophes in Android DTDs need escaping with \\' or " + \
u"\\u0027, or use \u2019, or put string in quotes."
quot_msg = u"Quotes in Android DTDs need escaping with \\\" or " + \
u"\\u0022, or put string in apostrophes."
def getEntity(self, v):
ctx = Parser.Context(v)
return DTDEntity(
- ctx, '', (0, len(v)), (), (), (), (0, len(v)), ())
+ ctx, '', (0, len(v)), (), (), (0, len(v)))
def getDTDEntity(self, v):
        v = v.replace('"', '&quot;')
ctx = Parser.Context('<!ENTITY foo "%s">' % v)
return DTDEntity(
ctx,
'',
- (0, len(v) + 16), (), (), (9, 12),
- (14, len(v) + 14), ())
+ (0, len(v) + 16), (), (9, 12), (14, len(v) + 14))
def test_android_dtd(self):
"""Testing the actual android checks. The logic is involved,
so this is a lot of nitty gritty detail tests.
"""
f = File("embedding/android/strings.dtd", "strings.dtd",
"embedding/android")
checker = getChecker(f, extra_tests=['android-dtd'])