bug 1310980, part 1: fix DTDParser draft
authorAxel Hecht <axel@pike.org>
Mon, 24 Oct 2016 16:21:33 +0200
changeset 142 db503092c7006fc987f1ac861c8cd29bc1a75a2a
parent 141 8a8223a74b870adfdea9430cc4d63fd70db83e2c
child 143 60bb42f6bf9eb7f802ed37ffdae162b5903c63f8
push id30
push useraxel@mozilla.com
push dateMon, 24 Oct 2016 14:52:08 +0000
bugs1310980
bug 1310980, part 1: fix DTDParser MozReview-Commit-ID: 2MCZWX5dw71
compare_locales/parser.py
compare_locales/tests/test_checks.py
compare_locales/tests/test_dtd.py
--- a/compare_locales/parser.py
+++ b/compare_locales/parser.py
@@ -11,34 +11,31 @@ import logging
 
 
 class Entity(object):
     '''
     Abstraction layer for a localizable entity.
     Currently supported are grammars of the form:
 
     1: pre white space
-    2: pre comments
-    3: entity definition
-    4: entity key (name)
-    5: entity value
-    6: post comment (and white space) in the same line (dtd only)
+    2: entity definition
+    3: entity key (name)
+    4: entity value
+    5: post white space
                                                  <--[1]
-    <!-- pre comments -->                        <--[2]
-    <!ENTITY key "value"> <!-- comment -->
+    <!ENTITY key "value">
 
-    <-------[3]---------><------[6]------>
+    <-------[2]--------->
     '''
     def __init__(self, ctx, pp,
-                 span, pre_ws_span, pre_comment_span, def_span,
+                 span, pre_ws_span, def_span,
                  key_span, val_span, post_span):
         self.ctx = ctx
         self.span = span
         self.pre_ws_span = pre_ws_span
-        self.pre_comment_span = pre_comment_span
         self.def_span = def_span
         self.key_span = key_span
         self.val_span = val_span
         self.post_span = post_span
         self.pp = pp
         pass
 
     def position(self, offset=0):
@@ -68,20 +65,16 @@ class Entity(object):
     # getter helpers
 
     def get_all(self):
         return self.ctx.contents[self.span[0]:self.span[1]]
 
     def get_pre_ws(self):
         return self.ctx.contents[self.pre_ws_span[0]:self.pre_ws_span[1]]
 
-    def get_pre_comment(self):
-        return self.ctx.contents[self.pre_comment_span[0]:
-                                 self.pre_comment_span[1]]
-
     def get_def(self):
         return self.ctx.contents[self.def_span[0]:self.def_span[1]]
 
     def get_key(self):
         return self.ctx.contents[self.key_span[0]:self.key_span[1]]
 
     def get_val(self):
         return self.pp(self.ctx.contents[self.val_span[0]:self.val_span[1]])
@@ -91,27 +84,48 @@ class Entity(object):
 
     def get_post(self):
         return self.ctx.contents[self.post_span[0]:self.post_span[1]]
 
     # getters
 
     all = property(get_all)
     pre_ws = property(get_pre_ws)
-    pre_comment = property(get_pre_comment)
     definition = property(get_def)
     key = property(get_key)
     val = property(get_val)
     raw_val = property(get_raw_val)
     post = property(get_post)
 
     def __repr__(self):
         return self.key
 
 
+class Comment(Entity):
+    def __init__(self, ctx, span, pre_ws_span, def_span,
+                 post_span):
+        self.ctx = ctx
+        self.span = span
+        self.pre_ws_span = pre_ws_span
+        self.def_span = def_span
+        self.post_span = post_span
+        self.pp = lambda v: v
+
+    @property
+    def key(self):
+        return None
+
+    @property
+    def val(self):
+        return None
+
+    def __repr__(self):
+        return self.all
+
+
 class Junk(object):
     '''
     An almost-Entity, representing junk data that we didn't parse.
     This way, we can signal bad content as stuff we don't understand.
     And the either fix that, or report real bugs in localizations.
     '''
     junkid = 0
 
@@ -193,63 +207,67 @@ class Parser:
             m[e.key] = len(l)
             l.append(e)
         return (l, m)
 
     def postProcessValue(self, val):
         return val
 
     def __iter__(self):
+        return self.walk(onlyEntities=True)
+
+    def walk(self, onlyEntities=False):
         ctx = self.ctx
         contents = ctx.contents
         offset = 0
-        self.header, offset = self.getHeader(contents, offset)
-        self.footer = ''
         entity, offset = self.getEntity(ctx, offset)
         while entity:
-            yield entity
+            if (not onlyEntities or
+                    type(entity) is Entity or
+                    type(entity) is Junk):
+                yield entity
             entity, offset = self.getEntity(ctx, offset)
-        f = self.reFooter.match(contents, offset)
-        if f:
-            self.footer = f.group()
-            offset = f.end()
         if len(contents) > offset:
             yield Junk(ctx, (offset, len(contents)))
-        pass
 
     def getHeader(self, contents, offset):
         header = ''
         h = self.reHeader.match(contents)
         if h:
             header = h.group()
             offset = h.end()
         return (header, offset)
 
     def getEntity(self, ctx, offset):
         m = self.reKey.match(ctx.contents, offset)
         if m:
             offset = m.end()
             entity = self.createEntity(ctx, m)
             return (entity, offset)
-        # first check if footer has a non-empty match,
-        # 'cause then we don't find junk
-        m = self.reFooter.match(ctx.contents, offset)
-        if m and m.end() > offset:
-            return (None, offset)
-        m = self.reKey.search(ctx.contents, offset)
+        m = self.reComment.match(ctx.contents, offset)
+        if m:
+            offset = m.end()
+            return (Comment(ctx, *[m.span(i) for i in xrange(4)]), offset)
+        mkey = self.reKey.search(ctx.contents, offset)
+        mcomment = self.reComment.search(ctx.contents, offset)
+        m = None
+        if mkey and mcomment:
+            m = mkey if mkey.start() < mcomment.start() else mcomment
+        else:
+            m = mkey if mkey else mcomment
         if m:
             # we didn't match, but search, so there's junk between offset
             # and start. We'll match() on the next turn
             junkend = m.start()
             return (Junk(ctx, (offset, junkend)), junkend)
         return (None, offset)
 
     def createEntity(self, ctx, m):
         return Entity(ctx, self.postProcessValue,
-                      *[m.span(i) for i in xrange(7)])
+                      *[m.span(i) for i in xrange(6)])
 
 
 def getParser(path):
     for item in __constructors:
         if re.search(item[0], path):
             return item[1]
     raise UserWarning("Cannot find Parser")
 
@@ -281,53 +299,54 @@ class DTDParser(Parser):
         u'\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \
         u'\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
     # + \U00010000-\U000EFFFF seems to be unsupported in python
 
     # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
     #     [#x0300-#x036F] | [#x203F-#x2040]
     NameChar = NameStartChar + ur'\-\.0-9' + u'\xB7\u0300-\u036F\u203F-\u2040'
     Name = '[' + NameStartChar + '][' + NameChar + ']*'
-    reKey = re.compile('(?:(?P<pre>\s*)(?P<precomment>(?:' + XmlComment +
-                       '\s*)*)(?P<entity><!ENTITY\s+(?P<key>' + Name +
+    reKey = re.compile('(?:(?P<pre>\s*)(?P<entity><!ENTITY\s+(?P<key>' + Name +
                        ')\s+(?P<val>\"[^\"]*\"|\'[^\']*\'?)\s*>)'
-                       '(?P<post>[ \t]*(?:' + XmlComment + '\s*)*\n?)?)',
-                       re.DOTALL)
+                       '(?P<post>\s*)?)',
+                       re.DOTALL | re.M)
     # add BOM to DTDs, details in bug 435002
-    reHeader = re.compile(u'^\ufeff?'
-                          u'(\s*<!--.*(http://mozilla.org/MPL/2.0/|'
-                          u'LICENSE BLOCK)([^-]+-)*[^-]+-->)?', re.S)
-    reFooter = re.compile('\s*(<!--([^-]+-)*[^-]+-->\s*)*$')
-    rePE = re.compile('(?:(\s*)((?:' + XmlComment + '\s*)*)'
-                      '(<!ENTITY\s+%\s+(' + Name +
-                      ')\s+SYSTEM\s+(\"[^\"]*\"|\'[^\']*\')\s*>\s*%' + Name +
-                      ';)([ \t]*(?:' + XmlComment + '\s*)*\n?)?)')
+    reHeader = re.compile(u'^\ufeff?')
+    reComment = re.compile('(\s*)(<!--(-?[%s])*?-->)(\s*)' % CharMinusDash,
+                           re.S)
+    rePE = re.compile(u'(?:(\s*)'
+                      u'(<!ENTITY\s+%\s+(' + Name +
+                      u')\s+SYSTEM\s+(\"[^\"]*\"|\'[^\']*\')\s*>\s*%' + Name +
+                      u';)([ \t]*(?:' + XmlComment + u'\s*)*\n?)?)')
 
     def getEntity(self, ctx, offset):
         '''
         Overload Parser.getEntity to special-case ParsedEntities.
         Just check for a parsed entity if that method claims junk.
 
         <!ENTITY % foo SYSTEM "url">
         %foo;
         '''
         entity, inneroffset = Parser.getEntity(self, ctx, offset)
         if (entity and isinstance(entity, Junk)) or entity is None:
             m = self.rePE.match(ctx.contents, offset)
             if m:
                 inneroffset = m.end()
                 entity = Entity(ctx, self.postProcessValue,
-                                *[m.span(i) for i in xrange(7)])
+                                *[m.span(i) for i in xrange(6)])
         return (entity, inneroffset)
 
     def createEntity(self, ctx, m):
         valspan = m.span('val')
         valspan = (valspan[0]+1, valspan[1]-1)
-        return Entity(ctx, self.postProcessValue, m.span(),
-                      m.span('pre'), m.span('precomment'),
+        pre_comment = unicode(self.last_comment) if self.last_comment else ''
+        self.last_comment = ''
+        return Entity(ctx, self.postProcessValue, pre_comment,
+                      m.span(),
+                      m.span('pre'),
                       m.span('entity'), m.span('key'), valspan,
                       m.span('post'))
 
 
 class PropertiesParser(Parser):
     escape = re.compile(r'\\((?P<uni>u[0-9a-fA-F]{1,4})|'
                         '(?P<nl>\n\s*)|(?P<single>.))', re.M)
     known_escapes = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\'}
@@ -374,17 +393,16 @@ class PropertiesParser(Parser):
                     break
             # strip trailing whitespace
             ws = self._trailingWS.search(contents, m.end(), offset)
             if ws:
                 endval -= ws.end() - ws.start()
             entity = Entity(ctx, self.postProcessValue,
                             (m.start(), offset),   # full span
                             m.span(1),  # leading whitespan
-                            m.span(2),  # leading comment span
                             (m.start(3), offset),   # entity def span
                             m.span(3),   # key span
                             (m.end(), endval),   # value span
                             (offset, offset))  # post comment span, empty
             return (entity, offset)
         m = self.reKey.search(contents, offset)
         if m:
             # we didn't match, but search, so there's junk between offset
@@ -406,17 +424,17 @@ class PropertiesParser(Parser):
         return val
 
 
 class DefinesParser(Parser):
     # can't merge, #unfilter needs to be the last item, which we don't support
     canMerge = False
 
     def __init__(self):
-        self.reKey = re.compile('^(\s*)((?:^#(?!define\s).*\s*)*)'
+        self.reKey = re.compile('^(\s*)'
                                 '(#define[ \t]+(\w+)[ \t]+(.*?))([ \t]*$\n?)',
                                 re.M)
         self.reHeader = re.compile('^\s*(#(?!define\s).*\s*)*')
         self.reFooter = re.compile('\s*(#(?!define\s).*\s*)*$', re.M)
         Parser.__init__(self)
 
 
 class IniParser(Parser):
--- a/compare_locales/tests/test_checks.py
+++ b/compare_locales/tests/test_checks.py
@@ -235,25 +235,25 @@ class TestAndroid(unittest.TestCase):
     """
     apos_msg = u"Apostrophes in Android DTDs need escaping with \\' or " + \
                u"\\u0027, or use \u2019, or put string in quotes."
     quot_msg = u"Quotes in Android DTDs need escaping with \\\" or " + \
                u"\\u0022, or put string in apostrophes."
 
     def getEntity(self, v):
         ctx = Parser.Context(v)
-        return Entity(ctx, lambda s: s, (0, len(v)), (), (0, 0), (), (),
+        return Entity(ctx, lambda s: s, '', (0, len(v)), (), (), (),
                       (0, len(v)), ())
 
     def getDTDEntity(self, v):
         v = v.replace('"', '&quot;')
         ctx = Parser.Context('<!ENTITY foo "%s">' % v)
         return Entity(ctx,
-                      lambda s: s,
-                      (0, len(v) + 16), (), (0, 0), (), (9, 12),
+                      lambda s: s, '',
+                      (0, len(v) + 16), (), (), (9, 12),
                       (14, len(v) + 14), ())
 
     def test_android_dtd(self):
         """Testing the actual android checks. The logic is involved,
         so this is a lot of nitty gritty detail tests.
         """
         f = File("embedding/android/strings.dtd", "strings.dtd",
                  "embedding/android")
--- a/compare_locales/tests/test_dtd.py
+++ b/compare_locales/tests/test_dtd.py
@@ -3,17 +3,17 @@
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 '''Tests for the DTD parser.
 '''
 
 import unittest
 import re
 
-from compare_locales.parser import getParser
+from compare_locales import parser
 from compare_locales.tests import ParserTestMixin
 
 
 class TestDTD(ParserTestMixin, unittest.TestCase):
     '''Tests for the DTD Parser.'''
     filename = 'foo.dtd'
 
     def test_one_entity(self):
@@ -57,30 +57,67 @@ class TestDTD(ParserTestMixin, unittest.
 
     def test_trailing_comment(self):
         self._test('''<!ENTITY first "string">
 <!ENTITY second "string">
 <!--
 <!ENTITY commented "out">
 -->
 ''',
-                   (('first', 'string'), ('second', 'string')))
+                   (('first', 'string'), ('second', 'string'),
+                    ('Comment', 'out')))
 
     def test_license_header(self):
-        p = getParser('foo.dtd')
+        p = parser.getParser('foo.dtd')
         p.readContents(self.resource('triple-license.dtd'))
-        for e in p:
-            self.assertEqual(e.key, 'foo')
-            self.assertEqual(e.val, 'value')
-        self.assert_('MPL' in p.header)
+        entities = list(p.walk())
+        self.assert_(isinstance(entities[0], parser.Comment))
+        self.assertIn('MPL', entities[0].all)
+        e = entities[1]
+        self.assert_(isinstance(e, parser.Entity))
+        self.assertEqual(e.key, 'foo')
+        self.assertEqual(e.val, 'value')
+        self.assertEqual(len(entities), 2)
         p.readContents('''\
 <!-- This Source Code Form is subject to the terms of the Mozilla Public
    - License, v. 2.0. If a copy of the MPL was not distributed with this file,
    - You can obtain one at http://mozilla.org/MPL/2.0/.  -->
 <!ENTITY foo "value">
 ''')
-        for e in p:
-            self.assertEqual(e.key, 'foo')
-            self.assertEqual(e.val, 'value')
-        self.assert_('MPL' in p.header)
+        entities = list(p.walk())
+        self.assert_(isinstance(entities[0], parser.Comment))
+        self.assertIn('MPL', entities[0].all)
+        e = entities[1]
+        self.assert_(isinstance(e, parser.Entity))
+        self.assertEqual(e.key, 'foo')
+        self.assertEqual(e.val, 'value')
+        self.assertEqual(len(entities), 2)
+
+    def testBOM(self):
+        self._test(u'\ufeff<!ENTITY foo.label "stuff">'.encode('utf-8'),
+                   (('foo.label', 'stuff'),))
+
+    def test_trailing_whitespace(self):
+        self._test('<!ENTITY foo.label "stuff">\n  \n',
+                   (('foo.label', 'stuff'),))
+
+    def test_unicode_comment(self):
+        self._test('<!-- \xe5\x8f\x96 -->',
+                   (('Comment', u'\u53d6'),))
+
+    def test_positions(self):
+        self.parser.readContents('''\
+<!ENTITY one  "value">
+<!ENTITY  two "other
+escaped value">
+''')
+        one, two = list(self.parser)
+        self.assertEqual(one.position(), (1, 1))
+        self.assertEqual(one.value_position(), (1, 16))
+        self.assertEqual(one.position(-1), (2, 1))
+        self.assertEqual(two.position(), (2, 1))
+        self.assertEqual(two.value_position(), (2, 16))
+        self.assertEqual(two.value_position(-1), (3, 14))
+        self.assertEqual(two.value_position(10), (3, 5))
+
 
 if __name__ == '__main__':
     unittest.main()