Bug 1399059 - Part 6 - Return Whitespace from getNext (previously called getEntity). r?Pike draft
author Staś Małolepszy <stas@mozilla.com>
Thu, 21 Sep 2017 16:11:46 +0200
changeset 332 cb961af913a0d50749331338c9bff9d32686b042
parent 331 0dd7555cbd790ab72dde3873a7365dce313cba9f
child 333 72f81f3794c1062407ec1c46b40df1d720bcec5c
push id 105
push user smalolepszy@mozilla.com
push date Thu, 21 Sep 2017 17:02:01 +0000
reviewers Pike
bugs 1399059
Bug 1399059 - Part 6 - Return Whitespace from getNext (previously called getEntity). r?Pike MozReview-Commit-ID: 82hCkcov9ZA
compare_locales/parser.py
compare_locales/tests/test_checks.py
compare_locales/tests/test_defines.py
compare_locales/tests/test_dtd.py
compare_locales/tests/test_ini.py
compare_locales/tests/test_merge.py
compare_locales/tests/test_properties.py
--- a/compare_locales/parser.py
+++ b/compare_locales/parser.py
@@ -35,24 +35,22 @@ class EntityBase(object):
     1: entity definition
     2: entity key (name)
     3: entity value
 
     <!ENTITY key "value">
 
     <--- definition ---->
     '''
-    def __init__(self, ctx, pre_comment, span, def_span, key_span, val_span):
+    def __init__(self, ctx, pre_comment, span, key_span, val_span):
         self.ctx = ctx
         self.span = span
-        self.def_span = def_span
         self.key_span = key_span
         self.val_span = val_span
         self.pre_comment = pre_comment
-        pass
 
     def position(self, offset=0):
         """Get the 1-based line and column of the character
         with given offset into the Entity.
 
         If offset is negative, return the end of the Entity.
         """
         if offset < 0:
@@ -73,29 +71,25 @@ class EntityBase(object):
             pos = self.val_span[0] + offset
         return self.ctx.lines(pos)[0]
 
     # getter helpers
 
     def get_all(self):
         return self.ctx.contents[self.span[0]:self.span[1]]
 
-    def get_def(self):
-        return self.ctx.contents[self.def_span[0]:self.def_span[1]]
-
     def get_key(self):
         return self.ctx.contents[self.key_span[0]:self.key_span[1]]
 
     def get_raw_val(self):
         return self.ctx.contents[self.val_span[0]:self.val_span[1]]
 
     # getters
 
     all = property(get_all)
-    definition = property(get_def)
     key = property(get_key)
     val = property(get_raw_val)
     raw_val = property(get_raw_val)
 
     def __repr__(self):
         return self.key
 
     re_br = re.compile('<br\s*/?>', re.U)
@@ -113,45 +107,43 @@ class EntityBase(object):
         return self.key == other.key and self.val == other.val
 
 
 class Entity(EntityBase):
     pass
 
 
 class Comment(EntityBase):
-    def __init__(self, ctx, span, def_span):
+    def __init__(self, ctx, span):
         self.ctx = ctx
         self.span = span
-        self.def_span = def_span
 
     @property
     def key(self):
         return None
 
     @property
     def val(self):
         return None
 
     def __repr__(self):
-        return self.definition
+        return self.all
 
 
 class Junk(object):
     '''
     An almost-Entity, representing junk data that we didn't parse.
     This way, we can signal bad content as stuff we don't understand.
     And the either fix that, or report real bugs in localizations.
     '''
     junkid = 0
 
     def __init__(self, ctx, span):
         self.ctx = ctx
         self.span = span
-        self.def_span = span
         self.__class__.junkid += 1
         self.key = '_junk_%d_%d-%d' % (self.__class__.junkid, span[0], span[1])
 
     def position(self, offset=0):
         """Get the 1-based line and column of the character
         with given offset into the Entity.
 
         If offset is negative, return the end of the Entity.
@@ -175,30 +167,33 @@ class Junk(object):
 
 
 class Whitespace(EntityBase):
-    '''Entity-like object representing an empty file with whitespace,
-    if allowed
+    '''Entity-like object representing a run of whitespace in the file
+    contents, as matched by the parser's reWhitespace in getNext
     '''
     def __init__(self, ctx, span):
         self.ctx = ctx
-        self.span = self.def_span = self.key_span = self.val_span = span
+        self.span = self.key_span = self.val_span = span
 
     def __repr__(self):
         return self.raw_val
 
 
 class Parser(object):
     capabilities = CAN_SKIP | CAN_MERGE
-    tail = re.compile('\s+\Z')
+    tail = re.compile('\s+')
+    reWhitespace = re.compile('\s+', re.M)
 
     class Context(object):
         "Fixture for content and line numbers"
         def __init__(self, contents):
             self.contents = contents
+            # Subclasses may use bitmasks to keep state.
+            self.state = 0
             self._lines = None
 
         def lines(self, *positions):
             # return line and column tuples, 1-based
             if self._lines is None:
                 nl = re.compile('\n', re.M)
                 self._lines = [m.end()
                                for m in nl.finditer(self.contents)]
@@ -245,46 +240,41 @@ class Parser(object):
     def walk(self, onlyEntities=False, withWhitespace=False):
         if not self.ctx:
             # loading file failed, or we just didn't load anything
             return
         ctx = self.ctx
         contents = ctx.contents
 
         next_entity_offset = 0
-        entity = self.getEntity(ctx, next_entity_offset)
+        entity = self.getNext(ctx, next_entity_offset)
         while entity:
-            if withWhitespace:
-                def_start = entity.def_span[0]
-                if next_entity_offset < def_start:
-                    yield Whitespace(ctx, (next_entity_offset, def_start))
-
-            if (not onlyEntities or isinstance(entity, (Entity, Junk))):
+            if (isinstance(entity, (Entity, Junk))):
+                yield entity
+            elif (not onlyEntities and not isinstance(entity, Whitespace)):
+                yield entity
+            elif (withWhitespace and isinstance(entity, Whitespace)):
                 yield entity
 
-            if withWhitespace:
-                def_end = entity.def_span[1]
-                outer_end = entity.span[1]
-                if def_end < outer_end:
-                    yield Whitespace(ctx, (def_end, outer_end))
-
             next_entity_offset = entity.span[1]
-            entity = self.getEntity(ctx, next_entity_offset)
+            entity = self.getNext(ctx, next_entity_offset)
 
         if len(contents) > next_entity_offset:
             yield Junk(ctx, (next_entity_offset, len(contents)))
 
-    def getEntity(self, ctx, offset):
+    def getNext(self, ctx, offset):
+        m = self.reWhitespace.match(ctx.contents, offset)
+        if m:
+            return Whitespace(ctx, m.span())
         m = self.reKey.match(ctx.contents, offset)
         if m:
-            entity = self.createEntity(ctx, m)
-            return entity
+            return self.createEntity(ctx, m)
         m = self.reComment.match(ctx.contents, offset)
         if m:
-            self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(2)])
+            self.last_comment = Comment(ctx, m.span())
             return self.last_comment
         return self.getTrailing(ctx, offset, self.reKey, self.reComment)
 
     def getTrailing(self, ctx, offset, *expressions):
         junkend = None
         for exp in expressions:
             m = exp.search(ctx.contents, offset)
             if m:
@@ -296,17 +286,17 @@ class Parser(object):
             else:
                 return None
         return Junk(ctx, (offset, junkend))
 
     def createEntity(self, ctx, m):
         pre_comment = self.last_comment
         self.last_comment = None
         return Entity(ctx, pre_comment,
-                      *[m.span(i) for i in xrange(4)])
+                      *[m.span(i) for i in xrange(3)])
 
     @classmethod
     def findDuplicates(cls, entities):
         found = Counter(entity.key for entity in entities)
         for entity_id, cnt in found.items():
             if cnt > 1:
                 yield '{} occurs {} times'.format(entity_id, cnt)
 
@@ -359,54 +349,53 @@ class DTDParser(Parser):
         u'\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \
         u'\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
     # + \U00010000-\U000EFFFF seems to be unsupported in python
 
     # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
     #     [#x0300-#x036F] | [#x203F-#x2040]
     NameChar = NameStartChar + ur'\-\.0-9' + u'\xB7\u0300-\u036F\u203F-\u2040'
     Name = '[' + NameStartChar + '][' + NameChar + ']*'
-    reKey = re.compile('(?:\s*(?P<entity><!ENTITY\s+(?P<key>' + Name +
-                       ')\s+(?P<val>\"[^\"]*\"|\'[^\']*\'?)\s*>)\s*)',
+    reKey = re.compile('<!ENTITY\s+(?P<key>' + Name + ')\s+'
+                       '(?P<val>\"[^\"]*\"|\'[^\']*\'?)\s*>',
                        re.DOTALL | re.M)
     # add BOM to DTDs, details in bug 435002
     reHeader = re.compile(u'^\ufeff')
-    reComment = re.compile('\s*(<!--(-?[%s])*?-->)\s*' % CharMinusDash,
+    reComment = re.compile('(<!--(-?[%s])*?-->)' % CharMinusDash,
                            re.S)
-    rePE = re.compile(u'(?:\s*'
-                      u'(<!ENTITY\s+%\s+(' + Name +
-                      u')\s+SYSTEM\s+(\"[^\"]*\"|\'[^\']*\')\s*>\s*%' + Name +
-                      u';)(?:[ \t]*(?:' + XmlComment + u'\s*)*\n?)?)')
+    rePE = re.compile(u'<!ENTITY\s+%\s+(' + Name + ')\s+'
+                      u'SYSTEM\s+(\"[^\"]*\"|\'[^\']*\')\s*>\s*'
+                      u'%' + Name + ';'
+                      u'(?:[ \t]*(?:' + XmlComment + u'\s*)*\n?)?')
 
-    def getEntity(self, ctx, offset):
+    def getNext(self, ctx, offset):
         '''
-        Overload Parser.getEntity to special-case ParsedEntities.
+        Overload Parser.getNext to special-case ParsedEntities.
         Just check for a parsed entity if that method claims junk.
 
         <!ENTITY % foo SYSTEM "url">
         %foo;
         '''
-        if offset is 0 and self.reHeader.match(ctx.contents):
+        if offset == 0 and self.reHeader.match(ctx.contents):
             offset += 1
-        entity = Parser.getEntity(self, ctx, offset)
+        entity = Parser.getNext(self, ctx, offset)
         if (entity and isinstance(entity, Junk)) or entity is None:
             m = self.rePE.match(ctx.contents, offset)
             if m:
                 self.last_comment = None
-                entity = DTDEntity(ctx, '', *[m.span(i) for i in xrange(4)])
+                entity = DTDEntity(ctx, '', *[m.span(i) for i in xrange(3)])
         return entity
 
     def createEntity(self, ctx, m):
         valspan = m.span('val')
         valspan = (valspan[0]+1, valspan[1]-1)
         pre_comment = self.last_comment
         self.last_comment = None
         return DTDEntity(ctx, pre_comment,
-                         m.span(),
-                         m.span('entity'), m.span('key'), valspan)
+                         m.span(), m.span('key'), valspan)
 
 
 class PropertiesEntity(Entity):
     escape = re.compile(r'\\((?P<uni>u[0-9a-fA-F]{1,4})|'
                         '(?P<nl>\n\s*)|(?P<single>.))', re.M)
     known_escapes = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\'}
 
     @property
@@ -419,39 +408,35 @@ class PropertiesEntity(Entity):
                 return ''
             return self.known_escapes.get(found['single'], found['single'])
 
         return self.escape.sub(unescape, self.raw_val)
 
 
 class PropertiesParser(Parser):
     def __init__(self):
-        self.reKey = re.compile('^\s*([^#!\s\n][^=:\n]*?)\s*[:=][ \t]*', re.M)
-        self.reComment = re.compile(
-            '\s*((?:[#!][^\n]*\n)*(?:[#!][^\n]*))', re.M)
+        self.reKey = re.compile('([^#!\s\n][^=:\n]*?)\s*[:=][ \t]*', re.M)
+        self.reComment = re.compile('([#!][^\n]*\n)*([#!][^\n]*)', re.M)
         self._escapedEnd = re.compile(r'\\+$')
         self._trailingWS = re.compile(r'\s*(?:\n|\Z)', re.M)
         Parser.__init__(self)
 
-    def getEntity(self, ctx, offset):
+    def getNext(self, ctx, offset):
         # overwritten to parse values line by line
         contents = ctx.contents
+
+        m = self.reWhitespace.match(contents, offset)
+        if m:
+            return Whitespace(ctx, m.span())
+
         m = self.reComment.match(contents, offset)
         if m:
-            offset = m.end()
+            self.last_comment = Comment(ctx, m.span())
+            return self.last_comment
 
-            while offset < len(contents):
-                trailing = self._trailingWS.match(contents, offset)
-                if not trailing:
-                    break
-                offset = trailing.end()
-
-            span = (m.start(), offset)
-            self.last_comment = Comment(ctx, span, m.span(1))
-            return self.last_comment
         m = self.reKey.match(contents, offset)
         if m:
             startline = offset = m.end()
             while True:
                 endval = nextline = contents.find('\n', offset)
                 if nextline == -1:
                     endval = offset = len(contents)
                     break
@@ -459,89 +444,101 @@ class PropertiesParser(Parser):
                 _e = self._escapedEnd.search(contents, offset, nextline)
                 offset = nextline + 1
                 if _e is None:
                     break
                 # backslashes at end of line, if 2*n, not escaped
                 if len(_e.group()) % 2 == 0:
                     break
                 startline = offset
+
             # strip trailing whitespace
             ws = self._trailingWS.search(contents, startline)
             if ws:
                 endval = ws.start()
                 offset = ws.end()
+
             pre_comment = self.last_comment
             self.last_comment = None
             entity = PropertiesEntity(
                 ctx, pre_comment,
-                (m.start(), offset),   # full span
-                (m.start(1), endval),   # def span
+                (m.start(), endval),   # full span
                 m.span(1),   # key span
                 (m.end(), endval))   # value span
             return entity
         return self.getTrailing(ctx, offset, self.reKey, self.reComment)
 
 
 class DefinesInstruction(EntityBase):
     '''Entity-like object representing processing instructions in inc files
     '''
-    def __init__(self, ctx, span, def_span, val_span):
+    def __init__(self, ctx, span, val_span):
         self.ctx = ctx
         self.span = span
-        self.def_span = def_span
         self.key_span = self.val_span = val_span
 
     def __repr__(self):
         return self.raw_val
 
 
 class DefinesParser(Parser):
     # can't merge, #unfilter needs to be the last item, which we don't support
     capabilities = CAN_COPY
     tail = re.compile(r'(?!)')  # never match
+    reWhitespace = re.compile('\n+', re.M)
+
+    EMPTY_LINES = 1 << 0
+    PAST_FIRST_LINE = 1 << 1
 
     def __init__(self):
-        self.reComment = re.compile(
-            '(?:[ \t]*\n)*'
-            '((?:^# .*?(?:\n|\Z))+)'
-            '(?:[ \t]*(?:\n|\Z))*', re.M)
-        self.reKey = re.compile('(?:[ \t]*\n)*'
-                                '(#define[ \t]+(\w+)(?:[ \t](.*?))?(?:\n|\Z))'
-                                '(?:[ \t]*(?:\n|\Z))*',
-                                re.M)
-        self.rePI = re.compile('(?:[ \t]*\n)*'
-                               '(#(\w+)[ \t]+(.*?)(?:\n|\Z))'
-                               '(?:[ \t]*(?:\n|\Z))*',
-                               re.M)
+        self.reComment = re.compile('(?:^# .*?\n)*(?:^# [^\n]*)', re.M)
+        self.reKey = re.compile('#define[ \t]+(\w+)(?:[ \t]+([^\n]*))?', re.M)
+        self.rePI = re.compile('#(\w+[ \t]+[^\n]+)', re.M)
         Parser.__init__(self)
 
-    def getEntity(self, ctx, offset):
+    def getNext(self, ctx, offset):
         contents = ctx.contents
+
+        m = self.reWhitespace.match(contents, offset)
+        if m:
+            if ctx.state & self.EMPTY_LINES:
+                return Whitespace(ctx, m.span())
+            if ctx.state & self.PAST_FIRST_LINE and len(m.group()) == 1:
+                return Whitespace(ctx, m.span())
+            else:
+                return Junk(ctx, m.span())
+
+        # We're not in the first line anymore.
+        ctx.state |= self.PAST_FIRST_LINE
+
         m = self.reComment.match(contents, offset)
         if m:
-            self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(2)])
+            self.last_comment = Comment(ctx, m.span())
             return self.last_comment
         m = self.reKey.match(contents, offset)
         if m:
             return self.createEntity(ctx, m)
         m = self.rePI.match(contents, offset)
         if m:
-            return DefinesInstruction(ctx, *[m.span(i) for i in xrange(3)])
+            instr = DefinesInstruction(ctx, m.span(), m.span(1))
+            if instr.val == 'filter emptyLines':
+                ctx.state |= self.EMPTY_LINES
+            if instr.val == 'unfilter emptyLines':
+                ctx.state &= ~ self.EMPTY_LINES
+            return instr
         return self.getTrailing(ctx, offset,
                                 self.reComment, self.reKey, self.rePI)
 
 
 class IniSection(EntityBase):
     '''Entity-like object representing sections in ini files
     '''
-    def __init__(self, ctx, span, def_span, val_span):
+    def __init__(self, ctx, span, val_span):
         self.ctx = ctx
         self.span = span
-        self.def_span = def_span
         self.key_span = self.val_span = val_span
 
     def __repr__(self):
         return self.raw_val
 
 
 class IniParser(Parser):
     '''
@@ -549,39 +546,33 @@ class IniParser(Parser):
     # initial comment
     [cat]
     whitespace*
     #comment
     string=value
     ...
     '''
     def __init__(self):
-        self.reComment = re.compile(
-            '(?:[ \t]*\n)*'
-            '((?:^[;#].*?(?:\n|\Z))+)'
-            '(?:[ \t]*(?:\n|\Z))*', re.M)
-        self.reSection = re.compile(
-            '(?:[ \t]*\n)*'
-            '(\[(.*?)\])'
-            '(?:[ \t]*(?:\n|\Z))*', re.M)
-        self.reKey = re.compile(
-            '(?:[ \t]*\n)*'
-            '((.+?)=(.*))'
-            '(?:[ \t]*(?:\n|\Z))*', re.M)
+        self.reComment = re.compile('(^[;#][^\n]*\n)*(^[;#][^\n]*)', re.M)
+        self.reSection = re.compile('\[(.*?)\]', re.M)
+        self.reKey = re.compile('(.+?)=(.*)', re.M)
         Parser.__init__(self)
 
-    def getEntity(self, ctx, offset):
+    def getNext(self, ctx, offset):
         contents = ctx.contents
+        m = self.reWhitespace.match(contents, offset)
+        if m:
+            return Whitespace(ctx, m.span())
         m = self.reComment.match(contents, offset)
         if m:
-            self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(2)])
+            self.last_comment = Comment(ctx, m.span())
             return self.last_comment
         m = self.reSection.match(contents, offset)
         if m:
-            return IniSection(ctx, *[m.span(i) for i in xrange(3)])
+            return IniSection(ctx, m.span(), m.span(1))
         m = self.reKey.match(contents, offset)
         if m:
             return self.createEntity(ctx, m)
         return self.getTrailing(ctx, offset,
                                 self.reComment, self.reSection, self.reKey)
 
 
 class FluentAttribute(EntityBase):
--- a/compare_locales/tests/test_checks.py
+++ b/compare_locales/tests/test_checks.py
@@ -223,28 +223,26 @@ class TestAndroid(unittest.TestCase):
     Make sure we're hitting our extra rules only if
     we're passing in a DTD file in the embedding/android module.
     """
     apos_msg = u"Apostrophes in Android DTDs need escaping with \\' or " + \
                u"\\u0027, or use \u2019, or put string in quotes."
     quot_msg = u"Quotes in Android DTDs need escaping with \\\" or " + \
                u"\\u0022, or put string in apostrophes."
 
-    def getEntity(self, v):
+    def getNext(self, v):
         ctx = Parser.Context(v)
         return DTDEntity(
-            ctx, '', (0, len(v)), (), (), (0, len(v)))
+            ctx, '', (0, len(v)), (), (0, len(v)))
 
     def getDTDEntity(self, v):
         v = v.replace('"', '&quot;')
         ctx = Parser.Context('<!ENTITY foo "%s">' % v)
         return DTDEntity(
-            ctx,
-            '',
-            (0, len(v) + 16), (), (9, 12), (14, len(v) + 14))
+            ctx, '', (0, len(v) + 16), (9, 12), (14, len(v) + 14))
 
     def test_android_dtd(self):
         """Testing the actual android checks. The logic is involved,
         so this is a lot of nitty gritty detail tests.
         """
         f = File("embedding/android/strings.dtd", "strings.dtd",
                  "embedding/android")
         checker = getChecker(f, extra_tests=['android-dtd'])
@@ -320,33 +318,33 @@ class TestAndroid(unittest.TestCase):
                          (('error', 14, 'truncated \\uXXXX escape',
                            'android'),))
 
     def test_android_prop(self):
         f = File("embedding/android/strings.properties", "strings.properties",
                  "embedding/android")
         checker = getChecker(f, extra_tests=['android-dtd'])
         # good plain string
-        ref = self.getEntity("plain string")
-        l10n = self.getEntity("plain localized string")
+        ref = self.getNext("plain string")
+        l10n = self.getNext("plain localized string")
         self.assertEqual(tuple(checker.check(ref, l10n)),
                          ())
         # no dtd warning
-        ref = self.getEntity("plain string")
-        l10n = self.getEntity("plain localized string &ref;")
+        ref = self.getNext("plain string")
+        l10n = self.getNext("plain localized string &ref;")
         self.assertEqual(tuple(checker.check(ref, l10n)),
                          ())
         # no report on stray ampersand
-        ref = self.getEntity("plain string")
-        l10n = self.getEntity("plain localized string with apos: '")
+        ref = self.getNext("plain string")
+        l10n = self.getNext("plain localized string with apos: '")
         self.assertEqual(tuple(checker.check(ref, l10n)),
                          ())
         # report on bad printf
-        ref = self.getEntity("string with %s")
-        l10n = self.getEntity("string with %S")
+        ref = self.getNext("string with %s")
+        l10n = self.getNext("string with %S")
         self.assertEqual(tuple(checker.check(ref, l10n)),
                          (('error', 0, 'argument 1 `S` should be `s`',
                            'printf'),))
 
     def test_non_android_dtd(self):
         f = File("browser/strings.dtd", "strings.dtd", "browser")
         checker = getChecker(f)
         # good string
--- a/compare_locales/tests/test_defines.py
+++ b/compare_locales/tests/test_defines.py
@@ -6,75 +6,128 @@
 import unittest
 
 from compare_locales.tests import ParserTestMixin
 
 
 mpl2 = '''\
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
-# You can obtain one at http://mozilla.org/MPL/2.0/.
-'''
+# You can obtain one at http://mozilla.org/MPL/2.0/.'''
 
 
 class TestDefinesParser(ParserTestMixin, unittest.TestCase):
 
     filename = 'defines.inc'
 
     def testBrowser(self):
-        self._test(mpl2 + '''#filter emptyLines
+        self._test(mpl2 + '''
+#filter emptyLines
 
 #define MOZ_LANGPACK_CREATOR mozilla.org
 
 # If non-English locales wish to credit multiple contributors, uncomment this
 # variable definition and use the format specified.
 # #define MOZ_LANGPACK_CONTRIBUTORS <em:contributor>Joe Solon</em:contributor>
 
 #unfilter emptyLines
 
 ''', (
             ('Comment', mpl2),
             ('DefinesInstruction', 'filter emptyLines'),
             ('MOZ_LANGPACK_CREATOR', 'mozilla.org'),
             ('Comment', '#define'),
-            ('DefinesInstruction', 'unfilter emptyLines')))
+            ('DefinesInstruction', 'unfilter emptyLines'),
+            ('Junk', '\n')))
 
     def testBrowserWithContributors(self):
-        self._test(mpl2 + '''#filter emptyLines
+        self._test(mpl2 + '''
+#filter emptyLines
 
 #define MOZ_LANGPACK_CREATOR mozilla.org
 
 # If non-English locales wish to credit multiple contributors, uncomment this
 # variable definition and use the format specified.
 #define MOZ_LANGPACK_CONTRIBUTORS <em:contributor>Joe Solon</em:contributor>
 
 #unfilter emptyLines
 
 ''', (
             ('Comment', mpl2),
             ('DefinesInstruction', 'filter emptyLines'),
             ('MOZ_LANGPACK_CREATOR', 'mozilla.org'),
             ('Comment', 'non-English'),
             ('MOZ_LANGPACK_CONTRIBUTORS',
              '<em:contributor>Joe Solon</em:contributor>'),
-            ('DefinesInstruction', 'unfilter emptyLines')))
+            ('DefinesInstruction', 'unfilter emptyLines'),
+            ('Junk', '\n')))
 
     def testCommentWithNonAsciiCharacters(self):
-        self._test(mpl2 + '''#filter emptyLines
+        self._test(mpl2 + '''
+#filter emptyLines
 
 # e.g. #define seamonkey_l10n <DT><A HREF="urn:foo">SeaMonkey v češtině</a>
 #define seamonkey_l10n_long
 
 #unfilter emptyLines
 
 ''', (
             ('Comment', mpl2),
             ('DefinesInstruction', 'filter emptyLines'),
             ('Comment', u'češtině'),
             ('seamonkey_l10n_long', ''),
+            ('DefinesInstruction', 'unfilter emptyLines'),
+            ('Junk', '\n')))
+
+    def test_no_empty_lines(self):
+        self._test('''#define MOZ_LANGPACK_CREATOR mozilla.org
+#define MOZ_LANGPACK_CREATOR mozilla.org
+''', (
+            ('MOZ_LANGPACK_CREATOR', 'mozilla.org'),
+            ('MOZ_LANGPACK_CREATOR', 'mozilla.org')))
+
+    def test_empty_line_between(self):
+        self._test('''#define MOZ_LANGPACK_CREATOR mozilla.org
+
+#define MOZ_LANGPACK_CREATOR mozilla.org
+''', (
+            ('MOZ_LANGPACK_CREATOR', 'mozilla.org'),
+            ('Junk', '\n'),
+            ('MOZ_LANGPACK_CREATOR', 'mozilla.org')))
+
+    def test_empty_line_at_the_beginning(self):
+        self._test('''
+#define MOZ_LANGPACK_CREATOR mozilla.org
+#define MOZ_LANGPACK_CREATOR mozilla.org
+''', (
+            ('Junk', '\n'),
+            ('MOZ_LANGPACK_CREATOR', 'mozilla.org'),
+            ('MOZ_LANGPACK_CREATOR', 'mozilla.org')))
+
+    def test_filter_empty_lines(self):
+        self._test('''#filter emptyLines
+
+#define MOZ_LANGPACK_CREATOR mozilla.org
+#define MOZ_LANGPACK_CREATOR mozilla.org
+#unfilter emptyLines''', (
+            ('DefinesInstruction', 'filter emptyLines'),
+            ('MOZ_LANGPACK_CREATOR', 'mozilla.org'),
+            ('MOZ_LANGPACK_CREATOR', 'mozilla.org'),
+            ('DefinesInstruction', 'unfilter emptyLines')))
+
+    def test_unfilter_empty_lines_with_trailing(self):
+        self._test('''#filter emptyLines
+
+#define MOZ_LANGPACK_CREATOR mozilla.org
+#define MOZ_LANGPACK_CREATOR mozilla.org
+#unfilter emptyLines
+''', (
+            ('DefinesInstruction', 'filter emptyLines'),
+            ('MOZ_LANGPACK_CREATOR', 'mozilla.org'),
+            ('MOZ_LANGPACK_CREATOR', 'mozilla.org'),
             ('DefinesInstruction', 'unfilter emptyLines')))
 
     def testToolkit(self):
         self._test('''#define MOZ_LANG_TITLE English (US)
 ''', (
             ('MOZ_LANG_TITLE', 'English (US)'),))
 
     def testToolkitEmpty(self):
--- a/compare_locales/tests/test_dtd.py
+++ b/compare_locales/tests/test_dtd.py
@@ -100,30 +100,30 @@ class TestDTD(ParserTestMixin, unittest.
                    (('foo.label', 'stuff'),))
 
     def test_unicode_comment(self):
         self._test('<!-- \xe5\x8f\x96 -->',
                    (('Comment', u'\u53d6'),))
 
     def test_empty_file(self):
         self._test('', tuple())
-        self._test('\n', (('Whitespace', '\n'),))
-        self._test('\n\n', (('Whitespace', '\n\n'),))
-        self._test(' \n\n', (('Whitespace', ' \n\n'),))
+        self._test('\n', tuple())
+        self._test('\n\n', tuple())
+        self._test(' \n\n', tuple())
 
     def test_positions(self):
         self.parser.readContents('''\
 <!ENTITY one  "value">
 <!ENTITY  two "other
 escaped value">
 ''')
         one, two = list(self.parser)
         self.assertEqual(one.position(), (1, 1))
         self.assertEqual(one.value_position(), (1, 16))
-        self.assertEqual(one.position(-1), (2, 1))
+        self.assertEqual(one.position(-1), (1, 23))
         self.assertEqual(two.position(), (2, 1))
         self.assertEqual(two.value_position(), (2, 16))
         self.assertEqual(two.value_position(-1), (3, 14))
         self.assertEqual(two.value_position(10), (3, 5))
 
     def test_word_count(self):
         self.parser.readContents('''\
 <!ENTITY a "one">
--- a/compare_locales/tests/test_ini.py
+++ b/compare_locales/tests/test_ini.py
@@ -6,18 +6,17 @@
 import unittest
 
 from compare_locales.tests import ParserTestMixin
 
 
 mpl2 = '''\
 ; This Source Code Form is subject to the terms of the Mozilla Public
 ; License, v. 2.0. If a copy of the MPL was not distributed with this file,
-; You can obtain one at http://mozilla.org/MPL/2.0/.
-'''
+; You can obtain one at http://mozilla.org/MPL/2.0/.'''
 
 
 class TestIniParser(ParserTestMixin, unittest.TestCase):
 
     filename = 'foo.ini'
 
     def testSimpleHeader(self):
         self._test('''; This file is in the UTF-8 encoding
@@ -25,85 +24,90 @@ class TestIniParser(ParserTestMixin, uni
 TitleText=Some Title
 ''', (
             ('Comment', 'UTF-8 encoding'),
             ('IniSection', 'Strings'),
             ('TitleText', 'Some Title'),))
 
     def testMPL2_Space_UTF(self):
         self._test(mpl2 + '''
+
 ; This file is in the UTF-8 encoding
 [Strings]
 TitleText=Some Title
 ''', (
             ('Comment', mpl2),
             ('Comment', 'UTF-8'),
             ('IniSection', 'Strings'),
             ('TitleText', 'Some Title'),))
 
     def testMPL2_Space(self):
         self._test(mpl2 + '''
+
 [Strings]
 TitleText=Some Title
 ''', (
             ('Comment', mpl2),
             ('IniSection', 'Strings'),
             ('TitleText', 'Some Title'),))
 
     def testMPL2_MultiSpace(self):
-        self._test(mpl2 + '''\
+        self._test(mpl2 + '''
 
 ; more comments
 
 [Strings]
 TitleText=Some Title
 ''', (
             ('Comment', mpl2),
             ('Comment', 'more comments'),
             ('IniSection', 'Strings'),
             ('TitleText', 'Some Title'),))
 
     def testMPL2_JunkBeforeCategory(self):
-        self._test(mpl2 + '''\
+        self._test(mpl2 + '''
 Junk
 [Strings]
 TitleText=Some Title
 ''', (
             ('Comment', mpl2),
             ('Junk', 'Junk'),
             ('IniSection', 'Strings'),
             ('TitleText', 'Some Title')))
 
     def test_TrailingComment(self):
         self._test(mpl2 + '''
+
 [Strings]
 TitleText=Some Title
 ;Stray trailing comment
 ''', (
             ('Comment', mpl2),
             ('IniSection', 'Strings'),
             ('TitleText', 'Some Title'),
             ('Comment', 'Stray trailing')))
 
     def test_SpacedTrailingComments(self):
         self._test(mpl2 + '''
+
 [Strings]
 TitleText=Some Title
 
 ;Stray trailing comment
 ;Second stray comment
 
 ''', (
             ('Comment', mpl2),
             ('IniSection', 'Strings'),
             ('TitleText', 'Some Title'),
             ('Comment', 'Second stray comment')))
 
     def test_TrailingCommentsAndJunk(self):
         self._test(mpl2 + '''
+
 [Strings]
 TitleText=Some Title
 
 ;Stray trailing comment
 Junk
 ;Second stray comment
 
 ''', (
@@ -111,30 +115,31 @@ Junk
             ('IniSection', 'Strings'),
             ('TitleText', 'Some Title'),
             ('Comment', 'Stray trailing'),
             ('Junk', 'Junk'),
             ('Comment', 'Second stray comment')))
 
     def test_JunkInbetweenEntries(self):
         self._test(mpl2 + '''
+
 [Strings]
 TitleText=Some Title
 
 Junk
 
 Good=other string
 ''', (
             ('Comment', mpl2),
             ('IniSection', 'Strings'),
             ('TitleText', 'Some Title'),
             ('Junk', 'Junk'),
             ('Good', 'other string')))
 
     def test_empty_file(self):
         self._test('', tuple())
-        self._test('\n', (('Whitespace', '\n'),))
-        self._test('\n\n', (('Whitespace', '\n\n'),))
-        self._test(' \n\n', (('Whitespace', ' \n\n'),))
+        self._test('\n', tuple())
+        self._test('\n\n', tuple())
+        self._test(' \n\n', tuple())
 
 
 if __name__ == '__main__':
     unittest.main()
--- a/compare_locales/tests/test_merge.py
+++ b/compare_locales/tests/test_merge.py
@@ -288,19 +288,19 @@ class TestDTD(unittest.TestCase, Content
                     'missing': 1,
                     'missing_w': 1,
                     'unchanged': 2,
                     'unchanged_w': 2
                 }},
              'details': {
                  'l10n.dtd': [
                      {'error': u'Unparsed content "<!ENTY bar '
-                               u'\'gimmick\'>" '
+                               u'\'gimmick\'>\n" '
                                u'from line 2 column 1 to '
-                               u'line 2 column 22'},
+                               u'line 3 column 1'},
                      {'missingEntity': u'bar'}]
                 }
              })
         mergefile = mozpath.join(self.tmp, "merge", "l10n.dtd")
         self.assertTrue(os.path.isfile(mergefile))
         p = getParser(mergefile)
         p.readFile(mergefile)
         [m, n] = p.parse()
--- a/compare_locales/tests/test_properties.py
+++ b/compare_locales/tests/test_properties.py
@@ -123,19 +123,19 @@ foo = bar
 # which is available as a panel in the Debugger.
 
 
 
 ''',  (('Comment', 'LOCALIZATION NOTE'),))
 
     def test_empty_file(self):
         self._test('', tuple())
-        self._test('\n', (('Whitespace', '\n'),))
-        self._test('\n\n', (('Whitespace', '\n\n'),))
-        self._test(' \n\n', (('Whitespace', ' \n\n'),))
+        self._test('\n', tuple())
+        self._test('\n\n', tuple())
+        self._test(' \n\n', tuple())
 
     def test_positions(self):
         self.parser.readContents('''\
 one = value
 two = other \\
 escaped value
 ''')
         one, two = list(self.parser)