--- a/compare_locales/parser.py
+++ b/compare_locales/parser.py
@@ -27,36 +27,30 @@ CAN_SKIP = 2
CAN_MERGE = 4
class EntityBase(object):
'''
Abstraction layer for a localizable entity.
Currently supported are grammars of the form:
- 1: pre white space
- 2: entity definition
- 3: entity key (name)
- 4: entity value
- 5: post white space
- <--[1]
+ 1: entity definition
+ 2: entity key (name)
+ 3: entity value
+
<!ENTITY key "value">
- <-------[2]--------->
+ <--- definition ---->
'''
- def __init__(self, ctx, pre_comment,
- span, pre_ws_span, def_span,
- key_span, val_span, post_span):
+ def __init__(self, ctx, pre_comment, span, def_span, key_span, val_span):
self.ctx = ctx
self.span = span
- self.pre_ws_span = pre_ws_span
self.def_span = def_span
self.key_span = key_span
self.val_span = val_span
- self.post_span = post_span
self.pre_comment = pre_comment
pass
def position(self, offset=0):
"""Get the 1-based line and column of the character
with given offset into the Entity.
If offset is negative, return the end of the Entity.
@@ -79,40 +73,32 @@ class EntityBase(object):
pos = self.val_span[0] + offset
return self.ctx.lines(pos)[0]
# getter helpers
def get_all(self):
return self.ctx.contents[self.span[0]:self.span[1]]
- def get_pre_ws(self):
- return self.ctx.contents[self.pre_ws_span[0]:self.pre_ws_span[1]]
-
def get_def(self):
return self.ctx.contents[self.def_span[0]:self.def_span[1]]
def get_key(self):
return self.ctx.contents[self.key_span[0]:self.key_span[1]]
def get_raw_val(self):
return self.ctx.contents[self.val_span[0]:self.val_span[1]]
- def get_post(self):
- return self.ctx.contents[self.post_span[0]:self.post_span[1]]
-
# getters
all = property(get_all)
- pre_ws = property(get_pre_ws)
definition = property(get_def)
key = property(get_key)
val = property(get_raw_val)
raw_val = property(get_raw_val)
- post = property(get_post)
def __repr__(self):
return self.key
re_br = re.compile('<br\s*/?>', re.U)
re_sgml = re.compile('</?\w+.*?>', re.U | re.M)
def count_words(self):
@@ -127,23 +113,20 @@ class EntityBase(object):
return self.key == other.key and self.val == other.val
class Entity(EntityBase):
pass
class Comment(EntityBase):
- def __init__(self, ctx, span, pre_ws_span, def_span,
- post_span):
+ def __init__(self, ctx, span, def_span):
self.ctx = ctx
self.span = span
- self.pre_ws_span = pre_ws_span
self.def_span = def_span
- self.post_span = post_span
@property
def key(self):
return None
@property
def val(self):
return None
@@ -158,17 +141,17 @@ class Junk(object):
This way, we can signal bad content as stuff we don't understand.
And the either fix that, or report real bugs in localizations.
'''
junkid = 0
def __init__(self, ctx, span):
self.ctx = ctx
self.span = span
- self.pre_ws = self.definition = self.post = ''
+ self.definition = ''
self.__class__.junkid += 1
self.key = '_junk_%d_%d-%d' % (self.__class__.junkid, span[0], span[1])
def position(self, offset=0):
"""Get the 1-based line and column of the character
with given offset into the Entity.
If offset is negative, return the end of the Entity.
@@ -193,18 +176,17 @@ class Junk(object):
class Whitespace(EntityBase):
'''Entity-like object representing an empty file with whitespace,
if allowed
'''
def __init__(self, ctx, span):
self.ctx = ctx
self.key_span = self.val_span = self.span = span
- self.def_span = self.pre_ws_span = (span[0], span[0])
- self.post_span = (span[1], span[1])
+ self.def_span = (span[0], span[0])
def __repr__(self):
return self.raw_val
class Parser(object):
capabilities = CAN_SKIP | CAN_MERGE
tail = re.compile('\s+\Z')
@@ -282,17 +264,17 @@ class Parser(object):
m = self.reKey.match(ctx.contents, offset)
if m:
offset = m.end()
entity = self.createEntity(ctx, m)
return (entity, offset)
m = self.reComment.match(ctx.contents, offset)
if m:
offset = m.end()
- self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(4)])
+ self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(2)])
return (self.last_comment, offset)
return self.getTrailing(ctx, offset, self.reKey, self.reComment)
def getTrailing(self, ctx, offset, *expressions):
junkend = None
for exp in expressions:
m = exp.search(ctx.contents, offset)
if m:
@@ -304,17 +286,17 @@ class Parser(object):
else:
return (None, offset)
return (Junk(ctx, (offset, junkend)), junkend)
def createEntity(self, ctx, m):
pre_comment = self.last_comment
self.last_comment = None
return Entity(ctx, pre_comment,
- *[m.span(i) for i in xrange(6)])
+ *[m.span(i) for i in xrange(4)])
@classmethod
def findDuplicates(cls, entities):
found = Counter(entity.key for entity in entities)
for entity_id, cnt in found.items():
if cnt > 1:
yield '{} occurs {} times'.format(entity_id, cnt)
@@ -322,27 +304,25 @@ class Parser(object):
def getParser(path):
for item in __constructors:
if re.search(item[0], path):
return item[1]
raise UserWarning("Cannot find Parser")
# Subgroups of the match will:
-# 1: pre white space
-# 2: pre comments
-# 3: entity definition
-# 4: entity key (name)
-# 5: entity value
-# 6: post comment (and white space) in the same line (dtd only)
-# <--[1]
-# <!-- pre comments --> <--[2]
-# <!ENTITY key "value"> <!-- comment -->
+# 1: pre comments
+# 2: entity definition
+# 3: entity key (name)
+# 4: entity value
+
+# <!-- pre comments --> <--[1]
+# <!ENTITY key "value">
#
-# <-------[3]---------><------[6]------>
+# <-------[2]--------->
class DTDEntity(Entity):
def value_position(self, offset=0):
# DTDChecker already returns tuples of (line, col) positions
if isinstance(offset, tuple):
line_pos, col_pos = offset
line, col = super(DTDEntity, self).value_position()
@@ -369,28 +349,27 @@ class DTDParser(Parser):
u'\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \
u'\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
# + \U00010000-\U000EFFFF seems to be unsupported in python
# NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
# [#x0300-#x036F] | [#x203F-#x2040]
NameChar = NameStartChar + ur'\-\.0-9' + u'\xB7\u0300-\u036F\u203F-\u2040'
Name = '[' + NameStartChar + '][' + NameChar + ']*'
- reKey = re.compile('(?:(?P<pre>\s*)(?P<entity><!ENTITY\s+(?P<key>' + Name +
- ')\s+(?P<val>\"[^\"]*\"|\'[^\']*\'?)\s*>)'
- '(?P<post>\s+)?)',
+ reKey = re.compile('(?:\s*(?P<entity><!ENTITY\s+(?P<key>' + Name +
+ ')\s+(?P<val>\"[^\"]*\"|\'[^\']*\'?)\s*>)\s*)',
re.DOTALL | re.M)
# add BOM to DTDs, details in bug 435002
reHeader = re.compile(u'^\ufeff')
- reComment = re.compile('(\s*)(<!--(-?[%s])*?-->)(\s*)' % CharMinusDash,
+ reComment = re.compile('\s*(<!--(-?[%s])*?-->)\s*' % CharMinusDash,
re.S)
- rePE = re.compile(u'(?:(\s*)'
+ rePE = re.compile(u'(?:\s*'
u'(<!ENTITY\s+%\s+(' + Name +
u')\s+SYSTEM\s+(\"[^\"]*\"|\'[^\']*\')\s*>\s*%' + Name +
- u';)([ \t]*(?:' + XmlComment + u'\s*)*\n?)?)')
+ u';)(?:[ \t]*(?:' + XmlComment + u'\s*)*\n?)?)')
def getEntity(self, ctx, offset):
'''
Overload Parser.getEntity to special-case ParsedEntities.
Just check for a parsed entity if that method claims junk.
<!ENTITY % foo SYSTEM "url">
%foo;
@@ -398,29 +377,27 @@ class DTDParser(Parser):
if offset is 0 and self.reHeader.match(ctx.contents):
offset += 1
entity, inneroffset = Parser.getEntity(self, ctx, offset)
if (entity and isinstance(entity, Junk)) or entity is None:
m = self.rePE.match(ctx.contents, offset)
if m:
inneroffset = m.end()
self.last_comment = None
- entity = DTDEntity(ctx, '', *[m.span(i) for i in xrange(6)])
+ entity = DTDEntity(ctx, '', *[m.span(i) for i in xrange(4)])
return (entity, inneroffset)
def createEntity(self, ctx, m):
valspan = m.span('val')
valspan = (valspan[0]+1, valspan[1]-1)
pre_comment = self.last_comment
self.last_comment = None
return DTDEntity(ctx, pre_comment,
m.span(),
- m.span('pre'),
- m.span('entity'), m.span('key'), valspan,
- m.span('post'))
+ m.span('entity'), m.span('key'), valspan)
class PropertiesEntity(Entity):
escape = re.compile(r'\\((?P<uni>u[0-9a-fA-F]{1,4})|'
'(?P<nl>\n\s*)|(?P<single>.))', re.M)
known_escapes = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\'}
@property
@@ -433,36 +410,38 @@ class PropertiesEntity(Entity):
return ''
return self.known_escapes.get(found['single'], found['single'])
return self.escape.sub(unescape, self.raw_val)
class PropertiesParser(Parser):
def __init__(self):
- self.reKey = re.compile('^(\s*)'
+ self.reKey = re.compile('^\s*'
'([^#!\s\n][^=:\n]*?)\s*[:=][ \t]*', re.M)
- self.reComment = re.compile('(\s*)(((?:[#!][^\n]*\n?)+))', re.M)
+ self.reComment = re.compile('\s*(((?:[#!][^\n]*\n?)+))', re.M)
self._escapedEnd = re.compile(r'\\+$')
self._trailingWS = re.compile(r'\s*(?:\n|\Z)', re.M)
Parser.__init__(self)
def getEntity(self, ctx, offset):
# overwritten to parse values line by line
contents = ctx.contents
m = self.reComment.match(contents, offset)
if m:
- spans = [m.span(i) for i in xrange(3)]
- start_trailing = offset = m.end()
+ spans = [m.span(0), m.span(1)]
+ offset = m.end()
while offset < len(contents):
m = self._trailingWS.match(contents, offset)
if not m:
break
offset = m.end()
- spans.append((start_trailing, offset))
+ # Extend the 0th span (the whole match) to the end of the trailing
+ # whitespace.
+ spans[0] = (spans[0][0], offset)
self.last_comment = Comment(ctx, *spans)
return (self.last_comment, offset)
m = self.reKey.match(contents, offset)
if m:
startline = offset = m.end()
while True:
endval = nextline = contents.find('\n', offset)
if nextline == -1:
@@ -482,90 +461,84 @@ class PropertiesParser(Parser):
if ws:
endval = ws.start()
offset = ws.end()
pre_comment = self.last_comment
self.last_comment = None
entity = PropertiesEntity(
ctx, pre_comment,
(m.start(), offset), # full span
- m.span(1), # leading whitespan
- (m.start(2), offset), # entity def span
- m.span(2), # key span
- (m.end(), endval), # value span
- (offset, offset)) # post comment span, empty
+ (m.start(1), offset), # entity def span
+ m.span(1), # key span
+ (m.end(), endval)) # value span
return (entity, offset)
return self.getTrailing(ctx, offset, self.reKey, self.reComment)
class DefinesInstruction(EntityBase):
'''Entity-like object representing processing instructions in inc files
'''
- def __init__(self, ctx, span, pre_ws_span, def_span, val_span, post_span):
+ def __init__(self, ctx, span, def_span, val_span):
self.ctx = ctx
self.span = span
- self.pre_ws_span = pre_ws_span
self.def_span = def_span
self.key_span = self.val_span = val_span
- self.post_span = post_span
def __repr__(self):
return self.raw_val
class DefinesParser(Parser):
# can't merge, #unfilter needs to be the last item, which we don't support
capabilities = CAN_COPY
tail = re.compile(r'(?!)') # never match
def __init__(self):
self.reComment = re.compile(
- '((?:[ \t]*\n)*)'
+ '(?:[ \t]*\n)*'
'((?:^# .*?(?:\n|\Z))+)'
- '((?:[ \t]*(?:\n|\Z))*)', re.M)
- self.reKey = re.compile('((?:[ \t]*\n)*)'
+ '(?:[ \t]*(?:\n|\Z))*', re.M)
+ self.reKey = re.compile('(?:[ \t]*\n)*'
'(#define[ \t]+(\w+)(?:[ \t](.*?))?(?:\n|\Z))'
- '((?:[ \t]*(?:\n|\Z))*)',
+ '(?:[ \t]*(?:\n|\Z))*',
re.M)
- self.rePI = re.compile('((?:[ \t]*\n)*)'
+ self.rePI = re.compile('(?:[ \t]*\n)*'
'(#(\w+)[ \t]+(.*?)(?:\n|\Z))'
- '((?:[ \t]*(?:\n|\Z))*)',
+ '(?:[ \t]*(?:\n|\Z))*',
re.M)
Parser.__init__(self)
def getEntity(self, ctx, offset):
contents = ctx.contents
m = self.reComment.match(contents, offset)
if m:
offset = m.end()
- self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(4)])
+ self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(2)])
return (self.last_comment, offset)
m = self.reKey.match(contents, offset)
if m:
offset = m.end()
return (self.createEntity(ctx, m), offset)
m = self.rePI.match(contents, offset)
if m:
offset = m.end()
- return (DefinesInstruction(ctx, *[m.span(i) for i in xrange(5)]),
+ return (DefinesInstruction(ctx, *[m.span(i) for i in xrange(3)]),
offset)
return self.getTrailing(ctx, offset,
self.reComment, self.reKey, self.rePI)
class IniSection(EntityBase):
'''Entity-like object representing sections in ini files
'''
- def __init__(self, ctx, span, pre_ws_span, def_span, val_span, post_span):
+ def __init__(self, ctx, span, def_span, val_span):
self.ctx = ctx
self.span = span
- self.pre_ws_span = pre_ws_span
self.def_span = def_span
self.key_span = self.val_span = val_span
- self.post_span = post_span
def __repr__(self):
return self.raw_val
class IniParser(Parser):
'''
Parse files of the form:
@@ -573,40 +546,40 @@ class IniParser(Parser):
[cat]
whitespace*
#comment
string=value
...
'''
def __init__(self):
self.reComment = re.compile(
- '((?:[ \t]*\n)*)'
+ '(?:[ \t]*\n)*'
'((?:^[;#].*?(?:\n|\Z))+)'
- '((?:[ \t]*(?:\n|\Z))*)', re.M)
+ '(?:[ \t]*(?:\n|\Z))*', re.M)
self.reSection = re.compile(
- '((?:[ \t]*\n)*)'
+ '(?:[ \t]*\n)*'
'(\[(.*?)\])'
- '((?:[ \t]*(?:\n|\Z))*)', re.M)
+ '(?:[ \t]*(?:\n|\Z))*', re.M)
self.reKey = re.compile(
- '((?:[ \t]*\n)*)'
+ '(?:[ \t]*\n)*'
'((.+?)=(.*))'
- '((?:[ \t]*(?:\n|\Z))*)', re.M)
+ '(?:[ \t]*(?:\n|\Z))*', re.M)
Parser.__init__(self)
def getEntity(self, ctx, offset):
contents = ctx.contents
m = self.reComment.match(contents, offset)
if m:
offset = m.end()
- self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(4)])
+ self.last_comment = Comment(ctx, *[m.span(i) for i in xrange(2)])
return (self.last_comment, offset)
m = self.reSection.match(contents, offset)
if m:
offset = m.end()
- return (IniSection(ctx, *[m.span(i) for i in xrange(5)]), offset)
+ return (IniSection(ctx, *[m.span(i) for i in xrange(3)]), offset)
m = self.reKey.match(contents, offset)
if m:
offset = m.end()
return (self.createEntity(ctx, m), offset)
return self.getTrailing(ctx, offset,
self.reComment, self.reSection, self.reKey)
--- a/compare_locales/tests/test_checks.py
+++ b/compare_locales/tests/test_checks.py
@@ -226,26 +226,25 @@ class TestAndroid(unittest.TestCase):
apos_msg = u"Apostrophes in Android DTDs need escaping with \\' or " + \
u"\\u0027, or use \u2019, or put string in quotes."
quot_msg = u"Quotes in Android DTDs need escaping with \\\" or " + \
u"\\u0022, or put string in apostrophes."
def getEntity(self, v):
ctx = Parser.Context(v)
return DTDEntity(
- ctx, '', (0, len(v)), (), (), (), (0, len(v)), ())
+ ctx, '', (0, len(v)), (), (), (0, len(v)))
def getDTDEntity(self, v):
        v = v.replace('"', '&quot;')
ctx = Parser.Context('<!ENTITY foo "%s">' % v)
return DTDEntity(
ctx,
'',
- (0, len(v) + 16), (), (), (9, 12),
- (14, len(v) + 14), ())
+ (0, len(v) + 16), (), (9, 12), (14, len(v) + 14))
def test_android_dtd(self):
"""Testing the actual android checks. The logic is involved,
so this is a lot of nitty gritty detail tests.
"""
f = File("embedding/android/strings.dtd", "strings.dtd",
"embedding/android")
checker = getChecker(f, extra_tests=['android-dtd'])