python/moz/l10n/formats/properties/parse.py (205 lines of code) (raw):
# Copyright Mozilla Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from enum import Enum
from re import Match, compile
from typing import Callable
from ...model import Comment, Entry, LinePos, Message, PatternMessage, Resource, Section
from ...util.printf import parse_printf_pattern
from .. import Format
class LineKind(Enum):
    """Kinds of token that `PropertiesParser` yields while scanning a source."""

    # A blank line, or the end of the source.
    EMPTY = 0

    # A line starting with `#` or `!`.
    COMMENT = 1

    # The key of an entry, up to its `=`/`:` separator or whitespace.
    KEY = 2

    # The value of an entry; emitted directly after its KEY.
    VALUE = 3
# Matches a single backslash escape. Alternatives are tried in this order:
#   1. a UTF-16 surrogate pair written as two consecutive \uXXXX escapes,
#   2. a \u escape followed by one to four hexadecimal digits,
#   3. a backslash followed by any one character other than a newline.
esc_re = compile(
    r"\\("
    r"u[Dd][89ABab][0-9A-Fa-f]{2}\\u[Dd][C-Fc-f][0-9A-Fa-f]{2}"
    r"|u[0-9A-Fa-f]{1,4}"
    r"|.)"
)

# Single-character escapes that stand for a control character.
_SIMPLE_ESCAPES = {"t": "\t", "n": "\n", "f": "\f", "r": "\r"}


def esc_parse(match: Match[str]) -> str:
    """
    Return the replacement text for one escape matched by `esc_re`.

    - A surrogate pair (`\\uD83D\\uDE00`) becomes the single code point
      it encodes, rather than two lone surrogates that could not be
      encoded as UTF-8 later.
    - `\\uXXXX` becomes the character with that code point.
    - `\\t`, `\\n`, `\\f` and `\\r` become the matching control character.
    - Any other escaped character stands for itself.
    """
    esc = match.group(1)
    if len(esc) > 1 and esc[0] == "u":
        if len(esc) == 11:
            # "uHHHH\uLLLL": only the surrogate-pair alternative is this long.
            high = int(esc[1:5], 16)
            low = int(esc[7:], 16)
            return chr(0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00))
        return chr(int(esc[1:], 16))
    return _SIMPLE_ESCAPES.get(esc, esc)
def properties_parse(
    source: bytes | str,
    encoding: str = "utf-8",
    parse_message: Callable[[str], Message] | None = None,
) -> Resource[Message]:
    """
    Parse a .properties file into a message resource.

    `source` may be text, or bytes which are decoded using `encoding`.

    By default, each message is a PatternMessage with its unescaped value
    as the only pattern part. To customize that, pass a
    `parse_message(str) -> Message` callable; it receives the unescaped value.

    A comment directly before a key is attached to that entry.
    A comment followed by an empty line or the end of the source becomes
    the resource comment if it comes before any entry and no resource
    comment is set yet; otherwise it is added as a standalone Comment.

    The parsed resource will not include any metadata.
    """
    text = source if isinstance(source, str) else source.decode(encoding)

    entries: list[Entry[Message] | Comment] = []
    resource = Resource(Format.properties, [Section((), entries)])

    # Comment lines collected so far that have not yet been attached.
    pending_comment = ""
    pending_comment_pos: LinePos | None = None
    pending_start = 0

    # Line position of the latest entry, kept until the next non-value token
    # shows on which line that entry ended.
    open_linepos: LinePos | None = None

    # The entry whose KEY was just seen and whose VALUE may follow.
    current: Entry[Message] | None = None

    for kind, line, value in PropertiesParser(text):
        if kind == LineKind.VALUE:
            assert current
            if parse_message:
                current.value = parse_message(value)
            else:
                assert isinstance(current.value, PatternMessage)
                current.value.pattern.append(value)
            if current.linepos and line > current.linepos.key:
                current.linepos.value = line
                current.linepos.end = line + 1
            current = None
            continue

        # A non-value token closes the previous entry: a multi-line value
        # pushes its end line forward.
        if open_linepos:
            open_linepos.end = max(open_linepos.end, line)
            open_linepos = None
        current = None

        if kind == LineKind.KEY:
            current = Entry(
                id=(value,),
                value=PatternMessage([]),
                comment=pending_comment,
                linepos=LinePos(pending_start or line, line, line, line + 1),
            )
            open_linepos = current.linepos
            entries.append(current)
            pending_comment = ""
            pending_comment_pos = None
            pending_start = 0
        elif kind == LineKind.COMMENT:
            if pending_comment:
                pending_comment += "\n" + value
                assert pending_comment_pos
                pending_comment_pos.end = line + 1
            else:
                pending_comment = value
                pending_start = line
                pending_comment_pos = LinePos(line, line, line, line + 1)
        elif pending_comment:
            # empty line or EOF after a comment
            if entries or resource.comment:
                entries.append(
                    Comment(comment=pending_comment, linepos=pending_comment_pos)
                )
            else:
                resource.comment = pending_comment
            pending_comment = ""
            pending_comment_pos = None
            pending_start = 0

    return resource
def properties_parse_message(
    source: str, *, printf_placeholders: bool = False
) -> PatternMessage:
    """
    Parse a .properties message value.

    Leading whitespace is skipped, escaped line breaks are joined,
    and backslash escapes are resolved.

    If `printf_placeholders` is enabled,
    printf specifiers are parsed as variables.
    They are looked for in the unescaped value, so that escapes are handled
    the same way whether or not this option is set.

    Raises ValueError if `source` is not exactly one value,
    optionally followed by empty lines.
    This includes an empty or whitespace-only `source`.
    """
    parser = PropertiesParser(source)
    # Start in value mode, so that the source is not read as a key.
    parser.at_value = True
    (kind, _, value), *rest = parser
    if kind != LineKind.VALUE or any(
        rest_kind != LineKind.EMPTY for rest_kind, _, _ in rest
    ):
        raise ValueError("Source is not a .properties value")
    if printf_placeholders:
        # NOTE(review): assumes parse_printf_pattern() expects message text
        # with escapes already resolved -- confirm against its definition.
        return PatternMessage(list(parse_printf_pattern(value)))
    return PatternMessage([value])
class PropertiesParser:
    """
    Iterator that splits .properties source text into tokens.

    Each item is a `(kind, line, text)` tuple, where `line` is the 1-based
    line number at which the token was read and `text` is its unescaped
    content (always empty for `LineKind.EMPTY`).

    A KEY is followed by a VALUE unless the rest of its line is empty.
    Set `at_value` to True before iterating to read the source as a value.

    The source is scanned by index and never re-sliced, so the cost of
    reading a token does not depend on how much source text follows it.
    """

    def __init__(self, source: str) -> None:
        self.source = source
        self.pos = 0  # index of the next unread character
        self.line_pos = 1  # 1-based number of the current line
        self.at_value = False  # whether the next token is a value
        self.done = False  # set once the end of the source was reported

    def __iter__(self) -> PropertiesParser:
        return self

    def __next__(self) -> tuple[LineKind, int, str]:
        """
        Return the next token.

        The end of the source is reported once as an EMPTY token;
        the call after that raises StopIteration.
        """
        if self.done:
            raise StopIteration
        lp = self.line_pos
        self.ws()
        if self.pos == len(self.source):
            self.done = True
            return LineKind.EMPTY, lp, ""
        if self.nl():
            self.at_value = False
            return LineKind.EMPTY, lp, ""
        if self.at_value:
            self.at_value = False
            return LineKind.VALUE, lp, self._read_value()
        if self.source.startswith(("#", "!"), self.pos):
            return LineKind.COMMENT, lp, self._read_comment()
        return LineKind.KEY, lp, self._read_key()

    def _read_value(self) -> str:
        """Read a value, joining lines that end in an escaped line break."""
        source = self.source
        size = len(source)
        line_start = idx = self.pos
        at_escape = False
        at_cr = False
        lines: list[str] = []
        while idx < size:
            ch = source[idx]
            if ch == "\n" or ch == "\r":
                if at_escape:
                    # Escaped line break: the value continues on the next
                    # line. Drop the backslash that precedes the break.
                    at_escape = False
                    self.line_pos += 1
                    lines.append(source[line_start : idx - 1])
                    at_cr = ch == "\r"
                    line_start = idx + 1
                elif ch == "\n" and at_cr:
                    # Second half of an escaped CRLF line break.
                    at_cr = False
                    line_start = idx + 1
                else:
                    # Unescaped line break: end of the value.
                    break
            else:
                at_cr = False
                if at_escape:
                    at_escape = False
                elif ch == "\\":
                    at_escape = True
            idx += 1
        self.pos = idx
        lines.append(source[line_start:idx])
        self.nl()
        # Leading whitespace of each continuation line is not part of the value.
        return "".join(esc_re.sub(esc_parse, line.lstrip("\f\t ")) for line in lines)

    def _read_comment(self) -> str:
        """Read a comment line, without its marker and line break."""
        source = self.source
        size = len(source)
        self.pos += 1  # the "#" or "!" marker
        if source.startswith(" ", self.pos):
            # Ignore one space after #, if present.
            self.pos += 1
        start = end = self.pos
        while end < size and source[end] not in "\n\r":
            end += 1
        self.pos = end
        self.nl()
        return source[start:end]

    def _read_key(self) -> str:
        """Read a key and its separator; the next token is then a value."""
        source = self.source
        size = len(source)
        start = idx = self.pos
        at_escape = False
        while idx < size:
            ch = source[idx]
            if at_escape:
                # An escaped character is always a part of the key.
                at_escape = False
            elif ch == "\\":
                at_escape = True
            elif ch in "\n\r\t\f =:":
                break
            idx += 1
        end = self.pos = idx
        self.ws()
        if source.startswith(("=", ":"), self.pos):
            self.pos += 1
        self.at_value = True
        return esc_re.sub(esc_parse, source[start:end])

    def nl(self) -> bool:
        """
        Skip one line break (LF, CR, or CRLF) at the current position.

        Returns True and advances the line number if one was found.
        """
        if self.source.startswith("\n", self.pos):
            self.pos += 1
            self.line_pos += 1
            return True
        elif self.source.startswith("\r", self.pos):
            self.pos += 1
            self.line_pos += 1
            if self.source.startswith("\n", self.pos):
                self.pos += 1
            return True
        return False

    def ws(self) -> None:
        """Skip any spaces, tabs, and form feeds at the current position."""
        source = self.source
        size = len(source)
        pos = self.pos
        while pos < size and source[pos] in " \t\f":
            pos += 1
        self.pos = pos