python/moz/l10n/formats/mf2/message_parser.py (373 lines of code) (raw):

# Copyright Mozilla Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Literal

from ...model import (
    CatchallKey,
    Expression,
    Markup,
    Message,
    Pattern,
    PatternMessage,
    SelectMessage,
    VariableRef,
)
from .validate import name_re, number_re

# Characters that may follow a backslash in pattern text and quoted literals.
esc_chars = {"\\", "{", "|", "}"}

# Bidirectional marks and isolates that are skipped around names and spaces.
bidi_chars = {
    "\u061c",  # ALM
    "\u200e",  # LRM
    "\u200f",  # RLM
    "\u2066",  # LRI
    "\u2067",  # RLI
    "\u2068",  # FSI
    "\u2069",  # PDI
}

# Characters treated as whitespace between syntax elements.
space_chars = {
    "\t",
    "\n",
    "\r",
    " ",
    "\u3000",  # ideographic space
}


def mf2_parse_message(source: bytes | str) -> Message:
    """
    Parse MF2 message syntax into a Message.

    A `bytes` source is decoded with the default `bytes.decode()` encoding
    by the parser before parsing.

    May raise `MF2ParseError`.
    """
    return MF2Parser(source).parse()


class MF2ParseError(ValueError):
    """
    Syntax or data model error found while parsing an MF2 message.

    The error message includes the source (with newlines shown as `¶`)
    and a caret on the following line marking the parser position.
    The position is also available as the `pos` attribute.
    """

    def __init__(self, parser: MF2Parser, message: str):
        # Newlines are replaced so that the source stays on a single line
        # and the caret on the next line aligns with the error position.
        src = parser.source.replace("\n", "¶")
        message += f"\n{src}\n{' ' * parser.pos}^"
        super().__init__(message)
        self.pos = parser.pos


class MF2Parser:
    """
    Recursive-descent parser for MF2 message syntax.

    The parser state is the `source` string and `pos`,
    the index of the next unread character.
    Parsing methods advance `pos` as they consume input
    and raise `MF2ParseError` on invalid input.
    """

    def __init__(self, source: bytes | str):
        self.source = source if isinstance(source, str) else source.decode()
        self.pos = 0

    def parse(self) -> Message:
        """
        Parse the whole source as a message.

        Raises `MF2ParseError` on invalid syntax,
        or if any content remains after the message.
        """
        ch = self.skip_opt_space()
        if ch == ".":
            message = self.complex_message()
        elif self.source.startswith("{{", self.pos):
            message = PatternMessage(self.quoted_pattern())
        else:
            # In a simple message, leading whitespace is a part of the pattern.
            self.pos = 0
            message = PatternMessage(self.pattern())
        if self.pos != len(self.source):
            raise MF2ParseError(self, "Extra content at message end")
        return message

    def complex_message(self) -> Message:
        """
        Parse a message starting with `.input`, `.local`, or `.match`.

        Returns a `SelectMessage` if the declarations are followed by `.match`,
        or a `PatternMessage` if they are followed by a quoted pattern.
        """
        assert self.char() == "."
        declarations: dict[str, Expression] = {}
        # Names that may no longer be declared: the already declared ones,
        # and ones referenced by an earlier or the current declaration.
        declared: set[str] = set()
        while keyword := self.source[self.pos : self.pos + 6]:
            if keyword == ".input":
                name, expr = self.input_declaration()
            elif keyword == ".local":
                name, expr = self.local_declaration()
                if isinstance(expr.arg, VariableRef):
                    declared.add(expr.arg.name)
            else:
                break
            if expr.function:
                for opt_value in expr.options.values():
                    if isinstance(opt_value, VariableRef):
                        declared.add(opt_value.name)
            if name in declared:
                raise MF2ParseError(self, f"Duplicate declaration for ${name}")
            declarations[name] = expr
            declared.add(name)
            self.skip_opt_space()

        if keyword == ".match":
            selectors = self.match_statement()
            for sel in selectors:
                # Follow the chain of declarations from the selector
                # until one with a function annotation is found.
                sel_name = sel.name
                sel_expr = declarations.get(sel_name, None)
                while sel_expr is not None and sel_expr.function is None:
                    if (
                        isinstance(sel_expr.arg, VariableRef)
                        and sel_expr.arg.name != sel_name
                    ):
                        sel_name = sel_expr.arg.name
                        sel_expr = declarations.get(sel_name, None)
                    else:
                        sel_expr = None
                if sel_expr is None:
                    raise MF2ParseError(
                        self, f"Missing selector annotation for ${sel.name}"
                    )

            variants: dict[tuple[str | CatchallKey, ...], Pattern] = {}
            while self.pos < len(self.source):
                keys, pattern = self.variant(len(selectors))
                if keys in variants:
                    raise MF2ParseError(self, f"Duplicate variant with key ${keys}")
                variants[keys] = pattern
            fallback_key = (CatchallKey(),) * len(selectors)
            if fallback_key not in variants:
                raise MF2ParseError(self, "Missing fallback variant")
            return SelectMessage(declarations, selectors, variants)

        pattern = self.quoted_pattern()
        return PatternMessage(pattern, declarations)

    def input_declaration(self) -> tuple[str, Expression]:
        """
        Parse an `.input` declaration.

        Returns the name of the variable and its expression,
        which is required to have a variable reference as its argument.
        """
        assert self.source.startswith(".input", self.pos)
        self.pos += 6
        ch = self.skip_opt_space()
        self.expect("{", ch)
        expr = self.expression_or_markup()
        if not isinstance(expr, Expression) or not isinstance(expr.arg, VariableRef):
            raise MF2ParseError(self, "Variable argument required for .input")
        return expr.arg.name, expr

    def local_declaration(self) -> tuple[str, Expression]:
        """
        Parse a `.local` declaration.

        Returns the name of the declared variable and its value expression.
        The value may not be markup,
        and its argument may not refer to the variable being declared.
        """
        assert self.source.startswith(".local", self.pos)
        self.pos += 6
        if not self.req_space() or self.char() != "$":
            raise MF2ParseError(self, "Expected $ with leading space")
        name = self.name(1)
        ch = self.skip_opt_space()
        self.expect("=", ch)
        ch = self.skip_opt_space()
        self.expect("{", ch)
        expr = self.expression_or_markup()
        if not isinstance(expr, Expression):
            raise MF2ParseError(self, "Markup is not a valid .local value")
        if isinstance(expr.arg, VariableRef) and expr.arg.name == name:
            raise MF2ParseError(self, "A .local declaration cannot be self-referential")
        return name, expr

    def match_statement(self) -> tuple[VariableRef, ...]:
        """
        Parse the `.match` keyword and its selector variable references.

        At least one selector is required,
        and the selectors must be followed by whitespace.
        """
        assert self.source.startswith(".match", self.pos)
        self.pos += 6
        names: list[str] = []
        while (has_space := self.req_space()) and self.char() == "$":
            names.append(self.name(1))
        if not names:
            raise MF2ParseError(
                self, "At least one variable reference is required for .match"
            )
        if not has_space:
            raise MF2ParseError(self, "Expected space")
        return tuple(VariableRef(name) for name in names)

    def variant(self, num_sel: int) -> tuple[tuple[str | CatchallKey, ...], Pattern]:
        """
        Parse one variant: its whitespace-separated keys and its quoted pattern.

        Raises `MF2ParseError` if the number of keys does not match `num_sel`.
        """
        keys: list[str | CatchallKey] = []
        ch = self.char()
        while ch != "{" and ch != "":
            if ch == "*":
                keys.append(CatchallKey())
                self.pos += 1
            else:
                keys.append(self.literal())
            if not self.req_space():
                break
            ch = self.char()
        if len(keys) != num_sel:
            raise MF2ParseError(
                self,
                f"Variant key mismatch, expected {num_sel} but found {len(keys)}",
            )
        return tuple(keys), self.quoted_pattern()

    def quoted_pattern(self) -> Pattern:
        """
        Parse a pattern wrapped in `{{` and `}}`.

        Any whitespace after the closing `}}` is also consumed.
        """
        if not self.source.startswith("{{", self.pos):
            raise MF2ParseError(self, "Expected {{")
        self.pos += 2
        pattern = self.pattern()
        if not self.source.startswith("}}", self.pos):
            raise MF2ParseError(self, "Expected }}")
        self.pos += 2
        self.skip_opt_space()
        return pattern

    def pattern(self) -> Pattern:
        """
        Parse a sequence of text and placeholders.

        Stops at the end of the source or at a `}`, which is not consumed.
        """
        pattern: Pattern = []
        ch = self.char()
        while ch != "" and ch != "}":
            if ch == "{":
                self.pos += 1
                pattern.append(self.expression_or_markup())
            else:
                pattern.append(self.text())
            ch = self.char()
        return pattern

    def text(self) -> str:
        """
        Parse pattern text up to the next unescaped `{` or `}`,
        or the end of the source.

        Returns the text with its escapes resolved.
        Raises `MF2ParseError` on a NUL character, an invalid escape,
        or a backslash at the end of the source.
        """
        chars: list[str] = []
        at_esc = False
        for ch in self.source[self.pos :]:
            if at_esc:
                if ch not in esc_chars:
                    raise MF2ParseError(self, f"Invalid escape: \\{ch}")
                chars.append(ch)
                at_esc = False
            elif ch == "\x00":
                raise MF2ParseError(self, "NUL character is not allowed")
            elif ch == "\\":
                at_esc = True
            elif ch == "{" or ch == "}":
                break
            else:
                chars.append(ch)
            self.pos += 1
        if at_esc:
            # The source ended right after a backslash;
            # without this check, the backslash would be silently dropped.
            raise MF2ParseError(self, "Incomplete escape at end of input")
        return "".join(chars)

    def expression_or_markup(self) -> Expression | Markup:
        """
        Parse the contents of a placeholder, including its closing `}`.

        Expects the opening `{` to have been consumed by the caller.
        """
        ch = self.skip_opt_space()
        value: Expression | Markup = (
            self.markup_body(ch) if ch == "#" or ch == "/" else self.expression_body(ch)
        )
        self.expect("}")
        return value

    def expression_body(self, ch: str) -> Expression:
        """
        Parse an expression: an optional variable or literal argument,
        an optional function annotation with options, and any attributes.

        `ch` is the character at the current position.
        """
        arg: str | VariableRef | None = None
        arg_end = self.pos
        if ch == "$":
            arg = self.variable()
            arg_end = self.pos
            ch = self.skip_opt_space()
        elif ch != ":":
            arg = self.literal()
            arg_end = self.pos
            ch = self.skip_opt_space()

        if ch == ":":
            # An argument and a function must be separated by whitespace.
            # Compare with None, as an empty literal `||` is a falsy argument.
            if arg is not None and self.pos == arg_end:
                raise MF2ParseError(self, "Expected space")
            function = self.identifier(1)
            options = self.options()
        else:
            function = None
            options = {}
            # Rewind to the end of the argument,
            # so that attributes() sees the whitespace that it requires.
            self.pos = arg_end
        attributes = self.attributes()
        self.skip_opt_space()
        return Expression(arg, function, options, attributes)

    def markup_body(self, ch: str) -> Markup:
        """
        Parse markup: `#` or `/`, an identifier, options, and attributes.

        `ch` is the character at the current position.
        Open markup with a trailing `/` is standalone markup.
        """
        kind: Literal["open", "standalone", "close"]
        if ch == "#":
            kind = "open"
        elif ch == "/":
            kind = "close"
        else:
            raise MF2ParseError(self, "Expected # or /")
        markup_name = self.identifier(1)
        options = self.options()
        attributes = self.attributes()
        ch = self.skip_opt_space()
        if ch == "/":
            if kind == "open":
                kind = "standalone"
            else:
                raise MF2ParseError(self, "Expected }")
            self.pos += 1
        return Markup(kind, markup_name, options, attributes)

    def options(self) -> dict[str, str | VariableRef]:
        """
        Parse zero or more `name=value` options, each preceded by whitespace.

        Whitespace that is not followed by an option is not consumed.
        Raises `MF2ParseError` on a duplicate option name.
        """
        options: dict[str, str | VariableRef] = {}
        opt_end = self.pos
        while self.req_space():
            ch = self.char()
            if ch in ("", "@", "/", "}"):
                # Not an option; leave the whitespace for the caller.
                self.pos = opt_end
                break
            opt_name = self.identifier(0)
            if opt_name in options:
                raise MF2ParseError(self, f"Duplicate option name {opt_name}")
            self.expect("=", self.skip_opt_space())
            ch = self.skip_opt_space()
            options[opt_name] = self.variable() if ch == "$" else self.literal()
            opt_end = self.pos
        return options

    def attributes(self) -> dict[str, str | Literal[True]]:
        """
        Parse zero or more `@name` or `@name=literal` attributes,
        each preceded by whitespace.

        An attribute with no value is given the value `True`.
        Whitespace that is not followed by an attribute is not consumed.
        Raises `MF2ParseError` on a duplicate attribute name.
        """
        attributes: dict[str, str | Literal[True]] = {}
        attr_end = self.pos
        while self.req_space():
            if self.char() != "@":
                # Not an attribute; leave the whitespace for the caller.
                self.pos = attr_end
                break
            attr_name = self.identifier(1)
            name_end = self.pos
            if attr_name in attributes:
                raise MF2ParseError(self, f"Duplicate attribute name {attr_name}")
            if self.skip_opt_space() == "=":
                self.pos += 1
                self.skip_opt_space()
                attributes[attr_name] = self.literal()
            else:
                # No value; rewind past the whitespace that was skipped
                # while looking for the `=`.
                self.pos = name_end
                attributes[attr_name] = True
            attr_end = self.pos
        return attributes

    def variable(self) -> VariableRef:
        """Parse a `$name` variable reference."""
        assert self.char() == "$"
        name = self.name(1)
        return VariableRef(name)

    def literal(self) -> str:
        """Parse a quoted literal if at a `|`, or otherwise an unquoted one."""
        return self.quoted_literal() if self.char() == "|" else self.unquoted_literal()

    def quoted_literal(self) -> str:
        """
        Parse a literal wrapped in `|` characters.

        Returns the contents with escapes resolved.
        Raises `MF2ParseError` on a NUL character, an invalid escape,
        or if the source ends before the closing `|`.
        """
        assert self.char() == "|"
        self.pos += 1
        chars: list[str] = []
        at_esc = False
        for ch in self.source[self.pos :]:
            self.pos += 1
            if at_esc:
                if ch not in esc_chars:
                    raise MF2ParseError(self, f"Invalid escape: \\{ch}")
                chars.append(ch)
                at_esc = False
            elif ch == "\x00":
                raise MF2ParseError(self, "NUL character is not allowed")
            elif ch == "\\":
                at_esc = True
            elif ch == "|":
                return "".join(chars)
            else:
                chars.append(ch)
        raise MF2ParseError(self, "Expected |")

    def unquoted_literal(self) -> str:
        """
        Parse a literal matching `number_re` or `name_re`
        at the current position.
        """
        match = number_re.match(self.source, self.pos) or name_re.match(
            self.source, self.pos
        )
        if match is None:
            raise MF2ParseError(self, "Invalid name or number")
        self.pos = match.end()
        return match[0]

    def identifier(self, offset: int) -> str:
        """
        Parse a name with an optional `namespace:` prefix.

        `offset` is the number of characters to skip before the identifier.
        """
        ns = self.name(offset)
        if self.char() != ":":
            return ns
        name = self.name(1)
        return f"{ns}:{name}"

    def name(self, offset: int) -> str:
        """
        Parse a name matching `name_re`,
        skipping bidi characters on either side of it.

        `offset` is the number of characters to skip before the name.
        """
        self.pos += offset
        self.skip_bidi()
        match = name_re.match(self.source, self.pos)
        if match is None:
            raise MF2ParseError(self, "Invalid name")
        self.pos = match.end()
        self.skip_bidi()
        return match[0]

    def req_space(self) -> bool:
        """
        Consume whitespace that includes at least one space character.

        Bidi characters may precede and follow the first space character.
        Returns `False` without consuming anything
        if no space character is found.
        """
        start = self.pos
        ch = self.skip_bidi()
        if ch not in space_chars:
            self.pos = start
            return False
        while ch in space_chars or ch in bidi_chars:
            self.pos += 1
            ch = self.char()
        return True

    def skip_opt_space(self) -> str:
        """
        Consume any space and bidi characters.

        Returns the character at the resulting position,
        or an empty string at the end of the source.
        """
        ch = self.char()
        while ch in space_chars or ch in bidi_chars:
            self.pos += 1
            ch = self.char()
        return ch

    def skip_bidi(self) -> str:
        """
        Consume any bidirectional marks and isolates.

        Returns the character at the resulting position,
        or an empty string at the end of the source.
        """
        ch = self.char()
        while ch in bidi_chars:
            self.pos += 1
            ch = self.char()
        return ch

    def expect(self, exp: str, char: str = "") -> None:
        """
        Consume the single character `exp`, or raise `MF2ParseError`.

        If `char` is given and not empty, it is compared with `exp`
        instead of the character at the current position.
        """
        if (char or self.char()) != exp:
            raise MF2ParseError(self, f"Expected {exp}")
        self.pos += 1

    def char(self) -> str:
        """
        Return the character at the current position,
        or an empty string at the end of the source.
        """
        try:
            return self.source[self.pos]
        except IndexError:
            return ""