python/moz/l10n/formats/fluent/parse.py (345 lines of code) (raw):

# Copyright Mozilla Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import annotations from itertools import product from re import finditer from typing import Iterable, Tuple, cast from fluent.syntax import FluentParser from fluent.syntax import ast as ftl from ...model import ( CatchallKey, Comment, Entry, Expression, LinePos, Message, Metadata, Pattern, PatternMessage, Resource, Section, SelectMessage, VariableRef, ) from .. import Format def fluent_parse( source: bytes | str | ftl.Resource, *, with_linepos: bool = True ) -> Resource[Message]: """ Parse a .ftl file into a message resource. Message and term references are represented by `message` function annotations, with term identifiers prefixed with a `-`. Function names are lower-cased, so e.g. the Fluent `NUMBER` is `number` in the Resource. The parsed resource will not include any metadata. """ if isinstance(source, ftl.Resource): fluent_res = source lpm = None # Source is required for line positions else: source_str = source if isinstance(source, str) else source.decode("utf-8") fluent_res = FluentParser(with_spans=with_linepos).parse(source_str) lpm = LinePosMapper(source_str) if with_linepos else None entries: list[Entry[Message] | Comment] = [] section = Section((), entries) resource = Resource(Format.fluent, [section]) fluent_body = fluent_res.body if fluent_body and isinstance(fbc := fluent_body[0], ftl.Comment) and fbc.content: resource.meta.append(Metadata("info", fbc.content)) fluent_body = fluent_body[1:] for entry in fluent_body: if isinstance(entry, (ftl.Message, ftl.Term)): try: entries.extend(entries_iter(entry, lpm)) except Exception as err: raise ValueError(f"Error parsing message {entry.id.name}") from err elif isinstance(entry, ftl.ResourceComment): if entry.content: resource.comment = ( (resource.comment.rstrip() + "\n\n" + entry.content) if resource.comment else entry.content ) elif isinstance(entry, ftl.GroupComment): if entries or section.comment: entries = [] section = Section((), entries, comment=entry.content or "") if lpm and entry.span: span = entry.span section.linepos = lpm.get_linepos( span.start, span.start, span.start, span.end ) resource.sections.append(section) else: section.comment = entry.content or "" if lpm and entry.span: span = entry.span section.linepos = lpm.get_linepos( span.start, span.start, span.start, span.end ) elif isinstance(entry, ftl.Comment): if entry.content: comment = Comment(entry.content) if lpm and entry.span: span = entry.span comment.linepos = lpm.get_linepos( span.start, span.start, span.start, span.end ) entries.append(comment) else: # Junk try: message = entry.annotations[0].message except Exception: message = "" if not message: message = "Fluent parser error" if entries: prev_entry = next( (entry for entry in reversed(entries) if isinstance(entry, Entry)), None, ) if prev_entry: message += f" after message {'.'.join(prev_entry.id)}" prev_lp = entries[-1].linepos if prev_lp: message += f" at line {prev_lp.end}" raise ValueError(message) return resource def fluent_parse_messages( source: str | ftl.Message | ftl.Term, *, with_linepos: bool = True ) -> list[Entry[Message]]: """ Parse a Fluent message or term into a list of Entries. Message and term references are represented by `message` function annotations, with term identifiers prefixed with a `-`. Function names are lower-cased, so e.g. the Fluent `NUMBER` is `number` in the result. """ if isinstance(source, str): fluent_entry = FluentParser(with_spans=with_linepos).parse_entry(source) lpm = LinePosMapper(source) if with_linepos else None else: fluent_entry = source lpm = None # Source is required for line positions if not isinstance(fluent_entry, (ftl.Message, ftl.Term)): raise ValueError("Source is not a Fluent entry") return list(entries_iter(fluent_entry, lpm)) def entries_iter( ftl_entry: ftl.Message | ftl.Term, lpm: LinePosMapper | None, ) -> Iterable[Entry[Message]]: id = ftl_entry.id.name if isinstance(ftl_entry, ftl.Term): id = "-" + id comment = ftl_entry.comment.content or "" if ftl_entry.comment else "" if ftl_entry.value: entry = Entry(id=(id,), value=message(ftl_entry.value), comment=comment) if lpm and ftl_entry.span and ftl_entry.value.span: v_span = ftl_entry.value.span c_span = ( ftl_entry.comment.span if comment and ftl_entry.comment and ftl_entry.comment.span else ftl_entry.span ) k_span = ftl_entry.id.span or ftl_entry.span entry.linepos = lpm.get_linepos( c_span.start, k_span.start, v_span.start, v_span.end ) yield entry if comment: comment = "" for attr in ftl_entry.attributes: entry = Entry(id=(id, attr.id.name), value=message(attr.value), comment=comment) if lpm and attr.span: span = attr.span c_span = ( ftl_entry.comment.span if comment and ftl_entry.comment and ftl_entry.comment.span else span ) k_span = attr.id.span or span v_span = attr.value.span or span entry.linepos = lpm.get_linepos( c_span.start, k_span.start, v_span.start, span.end ) yield entry if comment: comment = "" def message(ftl_pattern: ftl.Pattern) -> Message: sel_data = find_selectors(ftl_pattern, []) sel_expressions = [sd[0] for sd in sel_data] filter: list[Key | None] = [None] * len(sel_expressions) msg_variants: dict[tuple[Key, ...], Pattern] var_names: set[str] = set() if sel_expressions: key_lists = [list(dict.fromkeys(sd[2])) for sd in sel_data] for keys in key_lists: keys.sort(key=lambda k: (k[2], not k[1])) msg_variants = {key: [] for key in product(*key_lists)} else: msg_variants = {(): []} def add_pattern(ftl_pattern: ftl.Pattern) -> None: el: ( ftl.TextElement | ftl.Placeable | ftl.InlineExpression | ftl.SelectExpression ) for el in ftl_pattern.elements: while isinstance(el, ftl.Placeable): el = el.expression if isinstance(el, ftl.SelectExpression): msg_sel = next(sd[0] for sd in sel_data if el.selector in sd[1]) idx = sel_expressions.index(msg_sel) prev_filt = filter[idx] for v in el.variants: filter[idx] = variant_key(v) add_pattern(v.value) filter[idx] = prev_filt else: for keys, msg_pattern in msg_variants.items(): if all( (filt is None or key == filt) for key, filt in zip(keys, filter) ): if isinstance(el, ftl.TextElement): if msg_pattern and isinstance(msg_pattern[-1], str): msg_pattern[-1] += el.value else: msg_pattern.append(el.value) else: expr = inline_expression(el) if isinstance(expr.arg, VariableRef): var_names.add(expr.arg.name) msg_pattern.append(expr) add_pattern(ftl_pattern) if sel_expressions: declarations = {} selectors = [] for expr in sel_expressions: stem = expr.arg.name if isinstance(expr.arg, VariableRef) else "" i = 0 name = stem while name in var_names or name == "": i += 1 name = f"{stem}_{i}" declarations[name] = expr selectors.append(VariableRef(name)) var_names.add(name) variants = { tuple(map(message_key, keys)): msg_pattern for keys, msg_pattern in msg_variants.items() if msg_pattern } return SelectMessage(declarations, tuple(selectors), variants) else: return PatternMessage(next(iter(msg_variants.values()))) Key = Tuple[str, bool, bool] "(name, is_numeric, is_default)" def variant_key(v: ftl.Variant) -> Key: name: str is_numeric: bool if isinstance(v.key, ftl.Identifier): name = v.key.name is_numeric = False else: name = v.key.value is_numeric = True return (name, is_numeric, v.default) def message_key(key: Key) -> str | CatchallKey: name, _, is_default = key return CatchallKey(name) if is_default else name def find_selectors( pattern: ftl.Pattern, result: list[tuple[Expression, list[ftl.InlineExpression], list[Key]]], ) -> list[tuple[Expression, list[ftl.InlineExpression], list[Key]]]: for el in pattern.elements: if isinstance(el, ftl.Placeable) and isinstance( el.expression, ftl.SelectExpression ): ftl_sel = el.expression.selector keys = [variant_key(v) for v in el.expression.variants] msg_sel = select_expression(ftl_sel, keys) prev = next((x for x in result if x[0] == msg_sel), None) if prev: _, ftl_list, key_list = prev ftl_list.append(ftl_sel) key_list += keys else: result.append((msg_sel, [ftl_sel], keys)) for v in el.expression.variants: find_selectors(v.value, result) return result def select_expression(ftl_sel: ftl.InlineExpression, keys: list[Key]) -> Expression: plural_categories = ("zero", "one", "two", "few", "many", "other") if isinstance(ftl_sel, ftl.VariableReference): name = ( "number" if all( is_numeric or name in plural_categories for name, is_numeric, _ in keys ) else "string" ) return Expression(VariableRef(ftl_sel.id.name), name) elif isinstance(ftl_sel, ftl.StringLiteral): return Expression(literal_value(ftl_sel), "string") else: return inline_expression(ftl_sel) def inline_expression(exp: ftl.InlineExpression) -> Expression: if isinstance(exp, ftl.NumberLiteral): value = exp.value return Expression(value, "number") elif isinstance(exp, ftl.StringLiteral): value = exp.parse().get("value") or "" return Expression(value) elif isinstance(exp, ftl.MessageReference): name = exp.id.name if exp.attribute is not None: name += "." + exp.attribute.name return Expression(name, "message") elif isinstance(exp, ftl.TermReference): name = "-" + exp.id.name if exp.attribute is not None: name += "." + exp.attribute.name ftl_named = exp.arguments.named if exp.arguments else [] return Expression( name, "message", {opt.name.name: literal_value(opt.value) for opt in ftl_named}, ) elif isinstance(exp, ftl.VariableReference): name = exp.id.name return Expression(VariableRef(name)) else: # ftl.FunctionReference name = exp.id.name.lower() if len(exp.arguments.positional) > 1: raise ValueError( f"Functions with more than one positional argument are not supported: {name}" ) ftl_arg: ftl.Placeable | ftl.InlineExpression | None = next( iter(exp.arguments.positional), None ) while isinstance(ftl_arg, ftl.Placeable): ftl_arg = cast(ftl.InlineExpression, ftl_arg.expression) arg: str | VariableRef | None if not ftl_arg: arg = None elif isinstance(ftl_arg, ftl.NumberLiteral) or isinstance( ftl_arg, ftl.StringLiteral ): arg = literal_value(ftl_arg) elif isinstance(ftl_arg, ftl.VariableReference): arg = VariableRef(ftl_arg.id.name) else: raise ValueError(f"Unexpected value: {ftl_arg}") ftl_named = exp.arguments.named return Expression( arg, name, {opt.name.name: literal_value(opt.value) for opt in ftl_named}, ) def literal_value(arg: ftl.NumberLiteral | ftl.StringLiteral) -> str: return ( arg.value if isinstance(arg, ftl.NumberLiteral) else arg.parse().get("value") or "" ) class LinePosMapper: def __init__(self, src: str) -> None: self._len = len(src) self._newlines = [m.start() for m in finditer("\n", src)] def _get_line(self, char_idx: int) -> int: # Treat the end of the string as a newline. if not self._newlines and char_idx >= self._len: return 2 return next( (idx + 1 for idx, nl in enumerate(self._newlines) if nl > char_idx), len(self._newlines) + 1, ) def get_linepos(self, start: int, key: int, value: int, end: int) -> LinePos: start_line = self._get_line(start) key_line = start_line if key == start else self._get_line(key) value_line = key_line if value == key else self._get_line(value) end_line = self._get_line(end) return LinePos(start_line, key_line, value_line, end_line)