python/moz/l10n/formats/android/parse.py (379 lines of code) (raw):

# Copyright Mozilla Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import annotations import logging from collections.abc import Callable, Iterable, Iterator from re import ASCII, compile, match from typing import Literal from lxml import etree from ...model import ( CatchallKey, Comment, Entry, Expression, Markup, Message, Metadata, PatternMessage, Resource, Section, SelectMessage, VariableRef, ) from .. import Format log = logging.getLogger(__name__) plural_categories = ("zero", "one", "two", "few", "many", "other") xliff_ns = "urn:oasis:names:tc:xliff:document:1.2" xliff_g = f"{{{xliff_ns}}}g" # Exclude : for compatibility with MF2 xml_name_start = r"A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF" xml_name_rest = r".0-9\xB7\u0300-\u036F\u203F-\u2040-" xml_name = compile(f"[{xml_name_start}][{xml_name_start}{xml_name_rest}]*") # Android string resources contain four different kinds of localizable values: # # - HTML entity declarations, # which will be inserted into other strings during XML parsing. # - Strings with printf-style variables, # which also use "quotes" for special escaping behaviour. # These may include HTML as escaped string contents, # which will require fromHtml(String) processing # after being initially formatted with getString(int, Object...) # - Strings with HTML contents, which can't include variables, # and are generally used via setText(java.lang.CharSequence). # - Strings with ICU MessageFormat contents. # These also use "quotes" for special escaping behaviour. # ICU MessageFormat strings are not currently detected by this library. # # The source contents of each of the above needs to be parsed differently, # and message strings can be found in <string>, <string-array>, and <plurals> # elements, each of which also needs different parsing. # # For more information, see: # https://developer.android.com/guide/topics/resources/string-resource # https://developer.android.com/guide/topics/resources/localization#mark-message-parts def android_parse( source: str | bytes, *, ascii_spaces: bool = False, literal_quotes: bool = False ) -> Resource[Message]: """ Parse an Android strings XML file into a message resource. If any internal DOCTYPE entities are declared, they are included as messages in an "!ENTITY" section. Resource and entry attributes are parsed as metadata. All XML, Android, and printf escapes are unescaped except for %n, which has a platform-dependent meaning. Whitespace in messages is normalized. If `ascii_spaces` is set, this only applies to ASCII/Latin-1 space characters. With `literal_quotes`, all " double-quote characters within strings are treated as literal characters, rather than as delimiters for whitespace preservation. Spans of text and entities wrapped in an <xliff:g> will be parsed as expressions with a "translate": "no" attribute. Spans including elements will be wrapped with open/close markup with a similar attribute. """ parser = etree.XMLParser(resolve_entities=False) root = etree.fromstring( source.encode() if isinstance(source, str) else source, parser ) if root.tag != "resources": raise ValueError(f"Unsupported root node: {root}") if root.text and not root.text.isspace(): log.warning(f"Unexpected text in resource: {root.text}") res: Resource[Message] = Resource(Format.android, [Section((), [])]) root_comments = [c.text for c in root.itersiblings(etree.Comment, preceding=True)] if root_comments: root_comments.reverse() res.comment = comment_str(root_comments) res.meta = [Metadata(k, v) for k, v in root.attrib.items()] for ns, url in root.nsmap.items(): res.meta.append(Metadata(f"xmlns:{ns}" if ns else "xmlns", url)) entries = res.sections[0].entries dtd = root.getroottree().docinfo.internalDTD if dtd: entities: list[Entry[Message] | Comment] = [] for entity in dtd.iterentities(): name = entity.name if not name: raise ValueError(f"Unnamed entity: {entity}") value: Message = PatternMessage(list(parse_entity_value(entity.content))) entities.append(Entry((name,), value)) if entities: res.sections.insert(0, Section(("!ENTITY",), entities)) comment: list[str | None] = [] # TODO: should be list[str] for el in root: if isinstance(el, etree._Comment): comment.append(el.text) if el.tail and el.tail.count("\n") > 1 and comment: entries.append(Comment(comment_str(comment))) comment.clear() else: name = el.attrib.get("name", None) if not name: raise ValueError(f"Unnamed {el.tag} entry: {el}") meta = [Metadata(k, v) for k, v in el.attrib.items() if k != "name"] if el.tag == "string": value = PatternMessage( list(parse_pattern(el, ascii_spaces, literal_quotes)) ) entries.append(Entry((name,), value, comment_str(comment), meta)) elif el.tag == "plurals": if el.text and not el.text.isspace(): log.warning(f"Unexpected text in {name} plurals: {el.text}") else: value = parse_plurals( name, el, ascii_spaces, literal_quotes, comment.extend ) entries.append(Entry((name,), value, comment_str(comment), meta)) elif el.tag == "string-array": if el.text and not el.text.isspace(): log.warning(f"Unexpected text in {name} string-array: {el.text}") idx = 0 for item in el: if isinstance(item, etree._Comment): comment.append(item.text) elif item.tag == "item": value = PatternMessage( list(parse_pattern(item, ascii_spaces, literal_quotes)) ) ic = comment_str(comment) entries.append(Entry((name, str(idx)), value, ic, meta[:])) comment.clear() idx += 1 else: cs = etree.tostring(item, encoding="unicode") raise ValueError(f"Unsupported {name} string-array child: {cs}") if item.tail and not item.tail.isspace(): log.warning( f"Unexpected text in {name} string-array: {item.tail}" ) else: es = etree.tostring(el, encoding="unicode") raise ValueError(f"Unsupported entry: {es}") if comment: comment.clear() if el.tail and not el.tail.isspace(): log.warning(f"Unexpected text in resource: {el.tail}") return res def android_parse_message( source: str, *, ascii_spaces: bool = False, literal_quotes: bool = False ) -> PatternMessage: """ Parse an Android strings XML message. All XML, Android, and printf escapes are unescaped except for %n, which has a platform-dependent meaning. Whitespace in messages is normalized. If `ascii_spaces` is set, this only applies to ASCII/Latin-1 space characters. With `literal_quotes`, all " double-quote characters within strings are treated as literal characters, rather than as delimiters for whitespace preservation. Spans of text and entities wrapped in an <xliff:g> will be parsed as expressions with a "translate": "no" attribute. Spans including elements will be wrapped with open/close markup with a similar attribute. Entity references are supported, but are not validated. """ parser = etree.XMLParser(resolve_entities=False) doctype = "" entities: list[str] = [] while True: try: el = etree.fromstring(f"{doctype}<string>{source}</string>", parser) break except etree.XMLSyntaxError as err: if err.code == etree.ErrorTypes.ERR_UNDECLARED_ENTITY: # type: ignore[attr-defined] m = match(r"Entity '([^'\s]+)' not defined", err.args[0]) if m is not None: entities.append(f'<!ENTITY {m[1]} "">') doctype = f"<!DOCTYPE string [{' '.join(entities)}]>" continue raise err return PatternMessage(list(parse_pattern(el, ascii_spaces, literal_quotes))) dash_indent = compile(r" .+(\n - .*)+ ") def comment_str(body: list[str | None]) -> str: lines: list[str] = [] for comment in body: if comment: if dash_indent.fullmatch(comment): # A dash is considered as a part of the indent if it's aligned # with the last dash of <!-- in a top-level comment. lines.append(comment.replace("\n - ", "\n").strip(" ")) else: lines.append( "\n".join(line.strip() for line in comment.splitlines()).strip("\n") ) return "\n\n".join(lines).strip("\n") entity_ref = compile(f"&({xml_name.pattern});") def parse_entity_value(src: str | None) -> Iterator[str | Expression]: if src: pos = 0 for m in entity_ref.finditer(src): start = m.start() if start > pos: yield src[pos:start] yield Expression(VariableRef(m[1]), "entity") pos = m.end() if pos < len(src): yield src[pos:] def parse_plurals( name: str, el: etree._Element, ascii_spaces: bool, literal_quotes: bool, add_comment: Callable[[Iterable[str | None]], None], ) -> SelectMessage: msg = SelectMessage( declarations={"quantity": Expression(VariableRef("quantity"), "number")}, selectors=(VariableRef("quantity"),), variants={}, ) var_comment: list[str | None] = [] for item in el: if isinstance(item, etree._Comment): var_comment.append(item.text) elif item.tag == "item": key = item.attrib.get("quantity", None) if key not in plural_categories: raise ValueError(f"Invalid quantity for {name} plurals item: {key}") if var_comment: add_comment( (f"{key}: {c}" for c in var_comment if c) if msg.variants else var_comment ) var_comment.clear() msg.variants[(CatchallKey(key) if key == "other" else key,)] = list( parse_pattern(item, ascii_spaces, literal_quotes) ) else: cs = etree.tostring(item, encoding="unicode") raise ValueError(f"Unsupported {name} plurals child: {cs}") if item.tail and not item.tail.isspace(): log.warning(f"Unexpected text in {name} plurals: {item.tail}") return msg resource_ref = compile(r"@(?:\w+:)?\w+/\w+|\?(?:\w+:)?(\w+/)?\w+") def parse_pattern( el: etree._Element, ascii_spaces: bool, literal_quotes: bool ) -> Iterator[str | Expression | Markup]: if len(el) == 0 and el.text and resource_ref.fullmatch(el.text): # https://developer.android.com/guide/topics/resources/providing-resources#ResourcesFromXml yield Expression(el.text, "reference") else: flat = flatten(el) spaced = parse_quotes(flat, ascii_spaces, literal_quotes) yield from parse_inline(spaced) def flatten(el: etree._Element) -> Iterator[str | Expression | Markup]: if el.text: yield el.text for child in el: if isinstance(child, etree._Entity): yield Expression(VariableRef(child.name), "entity") else: name = ( f"{child.prefix}:{etree.QName(child.tag).localname}" if child.prefix else child.tag ) if child.tag == xliff_g: body = list(flatten(child)) if any( ( isinstance(gc, Expression) and gc.attributes.get("translate", None) == "no" ) or isinstance(gc, Markup) for gc in body ): # Any <xliff:g> around elements needs to be rendered explicitly yield Markup("open", name, dict(child.attrib), {"translate": "no"}) yield from body yield Markup("close", name, attributes={"translate": "no"}) else: id = child.get("id", None) for gc in body: if isinstance(gc, str): options: dict[str, str | VariableRef] = dict(child.attrib) attr: dict[str, str | Literal[True]] = {"translate": "no"} arg: str | VariableRef | None if id: arg = VariableRef(get_var_name(id)) attr["source"] = gc elif gc.startswith(("%", "{")): arg = VariableRef(get_var_name(gc)) attr["source"] = gc else: arg = gc if options: yield Expression(arg, name, options, attributes=attr) else: yield Expression(arg, attributes=attr) else: gc.attributes["translate"] = "no" gc.options = dict(child.attrib) yield gc else: yield Markup("open", name, options=dict(child.attrib)) yield from flatten(child) yield Markup("close", name) if child.tail: yield child.tail double_quote = compile(r'(?<!\\)"') tag_like = compile(r"<.+>") def parse_quotes( iter: Iterator[str | Expression | Markup], ascii_spaces: bool, literal_quotes: bool, ) -> Iterator[str | Expression | Markup]: spaces = compile(r"\s+", ASCII if ascii_spaces else 0) stack: list[str | Expression] = [] def collapse_stack() -> Iterator[str | Expression | Markup]: yield '"' for part in stack: yield spaces.sub(" ", part) if isinstance(part, str) else part for part in iter: if isinstance(part, str): pos = 0 quoted = bool(stack) if not literal_quotes: for m in double_quote.finditer(part): if pos == 0 and tag_like.search(part) is not None: # Double quotes don't need escaping in CDATA sections, # but lxml doesn't tell us about them. # (see https://bugs.launchpad.net/lxml/+bug/2108853) # Let's presume that's the case if we see tag-like contents nearby. break prev = part[pos : m.start()] if quoted: if stack: yield from stack stack.clear() if prev: yield prev elif prev: yield spaces.sub(" ", prev) quoted = not quoted pos = m.end() last = part[pos:] if quoted: stack.append(last) elif last: yield spaces.sub(" ", last) elif stack: if ( isinstance(part, Markup) or part.attributes.get("translate", None) == "no" ): yield from collapse_stack() stack.clear() yield part else: # Expression stack.append(part) else: yield part if stack: yield from collapse_stack() inline_re = compile( r"\\u([0-9]{4})|" r"\\(.)|" r"(<[^%>]+>)|" r"(%(?:[1-9]\$)?[-#+ 0,(]?[0-9.]*([a-su-zA-SU-Z%]|[tT][a-zA-Z]))" ) def parse_inline( iter: Iterator[str | Expression | Markup], ) -> Iterator[str | Expression | Markup]: acc = "" for part in iter: if not isinstance(part, str): if acc: yield acc acc = "" yield part else: pos = 0 for m in inline_re.finditer(part): start = m.start() if start > pos: acc += part[pos:start] if m[1]: # Unicode escape acc += chr(int(m[1])) elif m[2]: # Escaped character c = m[2] acc += "\n" if c == "n" else "\t" if c == "t" else c elif m[3]: # Escaped HTML element, e.g. &lt;b> # HTML elements containing internal % formatting are not wrapped as literals if acc: yield acc acc = "" yield Expression(m[3], "html") else: if acc: yield acc acc = "" conversion = m[5] if conversion == "%": # Literal % yield Expression("%", attributes={"source": m[4]}) else: # Placeholder func: str | None # TODO post-py38: should be a match if conversion in {"b", "B"}: func = "boolean" elif conversion in {"c", "C", "s", "S"}: func = "string" elif conversion in {"d", "h", "H", "o", "x", "X"}: func = "integer" elif conversion in {"a", "A", "e", "E", "f", "g", "G"}: func = "number" else: c0 = conversion[0] func = "datetime" if c0 == "t" or c0 == "T" else None name = get_var_name(m[4]) yield Expression( VariableRef(name), func, attributes={"source": m[4]} ) pos = m.end() acc += part[pos:] if acc: yield acc printf = compile(r"%([1-9]\$)?") not_name_char = compile(f"[^{xml_name_start}{xml_name_rest}]") not_name_start = compile(f"[^{xml_name_start}]") def get_var_name(src: str) -> str: """Returns a valid MF2 name.""" pm = printf.match(src) if pm: return f"arg{pm[1][0]}" if pm[1] else "arg" name = not_name_char.sub("", src) if not_name_start.match(name): name = name[1:] return name or "arg"