python/moz/l10n/formats/xliff/parse.py (166 lines of code) (raw):
# Copyright Mozilla Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from collections import defaultdict
from collections.abc import Iterator
from re import compile
from typing import List, Union, cast
from lxml import etree
from ...model import (
Comment,
Entry,
Expression,
Message,
Metadata,
PatternMessage,
Resource,
Section,
)
from .. import Format
from .common import (
attrib_as_metadata,
element_as_metadata,
pretty_name,
xcode_tool_id,
xliff_ns,
)
from .parse_trans_unit import parse_pattern, parse_trans_unit
from .parse_xcode import parse_xliff_stringsdict
def xliff_parse(source: str | bytes) -> Resource[Message]:
    """
    Parse an XLIFF 1.2 file into a message resource.

    Sections identify files and groups within them,
    with the first identifier part parsed as the <file> "original" attribute,
    and later parts as <group> "id" attributes.

    An entry's value represents the <target> of a <trans-unit>,
    and its comment the first <note>.
    Other elements and attributes are represented by metadata.

    Metadata keys encode XML element data, using XPath expressions as keys.

    Raises ValueError if the <xliff> version, default namespace or root tag
    is not supported, if non-whitespace text is found directly inside
    <xliff>, <file> or <body>, if a <file> has no "original" attribute,
    has no <body> or more than one <body>, or if <file> or <body> contain
    an unsupported child element.
    """
    root = etree.fromstring(source.encode() if isinstance(source, str) else source)
    version = root.attrib.get("version", None)
    if version not in ("1.0", "1.1", "1.2"):
        raise ValueError(f"Unsupported <xliff> version: {version}")

    # From here on, `ns` is either empty or the "{uri}" prefix
    # that lxml puts in front of namespaced tag names.
    ns = root.nsmap.get(None, "")
    if ns:
        if ns in xliff_ns:
            ns = f"{{{ns}}}"
        else:
            raise ValueError(f"Unsupported namespace: {ns}")

    if root.tag != f"{ns}xliff":
        raise ValueError(f"Unsupported root node: {root}")
    if root.text and not root.text.isspace():
        raise ValueError(f"Unexpected text in <xliff>: {root.text}")

    res: Resource[Message] = Resource(Format.xliff, [])

    # Comments before the root element become the resource comment.
    # Preceding siblings are iterated nearest-first, so restore document order.
    root_comments = [
        c.text for c in root.itersiblings(etree.Comment, preceding=True) if c.text
    ]
    if root_comments:
        root_comments.reverse()
        res.comment = comment_str(root_comments)

    res.meta = attrib_as_metadata(root)
    for key, uri in root.nsmap.items():
        res.meta.append(Metadata(f"@xmlns:{key}" if key else "@xmlns", uri))

    # Comments seen since the previous <file>; attached to the next <file> section.
    comment: list[str] = []
    for file in root:
        if file.tail and not file.tail.isspace():
            raise ValueError(f"Unexpected text in <xliff>: {file.tail}")
        if isinstance(file, etree._Comment):
            comment.append(file.text)
        elif file.tag == f"{ns}file":
            file_name = file.attrib.get("original", None)
            if file_name is None:
                raise ValueError(f'Missing "original" attribute for <file>: {file}')
            meta = attrib_as_metadata(file, None, ("original",))
            entries: list[Entry[Message] | Comment] = []
            body = None
            for child in file:
                if isinstance(child, etree._Comment):
                    entries.append(Comment(comment_str(child.text)))
                elif child.tag == f"{ns}header":
                    meta += element_as_metadata(child, "header", True)
                elif child.tag == f"{ns}body":
                    # Compare against None explicitly: the truth value of an
                    # lxml element reflects whether it has children, so an
                    # empty <body/> would otherwise hide a later duplicate.
                    if body is not None:
                        raise ValueError(f"Duplicate <body> in <file>: {file}")
                    body = child
                else:
                    raise ValueError(
                        f"Unsupported <{child.tag}> element in <file>: {file}"
                    )
                if child.tail and not child.tail.isspace():
                    raise ValueError(f"Unexpected text in <file>: {child.tail}")

            # The section shares the `entries` and `meta` lists,
            # so entries appended below end up in it.
            section = Section((file_name,), entries, meta=meta)
            if comment:
                section.comment = comment_str(comment)
                comment.clear()
            res.sections.append(section)

            if body is None:
                raise ValueError(f"Missing <body> in <file>: {file}")
            elif body.text and not body.text.isspace():
                raise ValueError(f"Unexpected text in <body>: {body.text}")

            is_xcode = xcode_tool_id in meta
            if is_xcode and file_name.endswith(".stringsdict"):
                plural_entries = parse_xliff_stringsdict(ns, body)
                # A None result falls through to the generic <body> parsing below.
                if plural_entries is not None:
                    entries += cast(
                        List[Union[Entry[Message], Comment]], plural_entries
                    )
                    continue

            for unit in body:
                if isinstance(unit, etree._Comment):
                    entries.append(Comment(comment_str(unit.text)))
                elif unit.tag == f"{ns}trans-unit":
                    entries.append(parse_trans_unit(unit, is_xcode))
                elif unit.tag == f"{ns}bin-unit":
                    entries.append(parse_bin_unit(unit))
                elif unit.tag == f"{ns}group":
                    # Each group, and each group nested in it, is its own section.
                    res.sections += parse_group(ns, [file_name], unit, is_xcode)
                else:
                    raise ValueError(
                        f"Unsupported <{unit.tag}> element in <body>: {body}"
                    )
                if unit.tail and not unit.tail.isspace():
                    raise ValueError(f"Unexpected text in <body>: {unit.tail}")
    return res
def xliff_parse_message(source: str, *, is_xcode: bool = False) -> PatternMessage:
    """
    Parse the contents of an XLIFF 1.2 <target> into a message.

    The source is wrapped in a <target> element before parsing,
    and the XML parser is configured not to resolve entities.

    Set `is_xcode=True` to parse XCode-style printf strings as variable references.
    """
    wrapped = f"<target>{source}</target>"
    target = etree.fromstring(wrapped, etree.XMLParser(resolve_entities=False))
    pattern = list(parse_pattern(target, is_xcode))
    return PatternMessage(pattern)
def parse_group(
    ns: str, parent: list[str], group: etree._Element, is_xcode: bool
) -> Iterator[Section[Message]]:
    """
    Yield a section for a <group>, followed by sections for its nested groups.

    The section identifier is the `parent` path extended with the group's
    "id" attribute, or an empty string if it has none. Comments, <trans-unit>
    and <bin-unit> children become entries of the section; any other child
    element is stored in the section metadata, with an `[n]` index added
    to its key from the second element of the same name onwards.

    Raises ValueError on non-whitespace text directly inside the <group>.
    """
    group_path = [*parent, group.attrib.get("id", "")]
    group_meta = attrib_as_metadata(group, None, ("id",))
    group_entries: list[Entry[Message] | Comment] = []

    leading_text = group.text
    if leading_text and not leading_text.isspace():
        raise ValueError(f"Unexpected text in <group>: {leading_text}")

    # The section is emitted before it is filled in, so that it precedes
    # the sections of nested groups and sections stay ordered by path.
    # Its entries and metadata lists keep being mutated below.
    yield Section(tuple(group_path), group_entries, meta=group_meta)

    trans_unit_tag = f"{ns}trans-unit"
    bin_unit_tag = f"{ns}bin-unit"
    group_tag = f"{ns}group"

    # Number of metadata elements seen so far, by pretty name.
    name_counts: dict[str, int] = defaultdict(int)
    for child in group:
        if isinstance(child, etree._Comment):
            group_entries.append(Comment(comment_str(child.text)))
        elif child.tag == trans_unit_tag:
            group_entries.append(parse_trans_unit(child, is_xcode))
        elif child.tag == bin_unit_tag:
            group_entries.append(parse_bin_unit(child))
        elif child.tag == group_tag:
            yield from parse_group(ns, group_path, child, is_xcode)
        else:
            name = pretty_name(child, child.tag)
            nth = name_counts[name] + 1
            if nth > 1:
                meta_key = f"{name}[{nth}]"
            else:
                meta_key = name
            group_meta.extend(element_as_metadata(child, meta_key, True))
            name_counts[name] = nth

        trailing_text = child.tail
        if trailing_text and not trailing_text.isspace():
            raise ValueError(f"Unexpected text in <group>: {trailing_text}")
def parse_bin_unit(unit: etree._Element) -> Entry[Message]:
    """
    Represent a <bin-unit> as an entry with a placeholder message.

    The entry is identified by the element's "id" attribute. Its other
    attributes and its contents are stored as metadata, and its value is
    a single expression carrying a `bin-unit` attribute.

    Raises ValueError if the element has no "id" attribute.
    """
    unit_id = unit.attrib.get("id")
    if unit_id is None:
        raise ValueError(f'Missing "id" attribute for <bin-unit>: {unit}')

    metadata = attrib_as_metadata(unit, None, ("id",))
    metadata.extend(element_as_metadata(unit, "", False))

    placeholder = Expression(None, attributes={"bin-unit": True})
    return Entry((unit_id,), PatternMessage([placeholder]), meta=metadata)
# Matches a multi-line comment in which every continuation line
# starts with a dash as part of its indent.
dash_indent = compile(r" .+(\n - .*)+ ")


def comment_str(body: list[str] | str) -> str:
    """
    Normalise the text of one or more XML comments into a single string.

    Empty comments are skipped. Each remaining comment has its lines
    stripped of surrounding whitespace and its leading and trailing
    newlines removed; comments matching `dash_indent` have the dash indent
    dropped instead. The results are joined with blank lines.
    """
    comments = [body] if isinstance(body, str) else body
    paragraphs = [
        # A dash is considered as a part of the indent if it's aligned
        # with the last dash of <!-- in a top-level comment.
        text.replace("\n - ", "\n").strip(" ")
        if dash_indent.fullmatch(text)
        else "\n".join(map(str.strip, text.splitlines())).strip("\n")
        for text in comments
        if text
    ]
    return "\n\n".join(paragraphs).strip("\n")