library/scripts/string_pack.py (225 lines of code) (raw):
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the Apache 2.0 license found in
# the LICENSE file in the root directory of this source tree.
import collections
import logging
import os
import re
import sys
from xml.etree import ElementTree
# Maps Android plural quantity names to the one-byte IDs used in the pack format.
# This must be kept in sync with the `quantityIndex()` method in ParsedStringPack.java
_IDS_FOR_QUANTITY = {"other": 0, "zero": 1, "one": 2, "two": 3, "few": 4, "many": 5}
def normalize_locale(android_config_name):
    """Convert an Android resource-qualifier locale to "language-REGION/Script" form.

    Accepted inputs: "xx" (returned as-is), "xx-rYY" (-> "xx-YY"), and
    "b+xx+Zzzz" (-> "xx-Zzzz").

    Raises:
        NotImplementedError: for any other qualifier shape.
    """
    # Raw strings below: "\+" in a plain string literal is an invalid escape
    # sequence (SyntaxWarning since Python 3.12, slated to become an error).
    if re.match(r"^[a-z]{2}$", android_config_name):  # xx
        return android_config_name
    elif re.match(r"^[a-z]{2}-r[A-Z]{2}$", android_config_name):  # xx-rYY
        return android_config_name[:2] + "-" + android_config_name[-2:]
    elif re.match(r"^b\+[a-z]{2}\+[A-Z][a-z]{3}$", android_config_name):  # b+xx+Zzzz
        return android_config_name[2:4] + "-" + android_config_name[5:]
    else:
        raise NotImplementedError(
            "Unsupported locale qualifier: %r" % android_config_name
        )
def extract_locale_from_file_name(file_name):
    """Extract and normalize the locale qualifier from a ".../values-<q>/..." path."""
    sep = re.escape(os.path.sep)
    pattern = "{0}values-(.*){0}".format(sep)
    match = re.search(pattern, file_name)
    assert match is not None
    return normalize_locale(match.group(1))
def unescape(text):
    """Undo Android resource-string escaping; returns "" for None/empty input."""
    if not text:
        return ""
    quoted = len(text) >= 2 and text.startswith('"') and text.endswith('"')
    if quoted:
        # Quoted strings are taken verbatim, minus the surrounding quotes.
        return text[1:-1]
    return text.replace(r"\'", "'").replace(r"\"", '"').replace(r"\n", "\n")
class TreeBuilderWithComments(ElementTree.TreeBuilder):
    """Tree builder that preserves XML comments as elements tagged COMMENT_TAG."""

    COMMENT_TAG = "comment"

    def comment(self, data):
        # Comments containing the '@generated' marker were emitted by the
        # script itself and carry no translator metadata, so drop them.
        if "\u0040generated" in data:
            return
        self.start(self.COMMENT_TAG, {})
        self.data(data)
        self.end(self.COMMENT_TAG)
def read_string_dict(locale, file_name, id_finder, plural_handler):
    """Parse one strings.xml file into {int_id: text or {quantity_id: text}}.

    Strings whose name has no integer ID (per id_finder.get_id) are skipped
    with a warning on stderr. plural_handler(locale, comment, quantity) may
    return truthy to drop a plural quantity.
    """
    strings = {}
    try:
        parser = ElementTree.XMLParser(target=TreeBuilderWithComments())
        root = ElementTree.parse(file_name, parser=parser).getroot()
    except FileNotFoundError:
        # A missing file is fine: it simply contributes no strings.
        return strings
    last_comment = ""
    for element in root:
        if element.tag == TreeBuilderWithComments.COMMENT_TAG:
            # Remember the comment; it may carry plural metadata for the
            # elements that follow it.
            last_comment = element.text
            continue
        assert element.tag in ["string", "plurals"]
        name = element.attrib["name"]
        string_id = id_finder.get_id(name)
        if string_id is None:
            # The string was most probably removed but still lingers in the
            # translations (such strings will be cleaned up next time
            # move_strings_for_packing.py is run). Warn and skip.
            sys.stderr.write(
                "No ID found for '%s' while packing %s\n" % (name, file_name)
            )
            continue
        if element.tag == "string":
            strings[string_id] = unescape(element.text)
        else:  # plurals
            quantities = {}
            for item in element:
                assert item.tag == "item"
                quantity = item.attrib["quantity"]
                if not plural_handler(locale, last_comment, quantity):
                    quantities[_IDS_FOR_QUANTITY[quantity]] = unescape(item.text)
            strings[string_id] = quantities
    return strings
def blob_append_32_bit(blob, integer):
    """Append *integer* to *blob* as four little-endian bytes."""
    assert 0 <= integer < 2 ** 31
    blob += integer.to_bytes(4, "little")
def blob_append_16_bit(blob, integer):
    """Append *integer* to *blob* as two little-endian bytes."""
    assert 0 <= integer < 2 ** 15
    blob += integer.to_bytes(2, "little")
def blob_append_locale(blob, locale):
    """Append *locale* as exactly 7 ASCII bytes, zero-padded on the right."""
    # Valid shapes: "xx" (2), "xx-YY" (5), "xx-Zzzz" (7).
    assert len(locale) in [2, 5, 7]
    blob += locale.encode("ASCII").ljust(7, b"\0")
class StringBuffer(object):
    """A byte buffer holding all string data, de-duplicating shared bytes."""

    def __init__(self, encoding):
        self.encoding = encoding
        self.store = bytearray()

    def add(self, string_or_plural):
        """Add a string or a plural dict; return (start, len) or a dict of them."""
        if type(string_or_plural) is dict:  # Plural
            return {
                quantity_id: self.add_string(text)
                for quantity_id, text in string_or_plural.items()
            }
        return self.add_string(string_or_plural)

    def add_string(self, string):
        """Return (start, byte_length) of *string* in the store, appending if needed."""
        encoded = string.encode(encoding=self.encoding)
        size = len(encoded)
        if size == 0:  # The empty string needs no storage at all.
            return 0, 0
        found_at = self.store.find(encoded)
        if found_at != -1:
            # Already present somewhere in the buffer; reuse those bytes.
            return found_at, size
        # Not found. Before appending, see whether some prefix of the new
        # string already sits at the very end of the buffer; overlapping with
        # it saves a few bytes.
        overlap = bytearray(encoded[:-1])
        while overlap and not self.store.endswith(overlap):
            del overlap[-1]
        start = len(self.store) - len(overlap)
        self.store += encoded[len(overlap):]
        return start, size
class LocaleStore(object):
    """Per-locale tables mapping string IDs to (start, length) buffer locations."""

    def __init__(self):
        self.strings = {}
        self.plurals = {}

    def add_plural_or_string(self, id, plural_or_string):
        """Dispatch dicts to add_plural() and everything else to add_string()."""
        if type(plural_or_string) is dict:
            self.add_plural(id, plural_or_string)
        else:
            self.add_string(id, plural_or_string)

    def add_string(self, id, string):
        assert id not in self.strings
        self.strings[id] = string

    def add_plural(self, id, plural):
        assert id not in self.plurals
        self.plurals[id] = plural

    def get_binary_blob(self):
        """Serialize this locale's string and plural tables as bytes."""
        blob = bytearray()
        blob_append_16_bit(blob, len(self.strings))
        blob_append_16_bit(blob, len(self.plurals))
        # The parser in ParsedStringPack.java expects the string table sorted
        # by ID; IDs were inserted in sorted order, so insertion order is
        # already correct.
        for string_id, (start, length) in self.strings.items():
            blob_append_16_bit(blob, string_id)
            blob_append_32_bit(blob, start)
            blob_append_16_bit(blob, length)
        # The plural table follows: one variable-length record per ID.
        for plural_id, plural in self.plurals.items():
            blob_append_16_bit(blob, plural_id)
            blob.append(len(plural))  # Just one byte
            for quantity_id in sorted(plural):
                blob.append(quantity_id)  # Just one byte
                start, length = plural[quantity_id]
                blob_append_32_bit(blob, start)
                blob_append_16_bit(blob, length)
        return bytes(blob)
# Supported encodings for the shared string data, mapped to their one-byte IDs.
# Keep in sync with `ENCODINGS` in ParsedStringPack.java
_ENCODING_ID = {"UTF-8": 0, "UTF-16BE": 1}
# 2 bytes for number of locales, 4 bytes for starting index of locale data, 1 byte for the encoding
# of string data, and 4 bytes for starting index of the string data. Totalling 11 bytes.
_HEADER_SIZE = 11
# Each locale takes 11 bytes, right after the header. 7 bytes for the locale itself
# (see blob_append_locale), and 4 bytes for a pointer (offset) to where its table
# starts in the file.
_LOCALE_HEADER_SIZE = 11
class StringPack(object):
    """The full string pack: every locale's strings, plurals, and shared byte data."""

    def __init__(self, encoding):
        assert encoding in _ENCODING_ID
        self.encoding = encoding
        # locale -> {string_id: text or plural dict}
        self.store = collections.defaultdict(dict)

    def add_for_locale(self, locale, string_dict):
        """Merge string_dict into *locale*'s strings, warning on duplicate IDs."""
        locale_dict = self.store[locale]
        for key, value in string_dict.items():
            if key in locale_dict:
                logging.warning(
                    "Warning: id {} being overridden by:{}, previous value:{}".format(
                        key, value, locale_dict[key]
                    )
                )
            locale_dict[key] = value

    def compile(self):
        """Lay out header, locale headers, per-locale tables, and string data."""
        self.string_buffer = StringBuffer(encoding=self.encoding)
        locales = sorted(self.store.keys())
        self.locales_info = bytearray()
        self.locale_blobs = []
        next_blob_offset = 0
        for locale in locales:
            blob_append_locale(self.locales_info, locale)
            locale_store = LocaleStore()
            # Insert in sorted ID order: LocaleStore relies on it when serializing.
            for id in sorted(self.store[locale].keys()):
                locale_store.add_plural_or_string(
                    id, self.string_buffer.add(self.store[locale][id])
                )
            locale_blob = bytes(locale_store.get_binary_blob())
            blob_append_32_bit(self.locales_info, next_blob_offset)  # start
            next_blob_offset += len(locale_blob)
            self.locale_blobs.append(locale_blob)
        locale_headers_end = _HEADER_SIZE + len(locales) * _LOCALE_HEADER_SIZE
        self.header_blob = bytearray()
        blob_append_16_bit(self.header_blob, len(locales))  # Number of locales
        blob_append_32_bit(self.header_blob, locale_headers_end)  # Start of locale data
        self.header_blob.append(_ENCODING_ID[self.encoding])  # Just one byte
        # next_blob_offset now equals the total size of all locale blobs.
        blob_append_32_bit(
            self.header_blob, locale_headers_end + next_blob_offset
        )  # Start of string data

    def string_buffer_size(self):
        """Size in bytes of the shared string data (used to pick an encoding)."""
        return len(self.string_buffer.store)

    def write_to_file(self, pack_file_name):
        """Write the compiled pack sections to *pack_file_name* in order."""
        with open(pack_file_name, "wb") as pack_file:
            pack_file.write(self.header_blob)
            pack_file.write(self.locales_info)
            pack_file.writelines(self.locale_blobs)
            pack_file.write(self.string_buffer.store)
def build(input_file_names, output_file_name, id_finder, plural_handler):
    """Build the string pack from the given files and write it to a file.

    Both supported encodings are tried; the one producing the smaller string
    buffer is the one written out.
    """
    candidates = []
    for encoding in _ENCODING_ID:
        pack = StringPack(encoding=encoding)
        for input_file_name in input_file_names:
            locale = extract_locale_from_file_name(input_file_name)
            strings = read_string_dict(
                locale, input_file_name, id_finder, plural_handler
            )
            pack.add_for_locale(locale, strings)
        pack.compile()
        candidates.append(pack)
    smallest = min(candidates, key=lambda pack: pack.string_buffer_size())
    smallest.write_to_file(output_file_name)