in voxpopuli/get_lm_data.py [0:0]
def digit2text(text: str, lang: str) -> str:
    """Spell out the digits contained in a single token as words.

    The token is split into an optional non-digit prefix, a numeric core and
    an optional trailing part. The numeric core is verbalized with
    ``num2words``; the other parts are processed recursively and re-attached,
    separated by single spaces.

    Heuristics applied to the numeric core:

    * ``,`` and ``:`` are treated like ``.``.
    * Dot-separated groups where every group after the first has exactly
      three digits are read as one number (``12.000.000``); any other dotted
      form (dates, times, IP addresses, ...) is verbalized segment by segment.
    * A trailing ``st``/``nd``/``rd``/``th`` selects the ordinal form.

    Args:
        text: Token to normalize.
        lang: Language code passed to ``num2words``.

    Returns:
        ``text`` unchanged if it contains no decimal digit, otherwise the
        verbalized token. If ``num2words`` fails for a reason other than an
        unsupported language, the numeric part is replaced by an empty string
        and a warning is logged.

    Raises:
        AssertionError: If ``num2words`` raises ``NotImplementedError`` even
            for ``lang == "en"``, i.e. no fallback language is left.
    """
    # Nothing to do unless the token holds a decimal digit. ``\d`` is used
    # rather than ``str.isdigit`` because the latter also accepts characters
    # such as superscript/subscript digits that neither the patterns below nor
    # ``int()`` understand; such tokens used to come back with a stray
    # trailing space.
    if not re.search(r"\d", text):
        return text

    token = text.strip(" ")

    # remove leading and trailing punctuations
    is_negative = token.startswith("-")
    out = token.lstrip(string.punctuation)
    core = out.rstrip(string.punctuation)
    suffix = out[len(core):]  # trailing punctuation, re-attached at the end
    out = core.replace(",", ".").replace(":", ".")

    # leading characters, e.g. a10, h1n1
    m = re.match(r"\D+", out)
    if m:
        prefix = m.group(0)
        return prefix + " " + digit2text(out[len(prefix):], lang) + suffix

    # leading digits, e.g. 50th, 1900s
    to_format = "cardinal"
    # trailing characters as ordinal numbers, e.g. 50th, 3rd
    # TODO: more rules for multiple languages, e.g. date
    m = re.search(r"\b(\d+)(st|nd|rd|th)\b", out.lower())
    if m:
        to_format = "ordinal"
        out = m.group(1)

    # different cases for xx.xx
    if "." in out:
        segs = out.split(".")
        if all(len(s) == 3 for s in segs[1:]):  # 12.000.000
            out = out.replace(".", "")
        else:  # date 18.4.2009, IP address, time 18.30, etc.
            return " ".join(digit2text(s, lang) for s in segs) + suffix

    # digits followed by other characters, e.g. 1900s: verbalize the digits
    # and process the remainder recursively
    m = re.search(r"\b(\d+)(\D+)", out)
    if m:
        digits = m.group(1)
        suffix = " " + digit2text(out[len(digits):], lang) + suffix
        out = digits

    if is_negative:
        out = "-" + out

    try:
        num = int(out)
    except ValueError:
        try:
            num = float(out)
        except ValueError:
            # hand the raw string to num2words and let it decide
            num = out
            logging.warning(f"cannot transform '{out}' to numbers")

    try:
        d = num2words(num, lang=lang, to=to_format)
    except NotImplementedError:  # lang not supported, default to en
        if lang == "en":
            # Explicit raise instead of ``assert`` so the guard survives
            # ``python -O``; recursing again would never terminate.
            raise AssertionError("num2words raised NotImplementedError for lang='en'")
        d = digit2text(out, lang="en")
    except Exception:
        # best effort: drop the number rather than abort the whole corpus
        d = ""
        logging.warning(f"cannot process {out} ({num}) with {lang} in {to_format} mode")

    return d + suffix