def digit2text()

in voxpopuli/get_lm_data.py [0:0]


def _spell_number(num, out: str, lang: str, to_format: str) -> str:
    """Spell out ``num`` with ``num2words``, retrying in English if needed.

    Args:
        num: Value handed to ``num2words`` (an ``int``, a ``float``, or the
            raw string when it could not be parsed as a number).
        out: Textual form of the number; only used in the warning message.
        lang: Language code passed to ``num2words``.
        to_format: ``num2words`` conversion mode ("cardinal" or "ordinal").

    Returns:
        The spelled-out number, or "" when the conversion fails.
    """
    try:
        return num2words(num, lang=lang, to=to_format)
    except Exception as err:
        # NotImplementedError is treated as "lang not supported": retry once
        # in English, keeping the requested mode so ordinals stay ordinals.
        # The explicit ``lang != "en"`` test (instead of an ``assert``) keeps
        # the retry bounded even when Python runs with ``-O``.
        if isinstance(err, NotImplementedError) and lang != "en":
            return _spell_number(num, out, "en", to_format)
        logging.warning(f"cannot process {out} ({num}) with {lang} in {to_format} mode")
        return ""


def digit2text(text: str, lang: str) -> str:
    """Replace the digits in a single token by their spelled-out form.

    The token is processed recursively: surrounding punctuation is peeled
    off (trailing punctuation is re-attached to the result, leading
    punctuation is dropped), a leading non-digit prefix is split away,
    ``,`` and ``:`` are treated like ``.``, dot-separated groups are either
    merged (thousands separators such as ``12.000.000``) or converted
    segment by segment, and a trailing non-digit part is converted
    separately. The remaining digits are spelled out with ``num2words``.

    Args:
        text: Token to normalise.
        lang: Language code passed to ``num2words``; English is used when
            ``num2words`` raises ``NotImplementedError``.

    Returns:
        ``text`` unchanged when it contains no digit, otherwise the
        normalised string (pieces are separated by single spaces). A number
        that cannot be spelled out contributes an empty string and a
        warning is logged.
    """
    # Use the same notion of "digit" (\d) as the regexes below. str.isdigit()
    # also accepts characters such as superscripts that \d does not match,
    # which used to turn e.g. "m²" into "m² " via the prefix branch.
    if re.search(r"\d", text) is None:
        return text

    # Surrounding spaces must not be mistaken for a non-digit prefix.
    stripped = text.strip(" ")

    # remove leading and trailing punctuations
    is_negative = stripped[0] == "-"
    out = stripped.lstrip(string.punctuation)
    out_tmp = out.rstrip(string.punctuation)
    suffix = out[len(out_tmp) :]  # trailing punctuation, re-attached at the end
    out = out_tmp.replace(",", ".").replace(":", ".")

    # leading characters, e.g. a10, h1n1
    m = re.search(r"^(\D+)", out)
    if m:
        prefix = m.group(1)
        return prefix + " " + digit2text(out[len(prefix) :], lang) + suffix

    # leading digits, e.g. 50th, 1900s
    to_format = "cardinal"
    # trailing characters as ordinal numbers, e.g. 50th, 3rd
    # TODO: more rules for multiple languages, e.g. date
    # NOTE: only the matched digits are kept; any text around the ordinal
    # token inside ``out`` is discarded.
    m = re.search(r"\b(\d+)(st|nd|rd|th)\b", out.lower())
    if m:
        to_format = "ordinal"
        out = m.group(1)

    # different cases for xx.xx
    if "." in out:
        segs = out.split(".")
        if all(len(s) == 3 for s in segs[1:]):  # 12.000.000
            out = out.replace(".", "")
        else:  # date 18.4.2009, IP address, time 18.30, etc.
            return " ".join(digit2text(s, lang) for s in segs) + suffix

    # digits followed by other characters, e.g. 1900s: convert the tail on
    # its own and keep only the leading digits in ``out``
    m = re.search(r"\b(\d+)(\D+)", out)
    if m:
        digits = m.group(1)
        suffix = " " + digit2text(out[len(digits) :], lang) + suffix
        out = digits

    if is_negative:
        out = "-" + out

    try:
        num = int(out)
    except ValueError:
        try:
            num = float(out)
        except ValueError:
            # Hand the raw string on; a failed conversion is logged there too.
            num = out
            logging.warning(f"cannot transform '{out}' to numbers")

    return _spell_number(num, out, lang, to_format) + suffix