in cc_net/text_normalizer.py [0:0]
def normalize_spacing_for_tok(text: str, language: str = "en") -> str:
res = (
text.replace("\r", "")
# remove extra spaces
.replace("(", " (")
.replace(")", ") ")
.replace(" +", " ")
)
res = re.sub(r"\) ([\.\!\:\?\;\,])", r"\)\1", res)
res = res.replace("( ", "(").replace(" )", ")")
res = re.sub(r"(\d) \%", r"\1\%", res)
res = res.replace(" :", ":").replace(" ;", ";")
res = res.replace("`", "'").replace("''", ' " ')
res = (
res.replace("„", '"')
.replace("“", '"')
.replace("”", '"')
.replace("–", "-")
.replace("—", " - ")
.replace(" +", " ")
.replace("´", "'")
.replace("([a-z])‘([a-z])", r"\1'\2/")
.replace("([a-z])’([a-z])", r"\1'\2/")
.replace("‘", '"')
.replace("‚", '"')
.replace("’", '"')
.replace("''", '"')
.replace("´´", '"')
.replace("…", "...")
# French quotes
.replace(" « ", ' "')
.replace("« ", '"')
.replace("«", '"')
.replace(" » ", '" ')
.replace(" »", '"')
.replace("»", '"')
# handle pseudo-spaces
.replace(" %", "%")
.replace("nº ", "nº ")
.replace(" :", ":")
.replace(" ºC", " ºC")
.replace(" cm", " cm")
.replace(" ?", "?")
.replace(" !", "!")
.replace(" ;", ";")
.replace(", ", ", ")
.replace(" +", " ")
.replace(".", ". ")
)
# English "quotation," followed by comma, style
if language == "en":
res = re.sub(r"\"([,\.]+)", r"\1\"", res)
# Czech is confused
elif language == "cs" or language == "cz":
pass
# German/Spanish/French "quotation", followed by comma, style
else:
res = res.replace(',"', '",')
res = re.sub(
r"(\.+)\"(\s*[^<])", r"\"\1\2", res
) # don't fix period at end of sentence
if (
language == "de"
or language == "es"
or language == "cz"
or language == "cs"
or language == "fr"
):
res = re.sub(r"(\d) (\d)", r"\1,\2", res)
else:
res = re.sub(r"(\d) (\d)", r"\1.\2", res)
return res