in fastchat/data/optional_clean.py [0:0]
# One digit followed by eight more copies of itself, i.e. a run of at least
# nine identical consecutive digits.
_REPEATED_DIGIT_RE = re.compile(r"(\d)\1{8}")


def skip(conv, args):
    """Return True if the conversation ``conv`` should be filtered out.

    Two independent filters are applied, in this order:

    1. Language filter -- only active when ``args.keep_lang`` is not ``"all"``
       or ``args.skip_lang`` is not ``None``. The ``"value"`` fields of all
       turns are joined with newlines and handed to ``Detector``; if detection
       raises ``pycld2.error`` or ``UnknownLanguage`` the code falls back to
       ``"unknown"``. The conversation is skipped when the detected code
       differs from ``args.keep_lang`` (unless that is ``"all"``) or equals
       ``args.skip_lang``.
    2. Repetition filter -- only active when ``args.reduce_rep`` is truthy.
       The conversation is skipped when any turn contains the same digit
       repeated at least nine times in a row.

    Args:
        conv: Mapping with a ``"conversations"`` list; each item is a mapping
            with a string ``"value"`` entry.
        args: Object exposing ``keep_lang``, ``skip_lang`` and ``reduce_rep``
            attributes (presumably the parsed command-line namespace --
            confirm against the caller).

    Returns:
        ``True`` if the conversation should be dropped, ``False`` otherwise.
    """
    turns = conv["conversations"]

    # Remove certain languages
    if args.keep_lang != "all" or args.skip_lang is not None:
        text = "\n".join(turn["value"] for turn in turns)
        try:
            lang_code = Detector(text).language.code
        except (pycld2.error, polyglot.detect.base.UnknownLanguage):
            # Undetectable text is treated as its own pseudo-language so that
            # a specific keep_lang rejects it and skip_lang can target it.
            lang_code = "unknown"

        if args.keep_lang != "all" and lang_code != args.keep_lang:
            return True

        if lang_code == args.skip_lang:
            return True

    # Remove repetitive numbers
    if args.reduce_rep:
        return any(_REPEATED_DIGIT_RE.search(turn["value"]) for turn in turns)

    return False