in fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/filter_bad_conv.py [0:0]
def detect_type(conv):
    """Classify a conversation record by the first problem found in it.

    The keys ``conversation_a``, ``conversation_b`` and ``conversation`` are
    inspected in that order; keys absent from ``conv`` are skipped. Checking
    stops at the first conversation/message that triggers a non-CORRECT code.

    Args:
        conv: Mapping that may contain any of the conversation keys above.
            Each present value is iterated as a sequence of rows, and every
            row is indexed with ``"content"``.

    Returns:
        TypeCode.BAD_FORMAT if a present conversation has no messages or has
        a message whose content is not a ``str``;
        TypeCode.ANONYMIZED if a message contains ``<anonymized>``;
        TypeCode.REDACTED if a message contains ``<redacted>``;
        TypeCode.BLOCKED_WORD if a message contains any entry of
        ``blocked_words``;
        TypeCode.CORRECT otherwise (including when none of the keys exist).

    Raises:
        KeyError: If a row has no ``"content"`` entry.
    """
    for key in ("conversation_a", "conversation_b", "conversation"):
        if key not in conv:
            continue

        messages = [row["content"] for row in conv[key]]
        # Empty conversations and non-string contents are both malformed.
        if not messages or not all(isinstance(msg, str) for msg in messages):
            return TypeCode.BAD_FORMAT

        for msg in messages:
            # Matching is done on the lower-cased text after it has been run
            # through cc_converter (NOTE(review): presumably a script/variant
            # normalizer -- confirm against its definition), so the markers
            # and blocked words are expected to be lower-case.
            normalized = cc_converter.convert(msg.lower())
            if "<anonymized>" in normalized:
                return TypeCode.ANONYMIZED
            if "<redacted>" in normalized:
                return TypeCode.REDACTED
            if any(word in normalized for word in blocked_words):
                return TypeCode.BLOCKED_WORD

    return TypeCode.CORRECT