in fastchat/serve/monitor/dataset_release_scripts/arena_33k/filter_bad_conv.py [0:0]
def detect_type(conv):
    """Classify one conversation record and return a ``TypeCode`` member.

    ``conv`` is indexed with the keys ``"conversation_a"``,
    ``"conversation_b"``, ``"model_a"`` and ``"model_b"``.  Each
    conversation is iterated as a sequence of rows that expose
    ``"content"`` and ``"role"`` entries.

    ``TypeCode``, ``frequent_prompts`` and ``blocked_words`` are not defined
    in this function; they are looked up from the surrounding scope.

    Checks run in a fixed order and the first one that fires decides the
    result.  For each conversation side, in order:

    1. ``BAD_FORMAT``    - some message content is not a ``str``.
    2. ``TOO_SHORT``     - at most two messages and every user prompt
       (lower-cased, stripped) is shorter than 16 characters.
    3. ``TOO_FREQUENT``  - every user prompt is in ``frequent_prompts``.
    4. Per message, on the lower-cased text: ``ANONYMIZED`` if it contains
       ``"<anonymized>"``, else ``REDACTED`` if it contains
       ``"<redacted>"``, else ``BLOCKED_WORD`` if it contains any entry of
       ``blocked_words``.

    After both sides pass: ``BLOCKED_MODEL`` if either model name is
    ``"vicuna-33b"`` or ``"mpt-30b-chat"``; otherwise ``CORRECT``.
    """
    for side in ("conversation_a", "conversation_b"):
        turns = conv[side]
        contents = [turn["content"] for turn in turns]

        # Reject the record outright if any content is not plain text.
        if any(not isinstance(text, str) for text in contents):
            return TypeCode.BAD_FORMAT

        prompts = [
            turn["content"].lower().strip()
            for turn in turns
            if turn["role"] == "user"
        ]

        # NOTE: with no user turns both checks below hold vacuously, so such
        # a side is reported as TOO_SHORT (<= 2 messages) or TOO_FREQUENT.
        if len(contents) <= 2:
            has_long_prompt = any(len(prompt) >= 16 for prompt in prompts)
            if not has_long_prompt:
                return TypeCode.TOO_SHORT

        has_uncommon_prompt = any(
            prompt not in frequent_prompts for prompt in prompts
        )
        if not has_uncommon_prompt:
            return TypeCode.TOO_FREQUENT

        # Marker and blocked-word checks are done message by message so that
        # an earlier message always takes precedence over a later one.
        for text in contents:
            lowered = text.lower()
            if "<anonymized>" in lowered:
                return TypeCode.ANONYMIZED
            if "<redacted>" in lowered:
                return TypeCode.REDACTED
            if any(word in lowered for word in blocked_words):
                return TypeCode.BLOCKED_WORD

    excluded_models = ("vicuna-33b", "mpt-30b-chat")
    if any(conv[slot] in excluded_models for slot in ("model_a", "model_b")):
        return TypeCode.BLOCKED_MODEL

    return TypeCode.CORRECT