in fastchat/data/clean_sharegpt.py [0:0]
def clean_html_one_sample(sample):
    """Clean a single conversation sample, converting each turn to markdown.

    The sample is modified in place: ``sample["conversations"]`` is replaced
    as turns are trimmed, and on success it holds the cleaned turns.

    Args:
        sample: A dict with a ``"conversations"`` list; every turn is a dict
            with ``"from"`` and ``"value"`` keys.

    Returns:
        A ``(sample, status)`` tuple, where ``status`` is:
            0 - cleaned successfully,
            1 - too short or empty (before or after cleaning),
            2 - turns do not alternate human / gpt,
            3 - ``contain_blocked_words`` flagged a turn,
            4 - ``html_to_markdown`` rejected a turn's markup.
    """
    expected_roles = ("human", "gpt")

    turns = sample["conversations"]
    if len(turns) <= 1:
        return sample, 1

    # A conversation must open with a human turn; skip one leading non-human
    # turn for cases like https://sharegpt.com/c/VyaZlh4
    if turns[0]["from"] != "human":
        turns = sample["conversations"] = turns[1:]
        if len(turns) <= 1:
            return sample, 1

    # A trailing human turn has no answer, so discard it.
    if turns[-1]["from"] == "human":
        turns = sample["conversations"] = turns[:-1]
        if len(turns) <= 1:
            return sample, 1

    total_chars = 0
    cleaned = []
    for idx, turn in enumerate(turns):
        speaker = turn["from"]
        if speaker != expected_roles[idx % 2]:
            return sample, 2

        raw_text = turn["value"]
        if contain_blocked_words(raw_text):
            return sample, 3

        try:
            text = html_to_markdown(raw_text)
        except (bs4.builder.ParserRejectedMarkup, AssertionError):
            return sample, 4

        # Stop at the first empty / non-printable answer, like
        # https://sharegpt.com/c/mrllZ6u; everything after it is discarded.
        if not (text and text[0].isprintable()):
            break

        total_chars += len(text)
        cleaned.append({"from": speaker, "value": text})

    # Keep only complete (human, gpt) pairs: drop a dangling last turn.
    if len(cleaned) % 2:
        cleaned.pop()
    sample["conversations"] = cleaned

    # NOTE(review): total_chars was accumulated before the truncation above,
    # so it still counts a turn that was just dropped - confirm this is
    # intended.
    status = 1 if (total_chars < 16 or not cleaned) else 0
    return sample, status