def clean_html_one_sample()

in fastchat/data/clean_sharegpt.py [0:0]


def clean_html_one_sample(sample):
    """Clean the conversation turns of one sample and report a status code.

    The sample's ``"conversations"`` list is first trimmed so that it starts
    with a "human" turn and does not end with one. Each remaining turn must
    then alternate human/gpt; its ``"value"`` is screened with
    ``contain_blocked_words`` and passed through ``html_to_markdown``.

    ``sample`` is modified in place (its ``"conversations"`` entry is
    rebound to new lists; the original list object is never mutated).

    Returns:
        A ``(sample, code)`` tuple, where ``code`` is:

        * 0 -- success; ``sample["conversations"]`` holds the converted turns.
        * 1 -- too little content: at most one turn before or after trimming,
          or, after conversion, fewer than 16 characters or no turns left.
        * 2 -- a turn's ``"from"`` breaks the human/gpt alternation.
        * 3 -- ``contain_blocked_words`` flagged a turn's value.
        * 4 -- ``html_to_markdown`` raised ``ParserRejectedMarkup`` or
          ``AssertionError``.

        For codes 2, 3 and 4 the sample carries the trimmed, unconverted
        turns.
    """
    expected_roles = ("human", "gpt")

    turns = sample["conversations"]
    if len(turns) < 2:
        return (sample, 1)

    # Adjust the offset for cases like https://sharegpt.com/c/VyaZlh4,
    # where the first turn is not a human one.
    if turns[0]["from"] != "human":
        turns = turns[1:]
        sample["conversations"] = turns
    if len(turns) < 2:
        return (sample, 1)

    # A trailing human turn has no answer, so drop it.
    if turns[-1]["from"] == "human":
        turns = turns[:-1]
        sample["conversations"] = turns
    if len(turns) < 2:
        return (sample, 1)

    total_chars = 0
    cleaned = []
    for idx, turn in enumerate(turns):
        speaker = turn["from"]
        if speaker != expected_roles[idx % 2]:
            return (sample, 2)

        raw_value = turn["value"]
        if contain_blocked_words(raw_value):
            return (sample, 3)

        try:
            markdown = html_to_markdown(raw_value)
        except (bs4.builder.ParserRejectedMarkup, AssertionError):
            return (sample, 4)

        # Filter empty answers like https://sharegpt.com/c/mrllZ6u:
        # stop at the first empty / non-printable-leading turn.
        if not (markdown and markdown[0].isprintable()):
            break

        total_chars += len(markdown)
        cleaned.append({"from": speaker, "value": markdown})

    # Keep only complete (human, gpt) pairs: drop a dangling odd turn.
    if len(cleaned) % 2:
        cleaned = cleaned[:-1]
    sample["conversations"] = cleaned

    # Note: total_chars also counts a dangling turn that was just dropped.
    if total_chars < 16 or not cleaned:
        return (sample, 1)

    return (sample, 0)