def clean_html_all()

in fastchat/data/clean_sharegpt.py [0:0]


def clean_html_all(content, begin, end):
    """
    Clean the source html files.
    """
    cnt_skip = 0
    cnt_blocked_words = 0
    cnt_wrong_format = 0
    cnt_parser_error = 0
    cnt_too_short = 0
    cnt_id_duplication = 0
    cnt_value_duplication = 0
    cnt_plugin = 0
    cnt_tag = 0

    content = content[begin:end]
    processed = []
    with ProcessPoolExecutor() as executor:
        for result in tqdm(
            executor.map(clean_html_one_sample, content), total=len(content)
        ):
            processed.append(result)

    visited = {}
    new_content = []
    for sample, error_code in processed:
        cid = sample["id"]
        skipped = True

        if error_code != 0:
            if error_code == 1:
                print(f"id {cid} is too short")
                cnt_too_short += 1
            elif error_code == 2:
                print(f"id {cid} has a wrong format")
                cnt_wrong_format += 1
            elif error_code == 3:
                print(f"id {cid} contains blocked words")
                cnt_blocked_words += 1
            elif error_code == 4:
                print(f"id {cid} contains parser errors")
                cnt_parser_error += 1
            else:
                raise ValueError(f"Invalid error_code: {error_code}")
        elif cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
        elif sample.get("plugins", None) is not None:
            print(f"id {cid} contains plugin")
            cnt_plugin += 1
        else:
            key = (
                sample["conversations"][0]["value"],
                sample["conversations"][1]["value"],
            )
            if key in visited:
                print(f"id {cid} is a value duplication of {visited[key]}")
                cnt_value_duplication += 1
            else:
                visited[cid] = visited[key] = cid
                skipped = False

        if not skipped:
            new_content.append(sample)
        else:
            cnt_skip += 1

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_blocked_words: {cnt_blocked_words}, cnt_parser_error: {cnt_parser_error}, "
        f"cnt_wrong_format: {cnt_wrong_format}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, cnt_plugin: {cnt_plugin}"
    )

    return new_content