in fastchat/data/clean_sharegpt.py [0:0]
def clean_html_all(content, begin, end):
    """
    Clean the source html files.

    Every sample in ``content[begin:end]`` is run through
    ``clean_html_one_sample`` in a process pool. Each result is unpacked as a
    ``(sample, error_code)`` pair and the sample is dropped when any of the
    following holds (checked in this order):

    * ``error_code`` is non-zero (1: too short, 2: wrong format,
      3: blocked words, 4: parser errors);
    * its ``"id"`` was already kept earlier in this run;
    * it has a non-``None`` ``"plugins"`` entry;
    * the values of its first two conversation turns match those of a sample
      that was already kept.

    One line is printed per dropped sample, followed by a summary line with
    the per-reason counts.

    Args:
        content: Sliceable collection of raw samples.
        begin: Start bound of the slice taken from ``content``.
        end: End bound of the slice taken from ``content``.

    Returns:
        A list of the samples that were kept, in their original order.

    Raises:
        ValueError: If a result carries a non-zero error code other than 1-4.
    """
    # error_code -> (message suffix, name of the counter to bump).
    error_reasons = {
        1: ("is too short", "too_short"),
        2: ("has a wrong format", "wrong_format"),
        3: ("contains blocked words", "blocked_words"),
        4: ("contains parser errors", "parser_error"),
    }
    counts = dict.fromkeys(
        (
            "too_short",
            "wrong_format",
            "blocked_words",
            "parser_error",
            "id_duplication",
            "value_duplication",
            "plugin",
        ),
        0,
    )

    content = content[begin:end]
    with ProcessPoolExecutor() as executor:
        # executor.map yields results in input order; list() drains it while
        # tqdm reports progress.
        processed = list(
            tqdm(executor.map(clean_html_one_sample, content), total=len(content))
        )

    # `visited` is keyed both by sample id and by the (first turn, second turn)
    # value pair; either way the stored value is the id of the kept sample.
    visited = {}
    new_content = []
    for sample, error_code in processed:
        cid = sample["id"]

        if error_code != 0:
            if error_code not in error_reasons:
                raise ValueError(f"Invalid error_code: {error_code}")
            reason, counter = error_reasons[error_code]
            print(f"id {cid} {reason}")
            counts[counter] += 1
            continue

        if cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            counts["id_duplication"] += 1
            continue

        if sample.get("plugins", None) is not None:
            print(f"id {cid} contains plugin")
            counts["plugin"] += 1
            continue

        key = (
            sample["conversations"][0]["value"],
            sample["conversations"][1]["value"],
        )
        if key in visited:
            print(f"id {cid} is a value duplication of {visited[key]}")
            counts["value_duplication"] += 1
            continue

        visited[cid] = visited[key] = cid
        new_content.append(sample)

    # Every processed sample is either kept or skipped.
    cnt_skip = len(processed) - len(new_content)

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_blocked_words: {counts['blocked_words']}, "
        f"cnt_parser_error: {counts['parser_error']}, "
        f"cnt_wrong_format: {counts['wrong_format']}, "
        f"cnt_too_short: {counts['too_short']}, "
        f"cnt_id_duplication: {counts['id_duplication']}, "
        f"cnt_value_duplication: {counts['value_duplication']}, "
        f"cnt_plugin: {counts['plugin']}"
    )

    return new_content