in fastchat/serve/monitor/topic_clustering.py [0:0]
def read_texts(input_file, min_length, max_length, english_only):
    """Load, filter and de-duplicate prompt texts from a JSON file.

    The file is parsed with ``json.load`` and iterated item by item. For each
    item the prompt texts are taken from the first matching field:

    * ``"text"``: the value itself is the single prompt;
    * ``"conversation_a"``: the ``"content"`` of every turn whose ``"role"``
      is ``"user"``;
    * ``"conversation"``: same extraction as ``"conversation_a"``.

    Items that contain none of these fields are skipped.

    Args:
        input_file: Path of the JSON file to read.
        min_length: Minimum number of characters (after stripping) a prompt
            must have. A falsy value (``None`` or ``0``) disables the check.
        max_length: Maximum number of characters (after stripping) a prompt
            may have. A falsy value disables the check.
        english_only: If truthy, keep only prompts for which
            ``detect_language`` returns ``"English"``.

    Returns:
        A NumPy array holding the stripped prompts that passed all filters,
        in file order, with duplicates removed. Two prompts count as
        duplicates when they yield the same multiset of lower-cased words
        after ``remove_punctuation``.
    """
    seen_keys = set()
    texts = []

    # Use a context manager so the handle is closed even if parsing fails.
    # JSON interchange files are UTF-8, so do not depend on the locale default.
    with open(input_file, "r", encoding="utf-8") as fin:
        records = json.load(fin)

    for record in tqdm(records):
        if "text" in record:
            line_texts = [record["text"]]
        elif "conversation_a" in record:
            line_texts = [
                x["content"] for x in record["conversation_a"] if x["role"] == "user"
            ]
        elif "conversation" in record:
            line_texts = [
                x["content"] for x in record["conversation"] if x["role"] == "user"
            ]
        else:
            # No recognized field: skip the record instead of hitting an
            # unbound name or re-processing the previous record's texts.
            continue

        for text in line_texts:
            text = text.strip()

            # Filter language
            if english_only and detect_language(text) != "English":
                continue

            # Filter short or long prompts
            if min_length and len(text) < min_length:
                continue
            if max_length and len(text) > max_length:
                continue

            # De-duplication: the key ignores word order, letter case and
            # runs of spaces. Words are joined with a space (not "") so that
            # different word lists such as ["ab", "c"] and ["a", "bc"] cannot
            # collapse into the same key.
            words = sorted(
                w.lower() for w in remove_punctuation(text).split(" ") if w
            )
            dedup_key = " ".join(words)
            if dedup_key in seen_keys:
                continue

            seen_keys.add(dedup_key)
            texts.append(text)

    return np.array(texts)