def read_texts()

in fastchat/serve/monitor/topic_clustering.py [0:0]


def read_texts(input_file, min_length, max_length, english_only):
    """Load prompts from a JSON file, filter them, and drop duplicates.

    The file is parsed with ``json.load`` and iterated record by record.
    Each record contributes texts according to the first key it contains:

    * ``"text"``            -> that single value
    * ``"conversation_a"``  -> the ``"content"`` of every ``role == "user"`` turn
    * ``"conversation"``    -> the ``"content"`` of every ``role == "user"`` turn

    Records with none of these keys are skipped.

    Args:
        input_file: Path of the JSON file to read.
        min_length: Texts shorter than this many characters (after
            ``strip()``) are dropped. A falsy value disables the check.
        max_length: Texts longer than this many characters (after
            ``strip()``) are dropped. A falsy value disables the check.
        english_only: If truthy, keep only texts for which
            ``detect_language`` returns ``"English"``.

    Returns:
        A NumPy array built from the surviving stripped texts, in the order
        they were first encountered.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(input_file, "r") as fin:
        records = json.load(fin)

    visited = set()
    texts = []

    for record in tqdm(records):
        if "text" in record:
            line_texts = [record["text"]]
        elif "conversation_a" in record:
            line_texts = [
                x["content"] for x in record["conversation_a"] if x["role"] == "user"
            ]
        elif "conversation" in record:
            line_texts = [
                x["content"] for x in record["conversation"] if x["role"] == "user"
            ]
        else:
            # Unknown record shape: skip it. Without this branch the loop
            # would either raise NameError (first record) or silently
            # re-process the previous record's texts.
            continue

        for text in line_texts:
            text = text.strip()

            # Filter language
            if english_only and detect_language(text) != "English":
                continue

            # Filter short or long prompts
            if min_length and len(text) < min_length:
                continue
            if max_length and len(text) > max_length:
                continue

            # De-duplication: the key is the lower-cased, space-split words of
            # the punctuation-stripped text, sorted and concatenated, so texts
            # that differ only in case or word order map to the same key.
            # NOTE(review): words are joined without a separator, so distinct
            # word sets can collide (e.g. "ab c" vs "a bc"); kept as-is to
            # preserve existing results.
            words = sorted(x.lower() for x in remove_punctuation(text).split(" "))
            key = "".join(words)
            if key in visited:
                continue

            visited.add(key)
            texts.append(text)

    return np.array(texts)