# print_stats_tokens()
#
# in aoai/token_count_utils.py [0:0]

def print_stats_tokens(jsonl_files, model="gpt-4o-2024-05-13"):
    """Log token-count statistics for each JSONL dataset file.

    For every path in *jsonl_files*, reads one JSON object per line and
    reports the distribution of total message tokens, assistant-message
    tokens, and (when the example defines tools) tool/function tokens.

    Args:
        jsonl_files: Iterable of paths to JSONL files; each line must be a
            JSON object that may contain "messages" and "tools" keys.
        model: Model name forwarded to the token-counting helpers.
    """
    for jsonl_path in jsonl_files:
        logger.info('*' * 50)
        logger.info(f"### [TOKEN_STATS] Processing file: {jsonl_path}")
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            dataset = [json.loads(line) for line in f]

        total_tokens = []
        assistant_tokens = []
        function_tokens = []

        for ex in dataset:
            # "messages" is a list of message dicts, so default to an empty
            # list (the original defaulted to {}, the wrong container type).
            messages = ex.get("messages", [])
            total_tokens.append(count_token(messages, model))
            assistant_tokens.append(num_assistant_tokens_from_messages(messages, model))
            # Count tool tokens whenever any tools are present. The original
            # defaulted to the sentinel {""} and required len > 1, which
            # silently skipped examples that define exactly one tool.
            functions = ex.get("tools")
            if functions:
                function_tokens.append(num_tokens_from_functions(functions, model))

        print_distribution(total_tokens, "total tokens")
        print_distribution(function_tokens, "function tokens")
        print_distribution(assistant_tokens, "assistant tokens")
        logger.info('*' * 50)