in aoai/token_count_utils.py [0:0]
def print_stats_tokens(jsonl_files, model="gpt-4o-2024-05-13"):
    """Log token-count distribution statistics for each JSONL dataset file.

    For every file in *jsonl_files*, each line is parsed as one JSON example
    (chat fine-tuning format with a ``messages`` list and optional ``tools``
    list). Three per-example token counts are collected and their
    distributions logged via ``print_distribution``:

    * total tokens over all messages (``count_token``),
    * assistant-message tokens (``num_assistant_tokens_from_messages``),
    * tool/function-definition tokens (``num_tokens_from_functions``),
      only for examples that actually define tools.

    Args:
        jsonl_files: Iterable of paths to JSONL dataset files.
        model: Model name passed to the tokenizer helpers
            (default ``"gpt-4o-2024-05-13"``).

    Returns:
        None. Results are emitted through ``logger`` /
        ``print_distribution`` as a side effect.
    """
    for jsonl_path in jsonl_files:
        logger.info('*' * 50)
        logger.info(f"### [TOKEN_STATS] Processing file: {jsonl_path}")
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            dataset = [json.loads(line) for line in f]
        total_tokens = []
        assistant_tokens = []
        function_tokens = []
        for ex in dataset:
            # Both fields are lists in the chat fine-tuning schema; default
            # to empty lists (the old defaults were {} and the set {""}).
            messages = ex.get("messages", [])
            functions = ex.get("tools", [])
            total_tokens.append(count_token(messages, model))
            assistant_tokens.append(num_assistant_tokens_from_messages(messages, model))
            # Count tool tokens whenever any tools are present. The previous
            # guard (len(functions) > 1 and functions != {''}) silently
            # dropped examples that defined exactly one tool from the stats.
            if functions:
                function_tokens.append(num_tokens_from_functions(functions, model))
        print_distribution(total_tokens, "total tokens")
        print_distribution(function_tokens, "function tokens")
        print_distribution(assistant_tokens, "assistant tokens")
        logger.info('*' * 50)