toolkits/sft_data_preprocessing/sample_stats.py (23 lines of code) (raw):
import sys
import pandas
import json
def load_records(file_path):
    """Load sample records from a JSON or JSON Lines file.

    The file is first parsed as one JSON document. If that fails, it is
    parsed as JSON Lines (one JSON value per line, blank lines skipped).

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A list of the parsed records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If the document is neither a JSON array nor an object.
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    try:
        records = json.loads(text)
    except json.JSONDecodeError:
        # Not a single JSON document, so treat it as JSON Lines. Text mode
        # has already normalised line endings to "\n"; splitting on "\n"
        # only avoids breaking on separators that may occur inside strings.
        records = [json.loads(line) for line in text.split("\n") if line.strip()]

    if isinstance(records, dict):
        # A JSON Lines file with exactly one record is also a valid JSON
        # document; wrap it so it is handled like any other record list.
        records = [records]
    if not isinstance(records, list):
        raise ValueError(
            f"expected a JSON array or JSON Lines records in {file_path!r}, "
            f"got {type(records).__name__}"
        )
    return records


def build_samples(records):
    """Concatenate the text fields of each record into one string.

    Args:
        records: Iterable of dicts with "instruction" and "output" keys
            and an optional "input" key.

    Returns:
        A list with one "instruction + input + output" string per record.

    Raises:
        KeyError: If a record lacks "instruction" or "output".
    """
    # A missing or null "input" is treated as empty text.
    return [
        record["instruction"] + (record.get("input") or "") + record["output"]
        for record in records
    ]


def main(argv=None):
    """Print descriptive statistics of the sample lengths in a data file.

    Args:
        argv: Argument vector where argv[1] is the data file path.
            Defaults to sys.argv.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("请提供文件路径作为参数")
        sys.exit(1)

    samples = build_samples(load_records(argv[1]))
    # Length is measured in characters of the concatenated sample text.
    lengths = pandas.Series(samples).map(len)
    print(lengths.describe())


if __name__ == "__main__":
    main()