in scripts/zeno_visualize.py [0:0]
def main():
    """Upload the results of your benchmark tasks to the Zeno AI evaluation platform.

    This script expects your results to live in a data folder where subfolders
    contain results of individual models.

    For every task that all models have in common, one Zeno project is created
    (from the first model's data), and each model's outputs are uploaded to it
    as a separate system.

    Reads the API key from the ``ZENO_API_KEY`` environment variable and the
    data folder / project name from the parsed command-line arguments.

    Raises:
        KeyError: if ``ZENO_API_KEY`` is not set.
        AssertionError: if no model directories are found, or if the models
            have no task in common.
    """
    args = parse_args()

    client = ZenoClient(os.environ["ZENO_API_KEY"])

    # Get all model subfolders from the parent data folder.
    models = [entry.name for entry in os.scandir(Path(args.data_path)) if entry.is_dir()]

    assert len(models) > 0, "No model directories found in the data_path."

    # Get the tasks from the latest results file of the first model.
    tasks = set(tasks_for_model(models[0], args.data_path))

    # Get tasks names from the latest results file for each model
    # Get intersection of tasks for all models
    for model in models:
        old_tasks = tasks.copy()
        task_count = len(tasks)
        model_tasks = set(tasks_for_model(model, args.data_path))
        # In-place update: plain ``set.intersection`` returns a new set and
        # would leave ``tasks`` untouched, so the narrowing would be lost.
        tasks.intersection_update(model_tasks)

        if task_count != len(tasks):
            eval_logger.warning(
                f"All models must have the same tasks. {model} has tasks: {model_tasks} but have already recorded tasks: {old_tasks}. Taking intersection {tasks}"
            )

    assert (
        len(tasks) > 0
    ), "Must provide at least one task in common amongst models to compare."

    for task in tasks:
        # Upload data for all models
        for model_index, model in enumerate(models):
            # Get latest results and sample results for a model
            model_dir = Path(args.data_path, model)
            model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
            model_results_filenames = get_results_filenames(model_files)
            model_sample_filenames = get_sample_results_filenames(model_files)
            latest_results = get_latest_filename(
                [Path(f).name for f in model_results_filenames]
            )
            latest_sample_results = get_latest_filename(
                [Path(f).name for f in model_sample_filenames if task in f]
            )

            # Parse the aggregated results file once and close it promptly;
            # both the model args and the per-task configs come from it.
            with open(model_dir / latest_results, encoding="utf-8") as results_file:
                results = json.load(results_file)

            # Replace characters that are unsafe in names with "__".
            model_args = re.sub(
                r"[\"<>:/\|\\?\*\[\]]+",
                "__",
                results["config"]["model_args"],
            )
            print(model_args)

            # The sample results file is JSON Lines: one JSON document per line.
            with open(
                model_dir / latest_sample_results,
                "r",
                encoding="utf-8",
            ) as file:
                data = [json.loads(line.strip()) for line in file]

            config = results["configs"][task]

            if model_index == 0:  # Only need to assemble data for the first model
                metrics = [
                    ZenoMetric(
                        name=metric["metric"],
                        type="mean",
                        columns=[metric["metric"]],
                    )
                    for metric in config["metric_list"]
                ]
                # One project per task; suffix the task name only when several
                # tasks would otherwise collide on the same project name.
                project = client.create_project(
                    name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""),
                    view="text-classification",
                    metrics=metrics,
                )
                project.upload_dataset(
                    generate_dataset(data, config),
                    id_column="id",
                    data_column="data",
                    label_column="labels",
                )

            # Every model (including the first) is uploaded as its own system.
            project.upload_system(
                generate_system_df(data, config),
                name=model,
                id_column="id",
                output_column="output",
            )