tracking/translations_parser/publishers.py

import csv
import logging
import sys
from abc import ABC
from collections import defaultdict
from pathlib import Path
from typing import Sequence

import wandb
import yaml

from translations_parser.data import Metric, TrainingEpoch, ValidationEpoch
from translations_parser.utils import parse_task_label, parse_gcp_metric, patch_model_name

logger = logging.getLogger(__name__)

METRIC_KEYS = sorted(set(Metric.__annotations__.keys()) - {"importer", "dataset", "augmentation"})


class Publisher(ABC):
    """
    Abstract class used to publish parsed data.
    Either the `handle_*` methods can be overridden for real-time publication
    (introduced later on), or the `publish` method can be overridden to publish
    all results at once (including parser run date, configuration…).
    """

    def open(self, parser) -> None:
        ...

    def handle_training(self, training: TrainingEpoch) -> None:
        ...

    def handle_validation(self, validation: ValidationEpoch) -> None:
        ...

    def handle_metrics(self, metrics: Sequence[Metric]) -> None:
        ...

    def publish(self) -> None:
        ...

    def close(self) -> None:
        ...


class CSVExport(Publisher):
    def __init__(self, output_dir: Path) -> None:
        from translations_parser.parser import TrainingParser

        if not output_dir.is_dir():
            raise ValueError("Output must be a valid directory for the CSV export")
        self.output_dir = output_dir
        self.parser: TrainingParser | None = None

    def open(self, parser=None) -> None:
        self.parser = parser

    def write_data(
        self, output: Path, entries: Sequence[TrainingEpoch | ValidationEpoch], dataclass: type
    ) -> None:
        if not entries:
            logger.warning(f"No {dataclass.__name__} entry, skipping.")
        with open(output, "w") as f:
            writer = csv.DictWriter(f, fieldnames=dataclass.__annotations__)
            writer.writeheader()
            for entry in entries:
                writer.writerow(vars(entry))

    def publish(self) -> None:
        assert self.parser is not None, "Parser must be set to run CSV publication."
        training_log = self.parser.output
        training_output = self.output_dir / "training.csv"
        if training_output.exists():
            logger.warning(f"Training output file {training_output} exists, skipping.")
        else:
            self.write_data(training_output, training_log.training, TrainingEpoch)
        validation_output = self.output_dir / "validation.csv"
        if validation_output.exists():
            logger.warning(f"Validation output file {validation_output} exists, skipping.")
        else:
            self.write_data(validation_output, training_log.validation, ValidationEpoch)
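

# Usage sketch for the CSV publisher (illustrative only: `log_lines` and the output path are
# made-up placeholders, and in practice the parser drives its publishers itself; the manual
# wiring below mirrors what `WandB.publish_group_logs` does further down in this module).
#
#   from pathlib import Path
#   from translations_parser.parser import TrainingParser
#
#   publisher = CSVExport(output_dir=Path("artifacts"))
#   parser = TrainingParser(logs_iter=iter(log_lines), publishers=[publisher])
#   publisher.open(parser)
#   # ... once the parser has populated `parser.output` ...
#   publisher.publish()  # writes training.csv and validation.csv into `output_dir`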


class WandB(Publisher):
    def __init__(
        self,
        *,
        project: str,
        group: str,
        name: str,
        suffix: str = "",
        # Optional path to a directory containing training artifacts
        artifacts: Path | None = None,
        artifacts_name: str = "logs",
        **extra_kwargs,
    ):
        from translations_parser.parser import TrainingParser

        # Raise the wandb module logger to ERROR, so we output training logs instead
        self.wandb_logger = logging.getLogger("wandb")
        self.wandb_logger.setLevel(logging.ERROR)

        self.project = project
        self.group = group
        self.suffix = suffix
        # Build a unique run identifier based on the passed suffix.
        # This ID is also used as the display name on W&B, as the interface expects
        # unique display names among runs.
        self.run = f"{name}{suffix}"
        self.artifacts = artifacts
        self.artifacts_name = artifacts_name
        self.extra_kwargs = extra_kwargs
        self.parser: TrainingParser | None = None
        self.wandb: wandb.sdk.wandb_run.Run | wandb.sdk.lib.disabled.RunDisabled | None = None

    def close(self) -> None:
        if self.wandb is None:
            return
        # Publish artifacts
        if self.artifacts:
            artifact = wandb.Artifact(name=self.artifacts_name, type=self.artifacts_name)
            artifact.add_dir(local_path=str(self.artifacts.resolve()))
            self.wandb.log_artifact(artifact)
        if self.parser is not None:
            # Store Marian logs as the main log artifact, instead of the W&B client runtime.
            # This will be overwritten in case an unhandled exception occurs.
            for line in self.parser.parsed_logs:
                sys.stdout.write(f"{line}\n")
        self.wandb.finish()

    def open(self, parser=None) -> None:
        self.parser = parser
        config = getattr(parser, "config", {}).copy()
        config.update(self.extra_kwargs.pop("config", {}))

        # Publish dataset stats directly in the dashboard
        datasets = config.pop("datasets", None)

        try:
            self.wandb = wandb.init(
                project=self.project,
                group=self.group,
                name=self.run,
                id=self.run,
                config=config,
                # Since we use unique run names based on the group ID (e.g. finetune-student_MjcJG),
                # we can use "allow" mode for resuming a stopped Taskcluster run in case of preemption.
                # It will continue logging to the same run if it exists.
                # Offline publication should handle run deletion separately (use --override-runs).
                resume="allow",
                **self.extra_kwargs,
            )
            if self.wandb.resumed:
                logger.info(f"W&B run is being resumed from existing run '{self.run}'.")
        except Exception as e:
            logger.error(f"WandB client could not be initialized: {e}. No data will be published.")

        # Guard against a failed initialization above, in which case `self.wandb` is still None
        if datasets is not None and self.wandb is not None:
            # Log dataset sizes as a custom bar chart
            self.wandb.log(
                {
                    "Datasets": wandb.plot.bar(
                        wandb.Table(
                            columns=["Name", "Count"],
                            data=[[key, value] for key, value in datasets.items()],
                        ),
                        "Name",
                        "Count",
                        title="Datasets",
                    )
                }
            )
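
    # Usage sketch (illustrative only: the project/group/name values are made up, and any extra
    # keyword argument such as `tags` is forwarded untouched to `wandb.init`, except for
    # `config`, which is merged into the parser configuration in `open`).
    #
    #   publisher = WandB(
    #       project="example-project",
    #       group="baseline_abc123",
    #       name="train-student",
    #       suffix="_abc123",
    #       config={"extra": "values"},
    #       tags=["taskcluster"],
    #   )
    #   publisher.open(parser)  # calls wandb.init(..., id="train-student_abc123", resume="allow")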

    def generic_log(self, data: TrainingEpoch | ValidationEpoch) -> None:
        if self.wandb is None:
            return
        epoch = vars(data)
        step = epoch.pop("up")
        for key, val in epoch.items():
            if val is None:
                # Do not publish null values (e.g. perplexity in Marian 1.10)
                continue
            self.wandb.log(step=step, data={key: val})

    def handle_training(self, training: TrainingEpoch) -> None:
        self.generic_log(training)

    def handle_validation(self, validation: ValidationEpoch) -> None:
        self.generic_log(validation)

    def handle_metrics(self, metrics: Sequence[Metric]) -> None:
        if self.wandb is None:
            return
        for metric in metrics:
            title = metric.importer
            if metric.augmentation:
                title = f"{title}_{metric.augmentation}"
            if metric.dataset:
                title = f"{title}_{metric.dataset}"
            # Publish a bar chart (a table with values will also be available from W&B)
            self.wandb.log(
                {
                    title: wandb.plot.bar(
                        wandb.Table(
                            columns=["Metric", "Value"],
                            data=[
                                [key, getattr(metric, key)]
                                for key in METRIC_KEYS
                                if getattr(metric, key) is not None
                            ],
                        ),
                        "Metric",
                        "Value",
                        title=title,
                    )
                }
            )
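
    # For example, a Metric with importer="flores", dataset="devtest" and no augmentation is
    # published by `handle_metrics` under the panel title "flores_devtest"; with an augmentation,
    # the title becomes "flores_<augmentation>_devtest" (example values only).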

    @classmethod
    def publish_group_logs(
        cls,
        *,
        logs_parent_folder: list[str],
        project: str,
        group: str,
        suffix: str,
        existing_runs: list[str] | None = None,
        snakemake: bool = False,
    ) -> None:
        """
        Publish files within `logs_dir` to W&B artifacts for a specific group.
        A fake W&B run named `group_logs` is created to publish those artifacts
        along with all evaluation files (quantized + experiments).
        If `existing_runs` is set, runs that are found but not listed there are
        also published to W&B.
        """
        from translations_parser.parser import TrainingParser

        try:
            if (
                len(
                    wandb.Api().runs(
                        path=project, filters={"display_name": "group_logs", "group": group}
                    )
                )
                > 0
            ):
                logger.warning("Skipping group_logs fake run publication as it already exists")
                return
        except ValueError as e:
            # The project may not exist yet, as group_logs is published before the first training task
            if "could not find project" not in str(e).lower():
                logger.warning(f"Detection of a previous group_logs run failed: {e}")

        logs_dir = Path("/".join([*logs_parent_folder[:-1], "logs", project, group]))
        models_dir = Path("/".join([*logs_parent_folder[:-1], "models", project, group]))
        # Old experiments use the `speed` directory for quantized metrics
        quantized_metrics = sorted(
            Path(
                "/".join(
                    [*logs_parent_folder[:-1], "models", project, group, "evaluation", "speed"]
                )
            ).glob("*.metrics")
        )
        logs_metrics = sorted((logs_dir / "eval").glob("eval*.log"))
        direct_metrics = sorted((logs_dir / "metrics").glob("*.metrics"))
        taskcluster_metrics = []
        # Do not retrieve metrics from the models directory for legacy Snakemake experiments
        if snakemake is False:
            taskcluster_metrics = sorted(models_dir.glob("**/*.metrics"))
        if quantized_metrics:
            logger.info(f"Found {len(quantized_metrics)} quantized metrics from the speed folder")
        if logs_metrics:
            logger.info(f"Found {len(logs_metrics)} metrics from task logs")
        if direct_metrics:
            logger.info(f"Found {len(direct_metrics)} Snakemake metrics from .metrics artifacts")
        if taskcluster_metrics:
            logger.info(
                f"Found {len(taskcluster_metrics)} Taskcluster metrics from .metrics artifacts"
            )

        # Store metrics by run name
        metrics = defaultdict(list)
        # Add metrics from the speed folder
        for file in quantized_metrics:
            importer, dataset = file.stem.split("_", 1)
            metrics["quantized"].append(Metric.from_file(file, importer=importer, dataset=dataset))
        # Add metrics from task logs
        for file in logs_metrics:
            try:
                model_name, importer, dataset, aug = parse_task_label(file.stem)
                with file.open("r") as f:
                    lines = f.readlines()
                metrics[model_name].append(
                    Metric.from_tc_context(
                        importer=importer, dataset=dataset, lines=lines, augmentation=aug
                    )
                )
            except ValueError as e:
                logger.error(f"Could not parse metrics from {file.resolve()}: {e}")
        # Add metrics from old Snakemake .metrics files
        for file in direct_metrics:
            model_name, importer, dataset, aug = parse_task_label(file.stem)
            try:
                metrics[model_name].append(
                    Metric.from_file(file, importer=importer, dataset=dataset, augmentation=aug)
                )
            except ValueError as e:
                logger.error(f"Could not parse metrics from {file.resolve()}: {e}")
        # Add metrics from new Taskcluster .metrics files
        for file in taskcluster_metrics:
            model_name = patch_model_name(file.parent.name)
            try:
                metric_attrs = parse_gcp_metric(file.stem)
                metrics[model_name].append(
                    Metric.from_file(
                        file,
                        importer=metric_attrs.importer,
                        dataset=metric_attrs.dataset,
                        augmentation=metric_attrs.augmentation,
                    )
                )
            except ValueError as e:
                logger.error(f"Could not parse metrics from {file.resolve()}: {e}")

        # Publish missing runs (runs without training data)
        missing_run_metrics = {}
        if existing_runs is not None:
            missing_run_metrics = {
                name: metrics for name, metrics in metrics.items() if name not in existing_runs
            }

        for model_name, model_metrics in missing_run_metrics.items():
            logger.info(f"Creating missing run {model_name} with associated metrics")
            publisher = cls(
                project=project,
                group=group,
                name=model_name,
                suffix=suffix,
            )
            publisher.open(TrainingParser(logs_iter=iter([]), publishers=[]))
            publisher.handle_metrics(model_metrics)
            publisher.close()

        # Publication of the `group_logs` fake run
        config = {}
        config_path = Path(
            "/".join([*logs_parent_folder[:-1], "experiments", project, group, "config.yml"])
        )
        if not config_path.is_file():
            logger.warning(f"No configuration file at {config_path}, skipping.")
        else:
            # Publish the YAML configuration as the configuration of the group run
            with config_path.open("r") as f:
                data = f.read()
            try:
                config.update(yaml.safe_load(data))
            except Exception as e:
                logger.error(f"Config could not be read at {config_path}: {e}")

        publisher = cls(
            project=project,
            group=group,
            name="group_logs",
            suffix=suffix,
        )
        publisher.wandb = wandb.init(
            project=project,
            group=group,
            name=publisher.run,
            id=publisher.run,
            config=config,
        )
        if metrics:
            # Publish all evaluation metrics to a table
            table = wandb.Table(
                columns=["Group", "Model", "Importer", "Dataset", "Augmentation", *METRIC_KEYS],
                data=[
                    [group, run_name, metric.importer, metric.dataset, metric.augmentation]
                    + [getattr(metric, attr) for attr in METRIC_KEYS]
                    for run_name, run_metrics in metrics.items()
                    for metric in run_metrics
                ],
            )
            publisher.wandb.log({"metrics": table})
        if logs_dir.is_dir():
            # Publish the logs directory content as artifacts
            artifact = wandb.Artifact(name=group, type="logs")
            artifact.add_dir(local_path=str(logs_dir.resolve()))
            publisher.wandb.log_artifact(artifact)
        publisher.wandb.finish()
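

# Usage sketch for group-level publication (illustrative only: the values are made-up examples;
# note that the last element of `logs_parent_folder` is dropped when the `logs/`, `models/` and
# `experiments/` directories are resolved).
#
#   WandB.publish_group_logs(
#       logs_parent_folder=["data", "logs"],
#       project="example-project",
#       group="baseline_abc123",
#       suffix="_abc123",
#       existing_runs=["train-student_abc123"],
#       snakemake=False,
#   )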