neuron-explainer/neuron_explainer/activations/activations.py

# Dataclasses and enums for storing neuron-indexed information about activations. Also, related
# helper functions.

import math
from dataclasses import dataclass, field
from typing import List, Optional, Union
import urllib.request

import blobfile as bf
import boostedblob as bbb

from neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass
from neuron_explainer.azure import standardize_azure_url


@register_dataclass
@dataclass
class ActivationRecord(FastDataclass):
    """Collated lists of tokens and their activations for a single neuron."""

    tokens: List[str]
    """Tokens in the text sequence, represented as strings."""
    activations: List[float]
    """Raw activation values for the neuron on each token in the text sequence."""


@register_dataclass
@dataclass
class NeuronId(FastDataclass):
    """Identifier for a neuron in an artificial neural network."""

    layer_index: int
    """The index of the layer the neuron is in. The first layer used during inference has index 0."""
    neuron_index: int
    """The neuron's index within its layer. Indices start from 0 in each layer."""


def _check_slices(
    slices_by_split: dict[str, slice],
    expected_num_values: int,
) -> None:
    """
    Assert that the slices are disjoint and fully cover the intended range.

    Each slice is applied to ``range(expected_num_values)``. The slices pass the check only if
    their lengths sum to ``expected_num_values`` and their union is every index in that range;
    together those two conditions imply that no index is selected twice.

    Args:
        slices_by_split: Mapping from split name to the slice selecting that split's elements.
        expected_num_values: Total number of elements the slices must cover.

    Raises:
        AssertionError: If the slices overlap or do not cover the whole range.
    """
    indices: set[int] = set()
    sum_of_slice_lengths = 0
    for s in slices_by_split.values():
        subrange = range(expected_num_values)[s]
        sum_of_slice_lengths += len(subrange)
        indices.update(subrange)
    assert (
        sum_of_slice_lengths == expected_num_values
    ), f"{sum_of_slice_lengths=} != {expected_num_values=}"
    # The union of the interleaved ranges range(start, n, stride) for start in range(stride) is
    # exactly range(n). Building it directly also keeps the zero-split case well-defined, whereas
    # set.union() called with no arguments raises TypeError.
    expected_indices = set(range(expected_num_values))
    assert indices == expected_indices, f"{indices=} != {expected_indices=}"


def get_slices_for_splits(
    splits: list[str],
    num_activation_records_per_split: int,
) -> dict[str, slice]:
    """
    Get equal-sized interleaved subsets for each of a list of splits, given the number of elements
    to include in each split.

    The split at position ``i`` receives elements ``i, i + stride, i + 2 * stride, ...`` where
    ``stride`` is the number of splits.

    Args:
        splits: Names of the splits, in the order they are interleaved.
        num_activation_records_per_split: Number of elements each split should contain.

    Returns:
        Mapping from split name to the slice selecting that split's elements. Empty when
        ``splits`` is empty.

    Raises:
        AssertionError: If the generated slices fail the disjointness/coverage check.
    """
    stride = len(splits)
    num_activation_records_for_even_splits = num_activation_records_per_split * stride
    slices_by_split = {
        split: slice(split_index, num_activation_records_for_even_splits, stride)
        for split_index, split in enumerate(splits)
    }
    _check_slices(
        slices_by_split=slices_by_split,
        expected_num_values=num_activation_records_for_even_splits,
    )
    return slices_by_split


@dataclass
class ActivationRecordSliceParams:
    """How to select splits (train, valid, etc.) of activation records."""

    n_examples_per_split: Optional[int]
    """The number of examples to include in each split."""


@register_dataclass
@dataclass
class NeuronRecord(FastDataclass):
    """Neuron-indexed activation data, including summary stats and notable activation records."""

    neuron_id: NeuronId
    """Identifier for the neuron."""

    random_sample: list[ActivationRecord] = field(default_factory=list)
    """
    Random activation records for this neuron. The random sample is independent from those used for
    other neurons.
    """
    random_sample_by_quantile: Optional[list[list[ActivationRecord]]] = None
    """
    Random samples of activation records in each of the specified quantiles. None if quantile
    tracking is disabled.
    """
    quantile_boundaries: Optional[list[float]] = None
    """Boundaries of the quantiles used to generate the random_sample_by_quantile field."""

    # Moments of activations
    mean: Optional[float] = math.nan
    variance: Optional[float] = math.nan
    skewness: Optional[float] = math.nan
    kurtosis: Optional[float] = math.nan

    most_positive_activation_records: list[ActivationRecord] = field(default_factory=list)
    """
    Activation records with the most positive figure of merit value for this neuron over all dataset
    examples.
    """

    @property
    def max_activation(self) -> float:
        """
        Return the maximum activation value over all top-activating activation records.

        Raises:
            ValueError: If there are no top-activating records, or one of them has no activations.
        """
        return max(max(ar.activations) for ar in self.most_positive_activation_records)

    def _get_top_activation_slices(
        self, activation_record_slice_params: ActivationRecordSliceParams
    ) -> dict[str, slice]:
        """
        Return the slice of most_positive_activation_records belonging to each split.

        When n_examples_per_split is None, the records are divided evenly between the splits and
        any remainder is left unused.

        Raises:
            AssertionError: If there are fewer top-activating records than the splits require.
        """
        splits = ["train", "calibration", "valid", "test"]
        n_examples_per_split = activation_record_slice_params.n_examples_per_split
        if n_examples_per_split is None:
            n_examples_per_split = len(self.most_positive_activation_records) // len(splits)
        assert len(self.most_positive_activation_records) >= n_examples_per_split * len(splits), (
            f"{len(self.most_positive_activation_records)=} is too small for "
            f"{n_examples_per_split=} across {len(splits)} splits"
        )
        return get_slices_for_splits(splits, n_examples_per_split)

    def _get_random_activation_slices(
        self, activation_record_slice_params: ActivationRecordSliceParams
    ) -> dict[str, slice]:
        """
        Return the slice of random_sample belonging to each split.

        There is no "train" split here: the train split uses top-activating records only. When
        n_examples_per_split is None, the records are divided evenly between the splits and any
        remainder is left unused.

        Raises:
            AssertionError: If there are fewer random records than the splits require.
        """
        splits = ["calibration", "valid", "test"]
        n_examples_per_split = activation_record_slice_params.n_examples_per_split
        if n_examples_per_split is None:
            n_examples_per_split = len(self.random_sample) // len(splits)
        # NOTE: this assert could trigger on some old datasets with only 10 random samples, in
        # which case you may have to remove "test" from the set of splits
        assert len(self.random_sample) >= n_examples_per_split * len(splits), (
            f"{len(self.random_sample)=} is too small for "
            f"{n_examples_per_split=} across {len(splits)} splits"
        )
        return get_slices_for_splits(splits, n_examples_per_split)

    def _top_and_random_records(
        self,
        activation_record_slice_params: ActivationRecordSliceParams,
        split: str,
    ) -> list[ActivationRecord]:
        """Return the top-activating records for ``split`` followed by its random records."""
        # The top slice is resolved before the random slice so that, when both size checks would
        # fail, the top-activating one is the one reported.
        top_slice = self._get_top_activation_slices(activation_record_slice_params)[split]
        top_records = self.most_positive_activation_records[top_slice]
        random_slice = self._get_random_activation_slices(activation_record_slice_params)[split]
        return top_records + self.random_sample[random_slice]

    def train_activation_records(
        self,
        activation_record_slice_params: ActivationRecordSliceParams,
    ) -> list[ActivationRecord]:
        """
        Train split, typically used for generating explanations. Consists exclusively of
        top-activating records since context window limitations make it difficult to include
        random records.
        """
        return self.most_positive_activation_records[
            self._get_top_activation_slices(activation_record_slice_params)["train"]
        ]

    def calibration_activation_records(
        self,
        activation_record_slice_params: ActivationRecordSliceParams,
    ) -> list[ActivationRecord]:
        """
        Calibration split, typically used for calibrating neuron simulations. See
        http://go/neuron_explanation_methodology for an explanation of calibration. Consists of
        top-activating records and random records in a 1:1 ratio.
        """
        return self._top_and_random_records(activation_record_slice_params, "calibration")

    def valid_activation_records(
        self,
        activation_record_slice_params: ActivationRecordSliceParams,
    ) -> list[ActivationRecord]:
        """
        Validation split, typically used for evaluating explanations, either automatically with
        simulation + correlation coefficient scoring, or manually by humans. Consists of
        top-activating records and random records in a 1:1 ratio.
        """
        return self._top_and_random_records(activation_record_slice_params, "valid")

    def test_activation_records(
        self,
        activation_record_slice_params: ActivationRecordSliceParams,
    ) -> list[ActivationRecord]:
        """
        Test split, typically used for explanation evaluations that can't use the validation split.
        Consists of top-activating records and random records in a 1:1 ratio.
        """
        return self._top_and_random_records(activation_record_slice_params, "test")


def neuron_exists(
    dataset_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]
) -> bool:
    """
    Return whether the specified neuron exists.

    The neuron is looked up at ``<dataset_path>/neurons/<layer_index>/<neuron_index>.json``.
    """
    file = bf.join(dataset_path, "neurons", str(layer_index), f"{neuron_index}.json")
    return bf.exists(file)


def load_neuron(
    layer_index: Union[str, int],
    neuron_index: Union[str, int],
    dataset_path: str = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations",
) -> NeuronRecord:
    """
    Load the NeuronRecord for the specified neuron.

    The record is fetched from ``<dataset_path>/<layer_index>/<neuron_index>.json`` after the URL
    has been passed through standardize_azure_url.

    Raises:
        ValueError: If the stored data does not deserialize to a NeuronRecord.
    """
    url = "/".join([dataset_path, str(layer_index), f"{neuron_index}.json"])
    url = standardize_azure_url(url)
    with urllib.request.urlopen(url) as f:
        neuron_record = loads(f.read())
        if not isinstance(neuron_record, NeuronRecord):
            raise ValueError(
                "Stored data incompatible with current version of NeuronRecord dataclass."
            )
        return neuron_record


@bbb.ensure_session
async def load_neuron_async(
    layer_index: Union[str, int],
    neuron_index: Union[str, int],
    dataset_path: str = "az://openaipublic/neuron-explainer/data/collated-activations",
) -> NeuronRecord:
    """Async version of load_neuron."""
    file = bf.join(dataset_path, str(layer_index), f"{neuron_index}.json")
    return await read_neuron_file(file)


@bbb.ensure_session
async def read_neuron_file(neuron_filename: str) -> NeuronRecord:
    """
    Like load_neuron_async, but takes a raw neuron filename.

    Raises:
        ValueError: If the stored data does not deserialize to a NeuronRecord.
    """
    raw_contents = await bbb.read.read_single(neuron_filename)
    neuron_record = loads(raw_contents.decode("utf-8"))
    if not isinstance(neuron_record, NeuronRecord):
        raise ValueError(
            "Stored data incompatible with current version of NeuronRecord dataclass."
        )
    return neuron_record


def get_sorted_neuron_indices(dataset_path: str, layer_index: Union[str, int]) -> List[int]:
    """
    Returns the indices of all neurons in this layer, in ascending order.

    Entries of ``<dataset_path>/neurons/<layer_index>`` whose name before the first "." is not a
    decimal integer are ignored.
    """
    layer_dir = bf.join(dataset_path, "neurons", str(layer_index))
    stems = (f.split(".")[0] for f in bf.listdir(layer_dir))
    # isdecimal() rather than isnumeric(): the latter also accepts characters such as "²" or "½",
    # which int() cannot parse.
    return sorted(int(stem) for stem in stems if stem.isdecimal())


def get_sorted_layers(dataset_path: str) -> List[str]:
    """
    Return the indices of all layers in this dataset, in ascending numerical order, as strings.

    Entries of ``<dataset_path>/neurons`` whose name is not a decimal integer are ignored.
    """
    # isdecimal() rather than isnumeric(): the latter also accepts characters such as "²" or "½",
    # which int() cannot parse.
    layer_indices = sorted(
        int(x) for x in bf.listdir(bf.join(dataset_path, "neurons")) if x.isdecimal()
    )
    return [str(x) for x in layer_indices]

neuron-explainer/neuron_explainer/activations/activations.py (179 lines of code) (raw):