in optimum/neuron/models/inference/backend/cache.py [0:0]
def neff_cache(cache_dir: Optional[str] = None):
    """
    Context manager to patch `torch_neuronx.xla_impl.trace.generate_neff`.

    This temporarily replaces the original function in the `torch_neuronx.xla_impl.trace` module
    to use a cache for storing and retrieving compiled NEFF files.

    Usage:

        with neff_cache(cache_dir="/path/to/cache"):
            # Your code that generates NEFF files goes here
            # The cache will be used to store and retrieve compiled NEFF files
            # The original function will be restored after exiting the context

    This function uses the `libneuronxla` library to create a compile cache.
    Each entry in the cache is identified by a hash of the HLO module and its compilation flags.

    Args:
        cache_dir (`str`, *optional*):
            Directory to store the cache. If not provided, a default directory will be used.
    """

    def generate_neff_with_cache(
        hlo_artifacts: HloArtifacts,
        compiler_workdir: Optional[Union[str, pathlib.Path]] = None,
        compiler_args: Optional[Union[List[str], str]] = None,
        inline_weights_to_neff: bool = True,
    ):
        """
        Generate a NEFF file from the HLO artifacts using the specified compiler arguments.

        Unlike the original implementation, this function uses a cache to store and retrieve compiled NEFF files.
        If the weights were not in the optimal layout, the compiler also produces a wrapped neff HLO stub that needs
        to be cached as well.

        Args:
            hlo_artifacts (`HloArtifacts`):
                HLO artifacts containing the HLO module and constant parameter tensors.
            compiler_workdir (`str`, *optional*):
                Directory to store the compiler workdir. If not provided, a default directory will be used.
                NOTE(review): the code below joins paths on this value without a `None` guard, so in
                practice the patched callers appear to always pass it — confirm against
                `torch_neuronx.xla_impl.trace`.
            compiler_args (`List[str]` or `str`, *optional*):
                Compiler arguments to be used for compilation. If not provided, a default set of arguments will be used.
            inline_weights_to_neff (`bool`, *optional*):
                Whether to inline weights to NEFF. Defaults to `True`.

        Returns:
            `NeffArtifacts`:
                NEFF artifacts containing the path to the compiled NEFF file.
        """
        if inline_weights_to_neff:
            # We don't cache compilation artifacts containing weights: fall back to the
            # original (unpatched) implementation.
            return generate_neff(
                hlo_artifacts,
                compiler_workdir=compiler_workdir,
                compiler_args=compiler_args,
                inline_weights_to_neff=inline_weights_to_neff,
            )
        # Generate the HLO and other artifacts required for compilation
        compiler_target = setup_compiler_dirs(
            hlo_artifacts.hlo_module,
            compiler_workdir,
            hlo_artifacts.constant_parameter_tensors,
            inline_weights_to_neff,
        )
        # Create a hub compile cache proxying the libneuronxla cache
        # It will fetch contents from the hub if they are not found in the local cache
        cache_url = CacheUrl.get_cache_url(cache_dir=cache_dir)
        compile_cache = create_hub_compile_cache_proxy(cache_url)
        # The cache key is a hash of the HLO module and the compiler arguments
        cache_key = get_hash_module(hlo_artifacts.hlo_module, compiler_args)
        # Serialized form of the compiler flags, stored alongside the cached inputs
        compile_flags_str = json.dumps(compiler_args)
        # Look in the cache.
        # NOTE(review): `lookup` receives the raw `compiler_args` while `upload_inputs` below
        # receives the JSON-serialized `compile_flags_str` — verify against the compile cache
        # API that this asymmetry is intentional.
        entry = compile_cache.lookup(cache_key, compiler_args)
        # The result of the compilation that we need to fetch or produce in the compiler
        # working directory is composed of the NEFF file and an optional wrapped neff HLO stub
        neff_filename = os.path.join(compiler_workdir, "graph.neff")
        wrapped_neff_filename = os.path.join(compiler_workdir, "wrapped_neff.hlo")
        # Entering the entry context acquires the lock for this cache key, so only one
        # process compiles a given graph at a time.
        with entry:
            if entry.exists:
                # There is an entry in the cache, download it at the expected location
                entry.download_neff(neff_filename)
                # If the weights were not in the optimal layout, there might also be a wrapped neff HLO stub
                entry.download_wrapped_neff(wrapped_neff_filename)
                logger.info(f"Using a cached neff at {entry.neff_path}")
                return NeffArtifacts(neff_filename)
            # This graph doesn't have a NEFF in the cache yet, and we're holding the lock for it
            # First make sure the inputs are in the cache
            entry.upload_inputs(compiler_target, compile_flags_str)
            # Now compile the graph
            compiled_neff_filename = hlo_compile(
                compiler_target, compiler_workdir=compiler_workdir, compiler_args=compiler_args
            )
            if compiled_neff_filename != neff_filename:
                # The compiled NEFF file is not at the expected location, which reveals that the hlo_compile implementation has evolved
                raise ValueError(
                    "Incompatible torch_neuronx.xla_impl.trace.hlo_compile implementation. Did you update the library ?"
                )
            # Store the generated artifacts in the cache
            logger.info(f"Caching neff at {entry.neff_path}")
            entry.upload_neff(neff_filename)
            if os.path.exists(wrapped_neff_filename):
                # Fixed: log the wrapped stub being cached instead of repeating `entry.neff_path`
                # (which identifies the plain NEFF, already logged above).
                logger.info(f"Caching wrapped neff HLO stub at {wrapped_neff_filename}")
                entry.upload_wrapped_neff(wrapped_neff_filename)
        return NeffArtifacts(neff_filename)

    try:
        # Substitute the cached implementation everywhere the original is referenced
        patch_everywhere("generate_neff", generate_neff_with_cache, "torch_neuronx.xla_impl.trace")
        yield
    finally:
        # Always restore the original implementation, even if the body raised
        patch_everywhere("generate_neff", generate_neff, "torch_neuronx.xla_impl.trace")