def bind_cores_for_best_perf()

in optimum/intel/utils/modeling_utils.py


def bind_cores_for_best_perf():
    """
    Set the number of threads per rank, the NUMA CPU affinity, and the NUMA memory binding, if not already set, for better out-of-the-box performance.
    Works for world_size >= 1 and rank >= 0.

    Example:
    .. code-block:: python

        import torch
        from transformers import AutoTokenizer

        from optimum.intel.ipex import IPEXModelForCausalLM
        from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf

        bind_cores_for_best_perf()
        model = IPEXModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16, export=True)
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        input_sentence = ["tell me a story about a trip to the moon"]
        model_inputs = tokenizer(input_sentence, return_tensors="pt")
        generation_kwargs = dict(max_new_tokens=500)
        generated_ids = model.generate(**model_inputs, **generation_kwargs)

    Returns:
        None

    """
    if platform.system() != "Linux":
        logger.error("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.")
        raise OSError("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.")
    if not is_psutil_available():
        logger.error("`psutil` module not found")
        raise ImportError("'psutil' module not found, install with 'pip install psutil'")
    import psutil

    if not is_numa_available():
        logger.error("'numa' module not found")
        raise ImportError("'numa' module not found, install with 'pip install py-libnuma'")
    import numa

    # Read the local world size and local rank from the launcher's environment
    # (torchrun / Intel MPI / Open MPI / MVAPICH2); default to a single-process run.
    local_size = get_int_from_env(
        ["LOCAL_WORLD_SIZE", "MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1
    )
    rank_id = get_int_from_env(
        ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0
    )
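    # Map this rank onto the NUMA topology: nodes on the machine, ranks per node,
    # physical cores per node, and this rank's node and per-node offset.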
    nodes = numa.info.get_max_node() + 1
    rank_per_node = math.ceil(local_size / nodes)
    num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes)
    node_id = int(rank_id / rank_per_node)
    rank_offset_per_node = rank_id % rank_per_node
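    # Honor an explicit OMP_NUM_THREADS; otherwise give each rank an even share
    # of its node's physical cores.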
    if os.getenv("OMP_NUM_THREADS") is None:
        num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1)
        logger.info(f"Setting OMP_NUM_THREADS to {num_cpus_per_rank} for better performance")
    else:
        num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS"))
        logger.info(f"OMP_NUM_THREADS already set to  {num_cpus_per_rank}")
    if len(numa.memory.get_membind_nodes()) == nodes:
        # if numa memory binding is not set, set it to the node where the rank is running
        numa.memory.set_membind_nodes(node_id)

    # Cap torch's intra-op thread pool at the number of cores allotted to this rank.
    torch.set_num_threads(num_cpus_per_rank)

    if len(numa.schedule.get_affinitive_cpus(0)) == psutil.cpu_count(logical=True):
        # if numa affinity is unset (default value is set to all logical cores) set it to the physical cores assigned to the rank
        cpu_start = num_cpus_per_rank * rank_offset_per_node
        numa.schedule.run_on_cpus(
            0,
            *(numa.info.node_to_cpus(node_id)[cpu_start : cpu_start + num_cpus_per_rank]),
        )

    logger.info(f"affinity={numa.schedule.get_affinitive_cpus(0)}, membind = {numa.memory.get_membind_nodes()}")