compiler_gym/leaderboard/llvm_instcount.py
def eval_llvm_instcount_policy(policy: Policy) -> None:
"""Evaluate an LLVM codesize policy and generate results for a leaderboard
submission.
To use it, you define your policy as a function that takes an
:class:`LlvmEnv <compiler_gym.envs.LlvmEnv>` instance as input and modifies
it in place. For example, for a trivial random policy:
>>> from compiler_gym.envs import LlvmEnv
>>> def my_policy(env: LlvmEnv) -> None:
...     # Defines a policy that takes 10 random steps.
...     for _ in range(10):
...         _, _, done, _ = env.step(env.action_space.sample())
...         if done: break
If your policy is stateful, you can use a class and override the
:code:`__call__()` method:
>>> class MyPolicy:
...     def __init__(self):
...         self.my_stateful_vars = {}  # or similar
...     def __call__(self, env: LlvmEnv) -> None:
...         pass  # ... do fun stuff!
>>> my_policy = MyPolicy()
The role of your policy is to perform a sequence of actions on the supplied
environment so as to maximize cumulative reward. By default, no observation
space is set on the environment, so :meth:`env.step()
<compiler_gym.envs.CompilerEnv.step>` will return :code:`None` for the
observation. You may set a new observation space:
>>> env.observation_space = "InstCount" # Set a new space for env.step()
>>> env.observation["InstCount"] # Calculate a one-off observation.
However, the policy may not change the reward space of the environment, or
the benchmark.
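As a rough sketch (the step budget and stopping rule below are arbitrary
choices for illustration, not part of the API), a policy that combines an
observation space with the per-step reward might look like:
>>> def my_observing_policy(env: LlvmEnv) -> None:
...     env.observation_space = "InstCount"  # observations now returned by env.step()
...     for _ in range(100):  # arbitrary step budget for this sketch
...         observation, reward, done, _ = env.step(env.action_space.sample())
...         if done or reward <= 0:  # stop on episode end or a non-improving step
...             break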
Once you have defined your policy, call the
:func:`eval_llvm_instcount_policy()
<compiler_gym.leaderboard.llvm_instcount.eval_llvm_instcount_policy>` helper
function, passing it your policy as its only argument:
>>> eval_llvm_instcount_policy(my_policy)
The :func:`eval_llvm_instcount_policy()
<compiler_gym.leaderboard.llvm_instcount.eval_llvm_instcount_policy>`
function calls the policy function for each benchmark in the dataset, one at
a time, from a single thread. Stateful policies can therefore assume
thread-safe access to member variables.
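For example (the bookkeeping below is purely illustrative), a stateful policy
could accumulate state across the repeated runs of each benchmark without any
locking:
>>> class MyCachingPolicy:
...     def __init__(self):
...         self.seen = set()  # benchmarks evaluated so far (illustrative state)
...     def __call__(self, env: LlvmEnv) -> None:
...         # No locking needed: the evaluation loop invokes the policy serially.
...         self.seen.add(str(env.benchmark))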
Put together as a complete example, a leaderboard submission script may look
like:
.. code-block:: python

    # my_policy.py
    from compiler_gym.leaderboard.llvm_instcount import eval_llvm_instcount_policy
    from compiler_gym.envs import LlvmEnv

    def my_policy(env: LlvmEnv) -> None:
        env.observation_space = "InstCount"  # we're going to use instcount space
        pass  # ... do fun stuff!

    if __name__ == "__main__":
        eval_llvm_instcount_policy(my_policy)
The :func:`eval_llvm_instcount_policy()
<compiler_gym.leaderboard.llvm_instcount.eval_llvm_instcount_policy>` helper
defines a number of command-line flags that can be overridden to control the
behavior of the evaluation. For example, the flag :code:`--n` determines the
number of times the policy is run on each benchmark (default is 10), and
:code:`--leaderboard_results` determines the path of the generated results
file:
.. code-block::

    $ python my_policy.py --n=5 --leaderboard_results=my_policy_results.csv
You can use the :code:`--helpfull` flag to list all of the flags that are
defined:
.. code-block::

    $ python my_policy.py --helpfull
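In addition to the flags above, the evaluation also reads flags such as
:code:`--test_dataset`, :code:`--max_benchmarks`, :code:`--resume`, and
:code:`--leaderboard_logfile`. Assuming :code:`--resume` is defined as a
boolean flag, a shortened run that resumes from an earlier results file might
look something like:
.. code-block::

    $ python my_policy.py --n=3 --max_benchmarks=20 --resume --leaderboard_logfile=my_policy.log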
Once you are happy with your approach, see the `contributing guide
<https://github.com/facebookresearch/CompilerGym/blob/development/CONTRIBUTING.md#leaderboard-submissions>`_
for instructions on preparing a submission to the leaderboard.
"""
    def main(argv):
        assert len(argv) == 1, f"Unknown args: {argv[1:]}"
        assert FLAGS.n > 0, "n must be > 0"
        with gym.make("llvm-ic-v0") as env:
            # Stream verbose CompilerGym logs to file.
            logger = logging.getLogger("compiler_gym")
            logger.setLevel(logging.DEBUG)
            log_handler = logging.FileHandler(FLAGS.leaderboard_logfile)
            logger.addHandler(log_handler)
            logger.propagate = False

            print(f"Writing results to {FLAGS.leaderboard_results}")
            print(f"Writing logs to {FLAGS.leaderboard_logfile}")

            # Build the list of benchmarks to evaluate.
            benchmarks = env.datasets[FLAGS.test_dataset].benchmark_uris()
            if FLAGS.max_benchmarks:
                benchmarks = islice(benchmarks, FLAGS.max_benchmarks)
            benchmarks = list(benchmarks)

            # Repeat the searches for the requested number of iterations.
            benchmarks *= FLAGS.n
            total_count = len(benchmarks)

            # If we are resuming from a previous job, read the states that have
            # already been processed and remove those benchmarks from the list
            # of benchmarks to evaluate.
            init_states = []
            if FLAGS.resume and Path(FLAGS.leaderboard_results).is_file():
                with CompilerEnvStateReader(open(FLAGS.leaderboard_results)) as reader:
                    for state in reader:
                        init_states.append(state)
                        if state.benchmark in benchmarks:
                            benchmarks.remove(state.benchmark)
            # Run the benchmark loop in background so that we can asynchronously
            # log progress.
            worker = _EvalPolicyWorker(env, benchmarks, policy, init_states)
            worker.start()

            timer = Timer().reset()
            try:
                print(
                    f"=== Evaluating policy on "
                    f"{humanize.intcomma(total_count)} "
                    f"{FLAGS.test_dataset} benchmarks ==="
                    "\n\n"  # Blank lines will be filled below
                )
                while worker.is_alive():
                    done_count = len(worker.states)
                    remaining_count = total_count - done_count
                    time = timer.time
                    gmean_reward = geometric_mean([s.reward for s in worker.states])
                    mean_walltime = (
                        arithmetic_mean([s.walltime for s in worker.states]) or time
                    )
                    print(
                        "\r\033[2A"
                        "\033[K"
                        f"Runtime: {humanize_duration_hms(time)}. "
                        f"Estimated completion: {humanize_duration_hms(mean_walltime * remaining_count)}. "
                        f"Completed: {humanize.intcomma(done_count)} / {humanize.intcomma(total_count)} "
                        f"({done_count / total_count:.1%})."
                        "\n\033[K"
                        f"Current mean walltime: {mean_walltime:.3f}s / benchmark."
                        "\n\033[K"
                        f"Current geomean reward: {gmean_reward:.4f}.",
                        flush=True,
                        end="",
                    )
                    sleep(1)
            except KeyboardInterrupt:
                print("\nkeyboard interrupt", flush=True)
                worker.alive = False
                # User interrupt, don't validate.
                FLAGS.validate = False
        if FLAGS.validate:
            FLAGS.env = "llvm-ic-v0"
            validate(["argv0", FLAGS.leaderboard_results])

    app.run(main)