in leaderboard/llvm_instcount/e_greedy/e_greedy.py [0:0]
def e_greedy_search(env: LlvmEnv) -> None:
    """Run an ϵ-greedy search on an environment.

    Each iteration draws a random number. If it is below ``FLAGS.epsilon``,
    an action sampled from ``env.action_space`` is applied; otherwise the
    candidate returned by ``select_best_action`` is applied. The loop ends
    when ``env.step`` reports ``done``, or when the greedy candidate's
    reward is not strictly positive.

    Args:
        env: The environment to step. It is mutated in place by the search.
    """
    steps_taken = 0
    with ThreadPoolExecutor(max_workers=FLAGS.nproc) as pool:
        while True:
            steps_taken += 1
            greedy = not (random.random() < FLAGS.epsilon)

            if greedy:
                # Greedy step: ask for the best candidate action, and give up
                # as soon as it cannot yield a positive reward.
                candidate = select_best_action(env, pool)
                if candidate.reward <= 0:
                    logging.debug(
                        "Greedy search terminated after %d steps, "
                        "no further reward attainable",
                        steps_taken,
                    )
                    return

                _, observed, finished, _ = env.step(candidate.action)
                logging.debug(
                    "Step %d, greedy action %s, reward %.4f, cumulative %.4f",
                    steps_taken,
                    env.action_space.flags[candidate.action],
                    observed,
                    env.episode_reward,
                )
                # When the reward space is flagged deterministic, applying the
                # action should reproduce the candidate's reward; report any
                # mismatch but keep searching.
                if env.reward_space.deterministic:
                    if observed != candidate.reward:
                        logging.warning(
                            "Action %s produced different reward on replay, "
                            "%.4f != %.4f",
                            env.action_space.flags[candidate.action],
                            candidate.reward,
                            observed,
                        )
            else:
                # Exploratory step: apply a randomly sampled action.
                sampled = env.action_space.sample()
                _, observed, finished, _ = env.step(sampled)
                logging.debug(
                    "Step %d, exploratory action %s, reward %.4f, cumulative %.4f",
                    steps_taken,
                    env.action_space.flags[sampled],
                    observed,
                    env.episode_reward,
                )

            # The environment reported a terminal state; end the search.
            if finished:
                return