# From evaluator/evaluator.py
def load_llama_models_tokenizer(args):
    """Load a LLaMA checkpoint and its tokenizer, configured for greedy decoding.

    Args:
        args: Namespace providing ``checkpoint_path`` (path of the pretrained
            checkpoint, used for both model and tokenizer) and ``gpu`` (CUDA
            device index the model is mapped onto).

    Returns:
        tuple: ``(model, tokenizer)`` where the model's generation config is
        set to deterministic output (sampling off, repetition penalty 1.0).
    """
    device = f'cuda:{args.gpu}'
    tokenizer = Llama2Tokenizer.from_pretrained(args.checkpoint_path)
    model = Model.from_pretrained(args.checkpoint_path, device_map=device)

    # Deterministic generation: greedy search, repetition penalty disabled.
    gen_cfg = model.generation_config
    gen_cfg.do_sample = False
    gen_cfg.repetition_penalty = 1.0

    return model, tokenizer